In [209]:
# For reading data
import pandas as pd
# For performing text preprocessing
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re
# For embedding the words
from sklearn.feature_extraction.text import TfidfVectorizer
# For encoding the targets
from sklearn.preprocessing import LabelEncoder
# For creating train, test and validation sets
from sklearn.model_selection import train_test_split
# For implementing machine learning models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
# For evaluating the models
from sklearn.metrics import classification_report, accuracy_score
# For saving the model for future use
import joblib

In [210]:
data = pd.read_csv('C:/Users/krina/OneDrive/Desktop/SEM-8/naukri_com-job_sample.csv')

In [211]:
data.head(5)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,numberofpositions,payrate,postdate,site_name,skills,uniq_id
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),,"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00 +0000,,ITES,43b19632647068535437c774b6ca6cf8
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,60.0,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00 +0000,,Marketing,d4c72325e57f89f364812b5ed5a795f0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,Accounts,115d28f140f694dd1cc61c53d03c66ae
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,4.0,Not Disclosed by Recruiter,2016-10-13 16:20:55 +0000,,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4


In [212]:
data.isnull().sum()[data.isnull().sum()>0]

company                    4
education               1996
experience                 4
industry                   5
jobdescription             4
joblocation_address      501
numberofpositions      17536
payrate                   97
postdate                  23
site_name              18013
skills                   528
dtype: int64

In [213]:
data.drop(['numberofpositions','site_name'],axis=1,inplace=True)

In [214]:
from sklearn.impute import SimpleImputer
to_fill = ['education', 'skills']
for col in to_fill:
    imputer = SimpleImputer(strategy='most_frequent')
    data[col] = imputer.fit_transform(data[[col]])

In [215]:
data.isnull().sum()[data.isnull().sum()>0]

company                  4
experience               4
industry                 5
jobdescription           4
joblocation_address    501
payrate                 97
postdate                23
dtype: int64

In [216]:
data[data['experience'] == 'Not Mentioned'].shape

(111, 12)

In [217]:
data = data[data['experience'] != 'Not Mentioned']

In [218]:
experience_lower = []
experience_upper = []
invalid = []
for idx, row in data.iterrows():
    try:
        text = re.sub('yrs', '', row['experience'])
        splits = text.split('-')
        experience_lower.append(int(splits[0]))
        experience_upper.append(int(splits[1]))
    except:
        invalid.append(row['experience'])

In [219]:
data = data[~data['experience'].isin(invalid)]

In [220]:
data['experience_lower'] = data['experience'].apply(lambda x: int(x.split('-')[0]))
data['experience_upper'] = data['experience'].apply(lambda x: int(re.sub('yrs','', x.split('-')[1])))

In [221]:
data.isnull().sum()

company                  0
education                0
experience               0
industry                 1
jobdescription           0
jobid                    0
joblocation_address    497
jobtitle                 0
payrate                  1
postdate                19
skills                   0
uniq_id                  0
experience_lower         0
experience_upper         0
dtype: int64

In [222]:
#data.drop(['experience'], axis=1, inplace=True)

In [223]:
data.columns

Index(['company', 'education', 'experience', 'industry', 'jobdescription',
       'jobid', 'joblocation_address', 'jobtitle', 'payrate', 'postdate',
       'skills', 'uniq_id', 'experience_lower', 'experience_upper'],
      dtype='object')

In [224]:
data['postdate'] = data['postdate'].astype(str).apply(lambda x: x[:-5])

In [225]:
data.head(5)

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,payrate,postdate,skills,uniq_id,experience_lower,experience_upper
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,ITES,43b19632647068535437c774b6ca6cf8,0,1
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,Not Disclosed by Recruiter,2016-10-13 16:20:55,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8


In [226]:
data['job_age']=pd.datetime.today() - pd.to_datetime(data['postdate'])
data['job_age'] = data['job_age'].dt.days

  data['job_age']=pd.datetime.today() - pd.to_datetime(data['postdate'])


In [227]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,payrate,postdate,skills,uniq_id,experience_lower,experience_upper,job_age
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,ITES,43b19632647068535437c774b6ca6cf8,0,1,2469.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0,2469.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bengaluru,Pl/sql Developer - SQL,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8,2325.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bengaluru, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,Not Disclosed by Recruiter,2016-10-13 16:20:55,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15,2325.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bengaluru,JAVA Technical Lead (6-8 yrs) -,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8,2325.0


In [228]:
data['skills'].nunique()

45

In [229]:
# source: https://www.kaggle.com/code/karankrishna/job-market-analysis-of-india

replacements = {
   'joblocation_address': {
      r'(Bengaluru/Bangalore)': 'Bangalore',
      r'Bengaluru': 'Bangalore',
      r'Hyderabad / Secunderabad': 'Hyderabad',
      r'Mumbai , Mumbai': 'Mumbai',
      r'Noida': 'NCR',
      r'Delhi': 'NCR',
      r'Gurgaon': 'NCR',
      r'Delhi/NCR(National Capital Region)': 'NCR',
      r'Delhi , Delhi': 'NCR',
      r'Noida , Noida/Greater Noida': 'NCR',
      r'Ghaziabad': 'NCR',
      r'Delhi/NCR(National Capital Region) , Gurgaon': 'NCR',
      r'NCR , NCR': 'NCR',
      r'NCR/NCR(National Capital Region)': 'NCR',
      r'NCR , NCR/Greater NCR': 'NCR',
      r'NCR/NCR(National Capital Region) , NCR': 'NCR',
      r'NCR , NCR/NCR(National Capital Region)': 'NCR',
      r'Bangalore , Bangalore / Bangalore': 'Bangalore',
      r'Bangalore , karnataka': 'Bangalore',
      r'NCR/NCR(National Capital Region)': 'NCR',
      r'NCR/Greater NCR': 'NCR',
      r'NCR/NCR(National Capital Region) , NCR': 'NCR'

   }
}

data.replace(replacements, regex=True, inplace=True)

In [230]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,payrate,postdate,skills,uniq_id,experience_lower,experience_upper,job_age
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media / Entertainment / Internet,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,ITES,43b19632647068535437c774b6ca6cf8,0,1,2469.0
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising / PR / MR / Event Management,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0,2469.0
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this - as ...,101016900534,Bangalore,Pl/sql Developer - SQL,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8,2325.0
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking / Financial Services / Broking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bangalore, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,Not Disclosed by Recruiter,2016-10-13 16:20:55,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15,2325.0
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software / Software Services,Job Description Send me Jobs like this Pleas...,120916002122,Bangalore,JAVA Technical Lead (6-8 yrs) -,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8,2325.0


In [231]:
data['industry'] = data['industry'].astype(str).apply(lambda x: x.split('/')[0])

In [232]:
data['industry'] = data['industry'].str.strip()

In [233]:
data['Education'] = data['education'].str.split(' ')
data['Education'] = data['Education'].apply(lambda x: x[1] if len(x) > 1 else x[0])

data['Education'] = data['Education'].replace(('B.Tech/B.E.','Graduation','Other','-','Not','B.Tech/B.E.,','Postgraduate',
                                               'PG:CA','Diploma,','B.Com,','B.Pharma,','B.A,','BCA,','B.Sc,','MBA/PGDM','B.B.A,',
                                              'PG:Other','Doctorate:Doctorate','Post'),
                                              ('B.Tech','B.Tech','B.Tech','B.Tech','B.Tech','B.Tech','B.Tech',
                                              'CA','Diploma','B.Com','B.Pharma','B.A','BCA','B.Sc','MBA','BBA',
                                              'B.Tech','Doctorate','B.Tech'))

data['Skills'] = data['skills'].str.split(" - ")
data['Skills'] = data['Skills'].apply(lambda x: x[1] if len(x) > 1 else x[0])

In [234]:
majority_industries = data['industry'].value_counts()[data['industry'].value_counts()>=10].index
data = data[data['industry'].isin(majority_industries)]
data.isnull().sum()[data.isnull().sum()>0]

joblocation_address    497
job_age                 19
dtype: int64

In [235]:
joblocation_imputer = SimpleImputer(strategy='most_frequent')
data['joblocation_address'] = joblocation_imputer.fit_transform(data[['joblocation_address']])
jobage_imputer = SimpleImputer(strategy='mean')
data['job_age'] = jobage_imputer.fit_transform(data[['job_age']])

In [236]:
data.isnull().sum()

company                0
education              0
experience             0
industry               0
jobdescription         0
jobid                  0
joblocation_address    0
jobtitle               0
payrate                0
postdate               0
skills                 0
uniq_id                0
experience_lower       0
experience_upper       0
job_age                0
Education              0
Skills                 0
dtype: int64

In [237]:
data.head()

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,payrate,postdate,skills,uniq_id,experience_lower,experience_upper,job_age,Education,Skills
0,MM Media Pvt Ltd,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,0 - 1 yrs,Media,Job Description Send me Jobs like this Quali...,210516002263,Chennai,Walkin Data Entry Operator (night Shift),"1,50,000 - 2,25,000 P.A",2016-05-21 19:30:00,ITES,43b19632647068535437c774b6ca6cf8,0,1,2469.0,B.Tech,ITES
1,find live infotech,UG: B.Tech/B.E. - Any Specialization PG:MBA/PG...,0 - 0 yrs,Advertising,Job Description Send me Jobs like this Quali...,210516002391,Chennai,Work Based Onhome Based Part Time.,"1,50,000 - 2,50,000 P.A. 20000",2016-05-21 19:30:00,Marketing,d4c72325e57f89f364812b5ed5a795f0,0,0,2469.0,B.Tech,Marketing
2,Softtech Career Infosystem Pvt. Ltd,UG: Any Graduate - Any Specialization PG:Any P...,4 - 8 yrs,IT-Software,Job Description Send me Jobs like this - as ...,101016900534,Bangalore,Pl/sql Developer - SQL,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,c47df6f4cfdf5b46f1fd713ba61b9eba,4,8,2325.0,Any,Application Programming
3,Onboard HRServices LLP,UG: Any Graduate - Any Specialization PG:CA Do...,11 - 15 yrs,Banking,Job Description Send me Jobs like this - Inv...,81016900536,"Mumbai, Bangalore, Kolkata, Chennai, Coimbator...",Manager/ad/partner - Indirect Tax - CA,Not Disclosed by Recruiter,2016-10-13 16:20:55,Accounts,115d28f140f694dd1cc61c53d03c66ae,11,15,2325.0,Any,Accounts
4,Spire Technologies and Solutions Pvt. Ltd.,UG: B.Tech/B.E. - Any Specialization PG:Any Po...,6 - 8 yrs,IT-Software,Job Description Send me Jobs like this Pleas...,120916002122,Bangalore,JAVA Technical Lead (6-8 yrs) -,Not Disclosed by Recruiter,2016-10-13 16:20:55,IT Software - Application Programming,a12553fc03bc7bcced8b1bb8963f97b4,6,8,2325.0,B.Tech,Application Programming


In [238]:
# Splitting the data into X and y
X = data['skills'] # job description text for training model
y = data['industry'] # Industry in which the description belongs (used for recommendation purposes)
test_size = int(len(X) * 0.1)

X_train,X_test, y_train, y_test = train_test_split(X,y, test_size = test_size, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=test_size,
                                                  stratify=y_train)

In [239]:
print(X_train.iloc[0])

Financial Services


In [240]:
class Preprocess:
    def __init__(self,method='WordNetLemmatizer'):
        # WordNetLemmatizer is recommended because it reduces the given word to the root word
        # by referring to the WordNet corpus unlike other stemming techniques which just
        # truncate the word by removing the suffix, which is why I have set it as default
        self.method = method
        self.stemmers = {
            'PorterStemmer':PorterStemmer(),
            'LancasterStemmer':LancasterStemmer(),
            'SnowballStemmer':SnowballStemmer(language='english'),
            'WordNetLemmatizer':WordNetLemmatizer()
        }
        self.stemmer = self.stemmers[self.method]
        # Remove punctuation signs and stopwords for better results
        self.stopWords = list(punctuation) + list(stopwords.words('english'))
        # Adding custom stopwords for better preprocessing, feel free to add more
        self.moreStopWords = ['job','description','requirement','skill', 'qualification']
        self.stopWords.extend(self.moreStopWords)
        self.encoder = LabelEncoder()
        # Using tf-idf vectorizer because it not only relies on the count but also the
        # number of documents it occurs in
        # tf * log(N/df), where tf = term frequency/count of words
        # N = total number of documents
        # df = document frequency (number of documents containing that word)
        # Count vectorizer gets tricked by the term frequency but in tf-idf it does not happen
        # eg - if the word occurs frequently in almost all documents,it may be a filler word
        # which was ignored in stopwords, so it can trick the count vectorizer,but in tf-idf
        # N/df almost = 1, so log(N/df) will be almost 0 and hence the word will not be given
        # much importance which is desirable, hence we should use tfidf vectorizer instead of
        # count vectorizer
        self.vectorizer = TfidfVectorizer()
        self.isFitted = False
    def preprocess(self, message):
        message = message.lower()
        #Remove links
        message = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                        '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', message)
        # Remove extra spaces
        message = re.sub(' +', ' ', message)
        # Remove mentions
        message =re.sub("(@[A-Za-z0-9_]+)","", message)
        # Remove Hashtags
        message = re.sub('#[A-Za-z0-9_]+','', message)
        # Remove all non alphanumeric characters
        message = re.sub("^[A-Za-z0-9_-]*$", "", message)
        # Remove Emojis
        emoji_pattern = re.compile(
            "["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        message = emoji_pattern.sub('',message)
        if self.method == 'WordNetLemmatizer':
            message = ' '.join([self.stemmer.lemmatize(word) for word in message.split() if word not in self.moreStopWords])
        else:
            message = ' '.join([self.stemmer.stem(word) for word in message.split() if word not in self.moreStopWords])
        return message
    def fit(self,X,y=None):
        self.vectorizer.fit(X)
        if y is not None:
            self.encoder.fit(y)
        self.isFitted=True
    def transform(self, X, y=None):
        if not self.isFitted:
            raise NotImplementedError('Please fit first by calling the fit function')
        X = self.vectorizer.transform(X)
        if y is not None:
            y = self.encoder.transform(y)
            return X,y
        else:
            return X
    def fit_transform(self,X,y=None):
        self.fit(X,y)
        X,y = self.transform(X,y)
        return X,y



In [241]:
preprocess = Preprocess()
X_train, y_train = preprocess.fit_transform(X_train,y_train)
X_val, y_val = preprocess.transform(X_val, y_val)
X_test, y_test = preprocess.transform(X_test,y_test)

In [246]:
import time
class Models:
    def __init__(self, models=['dt','rf','xgb']):
        self.model_dict = {
            'dt':DecisionTreeClassifier(),
            'rf':RandomForestClassifier(n_jobs=-1),
            'xgb':XGBClassifier(
                use_label_encoder=False,
                eval_metric='logloss',
                n_jobs=-1),
            'logistic':LogisticRegression(),
            'gb':GradientBoostingClassifier(),
            'adaboost':AdaBoostClassifier(),
            'svc':SVC()
        }
        self.models = models
        self.selected_models = {}
        for model in self.models:
            self.selected_models[model] = self.model_dict[model]
        self.isFitted = False
    def fit(self, X,y):
        if pd.Series(y).nunique() > 2 and 'xgb' in self.selected_models:
            self.model_dict['xgb'].objective = 'multi:softmax'
        elif pd.Series(y).nunique == 2 and 'xgb' in self.selected_models:
            self.model_dict['xgb'].objective = 'binary:logistic'
        for name, model in self.selected_models.items():
            print(f'Training model {name}')
            start = time.time()
            model.fit(X,y)
            print(f'Took {time.time() - start}s to train')
        self.isFitted = True
        print('Finished training all models')
    def evaluate(self,X_train, y_train, X_val, y_val, X_test, y_test):
        if not self.isFitted:
            raise NotImplementedError('Please fit the models first by calling the fit funcion')
        for name, model in self.selected_models.items():
            print(f'Evaluating model {name}')
            trainPreds = model.predict(X_train)
            valPreds = model.predict(X_val)
            testPreds = model.predict(X_test)
            train_acc = accuracy_score(y_true=y_train, y_pred = trainPreds)
            val_acc = accuracy_score(y_true=y_val, y_pred=valPreds)
            test_acc = accuracy_score(y_true=y_test, y_pred=testPreds)
            print(f'Accuracy on train set is {train_acc:.3%}')
            print(f'Accuracy on validation set is {val_acc:.3%}')
            print(f'Accuracy on test set is {test_acc:.3%}')
            print('Classification report for train set')
            print(classification_report(y_true=y_train, y_pred=trainPreds))
            print('Classification report for validation set')
            print(classification_report(y_true=y_val, y_pred=valPreds))
            print('Classification report for test set')
            print(classification_report(y_true=y_test, y_pred=testPreds))
            print('Finished evaluating the models')

In [247]:
models = Models()
models.fit(X_train, y_train)



Training model dt
Took 0.027997970581054688s to train
Training model rf
Took 0.34241485595703125s to train
Training model xgb
Took 7.692988634109497s to train
Finished training all models


In [249]:
import warnings
warnings.filterwarnings('always')

In [250]:
models.evaluate(X_train, y_train, X_val, y_val, X_test, y_test)

Evaluating model dt
Accuracy on train set is 54.208%
Accuracy on validation set is 54.350%
Accuracy on test set is 54.487%
Classification report for train set
              precision    recall  f1-score   support

           0       0.17      0.64      0.27       181
           1       0.00      0.00      0.00       201
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        52
           5       0.21      0.55      0.30       382
           6       0.00      0.00      0.00        25
           7       0.58      0.76      0.66      1004
           8       0.65      0.22      0.32       990
           9       0.00      0.00      0.00        17
          10       0.00      0.00      0.00        72
          11       0.33      0.28      0.30       359
          12       0.00      0.00      0.00       124
          13       0.00      0.00      0.00        84
          14       0.92      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy on train set is 54.208%
Accuracy on validation set is 54.304%
Accuracy on test set is 54.533%
Classification report for train set
              precision    recall  f1-score   support

           0       0.17      0.64      0.27       181
           1       0.00      0.00      0.00       201
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        52
           5       0.21      0.55      0.30       382
           6       0.00      0.00      0.00        25
           7       0.58      0.76      0.66      1004
           8       0.65      0.22      0.32       990
           9       0.00      0.00      0.00        17
          10       0.00      0.00      0.00        72
          11       0.33      0.28      0.30       359
          12       0.00      0.00      0.00       124
          13       0.00      0.00      0.00        84
          14       0.92      0.75      0.83       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy on train set is 54.191%
Accuracy on validation set is 54.350%
Accuracy on test set is 54.441%
Classification report for train set
              precision    recall  f1-score   support

           0       0.17      0.64      0.27       181
           1       0.00      0.00      0.00       201
           2       0.00      0.00      0.00        36
           3       0.00      0.00      0.00        19
           4       0.00      0.00      0.00        52
           5       0.21      0.55      0.30       382
           6       0.00      0.00      0.00        25
           7       0.58      0.76      0.66      1004
           8       0.65      0.22      0.32       990
           9       0.00      0.00      0.00        17
          10       0.00      0.00      0.00        72
          11       0.33      0.28      0.30       359
          12       0.00      0.00      0.00       124
          13       0.00      0.00      0.00        84
          14       0.92      0.75      0.83       

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [126]:
import pickle

In [127]:
pickle.dump(data, open('recc_model1.pkl','wb'))

In [130]:
# now we can load the model in the same file or in another file to make predictions
loaded_model = pickle.load(open('recc_model1.pkl','rb'))
#loaded_model

In [131]:
def predictNew(message, preprocessor, model):
    message = preprocessor.transform(message)
    pred = model.predict(message)
    decoded_pred = preprocessor.encoder.inverse_transform(pred)[0]
    return decoded_pred

In [132]:
random_job = data.sample(n=1,random_state=1)

In [133]:
print(random_job['skills'])
print(random_job['industry'])

17765    IT Software - System Programming
Name: skills, dtype: object
17765    IT-Software
Name: industry, dtype: object


In [137]:
#pred  = predictNew(message=random_job['skills'], preprocessor=preprocess, model=loaded_model)
#print(pred)

In [140]:
# Recommendations
# Sort the jobs on the basis of postdate, the jobs posted earlier are about to close so
# They need to be recommended on priority basis
recommendation_df = data[data['industry'] == pred].sort_values(by='postdate', ascending=True)
n_recommendations = 200
# Show the first 200 recommendations
recommendations = recommendation_df.iloc[:,:n_recommendations]

In [141]:
recommendations

Unnamed: 0,company,education,experience,industry,jobdescription,jobid,joblocation_address,jobtitle,payrate,postdate,skills,uniq_id,experience_lower,experience_upper,job_age,Education,Skills
4782,Kaay Labs,"UG: Any Graduate - Any Specialization, Graduat...",9 - 14 yrs,IT-Software,Job Description Send me Jobs like this Andro...,161015002907,Chennai,Mobile Solution Architects,"50,00,000 - 50,00,000 & above P.A",,IT Software - Application Programming,a9ea62e49868cb9e95bea8e74726d30c,9,14,2491.479109,Any,Application Programming
4357,Kaay Labs,"UG: Any Graduate - Any Specialization, Graduat...",1 - 2 yrs,IT-Software,Job Description Send me Jobs like this Linux...,181015000005,Chennai,Linux Administrator,"1,00,000 - 6,00,000 P.A",,IT Software - Network Administration,6facfd7b4346864f5415b5acb89e263a,1,2,2491.479109,Any,Network Administration
4573,Kaay Labs,"UG: Any Graduate - Any Specialization, Graduat...",5 - 10 yrs,IT-Software,Job Description Send me Jobs like this cassa...,161015003432,Chennai,Cassandra,"4,00,000 - 9,00,000 P.A",,IT Software - Application Programming,16849630645c5ada30956fc2256bbf34,5,10,2491.479109,Any,Application Programming
3735,Psiog Data Science,UG: Any Graduate - Any Specialization PG:MBA/P...,3 - 6 yrs,IT-Software,Job Description Send me Jobs like this A bri...,200815000502,Chennai,Core Java (data Structures/algorithms) - Data ...,"7,00,000 - 17,00,000 P.A",,IT Software - System Programming,426693a9122ada601d3506af9865cb1e,3,6,2491.479109,Any,System Programming
9730,Career Network hiring for Top MNC,UG: Any Graduate - Any Specialization PG:Any P...,9 - 14 yrs,IT-Software,Job Description Send me Jobs like this Exper...,301215005515,Bangalore,Immediate Openings for Abinitio Professionals ...,Not Disclosed by Recruiter,,IT Software - Application Programming,e48c1c8fcb10b6ce86c768d1ba8501fa,9,14,2491.479109,Any,Application Programming
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20363,Prokarma Softech Pvt. Ltd.,"UG: Any Graduate - Any Specialization, Graduat...",4 - 9 yrs,IT-Software,Job Description Send me Jobs like this Exper...,41116006236,Hyderabad,J2EE - Software Devloper / Sr. Developer,Not Disclosed by Recruiter,2017-01-10 18:18:34,IT Software - Application Programming,690e5f11cd4a57f50936c504ad2b4d59,4,9,2233.000000,Any,Application Programming
21274,CipherCloud India Private Limited,UG: B.Tech/B.E. - Any Specialization PG:M.Tech...,5 - 10 yrs,IT-Software,Job Description Send me Jobs like this Hello...,191216000699,Hyderabad,Software Engineer / Senior Software Engineer -...,Not Disclosed by Recruiter,2017-01-10 18:19:23,IT Software - Application Programming,94d165394f843847c16e113fb68e206a,5,10,2233.000000,B.Tech,Application Programming
21569,Focus Softnet Pvt. Ltd.,UG: Any Graduate - Any Specialization PG:MBA/P...,2 - 4 yrs,IT-Software,Job Description Send me Jobs like this -Our ...,120416004021,Hyderabad,Technical Support Consultant (ERP Support with...,Salaries are best in the industry,2017-01-10 18:19:38,IT Software - ERP,ed91c6a1b971ee8f4eb45962a13eb015,2,4,2233.000000,Any,ERP
21390,Quest Consulting Services hiring for Leading M...,"UG: Any Graduate - Any Specialization, B.Tech/...",5 - 8 yrs,IT-Software,Job Description Send me Jobs like this Shoul...,41116001527,Hyderabad,SAP BPC Consultant Hyderabad Openings Quest,Not Disclosed by Recruiter,2017-01-11 21:00:00,IT Software - ERP,1948ff85eb8c2b907c4e16dd258f0170,5,8,2232.000000,Any,ERP
