In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import re
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB

In [3]:
#Input CSV into DataFrame
datasci_file="../../resources/cleaneddata/description_nouns_data.csv"
data = pd.read_csv(datasci_file)
data.head()

Unnamed: 0.1,Unnamed: 0,position,company,description,reviews,location,description_phrase_list,description_word_list,description_count_statistics,description_count_python,description_count_R,description_count_SQL,description_count_SAS,description_count_TABLEAU,description_count_modeling,description_count_database,description_count_BACHELOR,description_count_Master
0,0,Data Science Manager,ALS TDI,DEVELOPMENT DIRECTOR\r\nALS THERAPY DEVELOPMEN...,,"Atlanta, GA 30301",['development director als therapy development...,"['DEVELOPMENT', 'DIRECTOR', 'ALS', 'THERAPY', ...",0,0,0,0,0,0,0,1,1,0
1,1,Data Scientist,The Hexagon Lavish,"JOB DESCRIPTION\r\n\r\n""THE ROAD THAT LEADS TO...",,"Atlanta, GA","['job description', 'the road that leads to ac...","['JOB', 'DESCRIPTION', 'THE', 'ROAD', 'THAT', ...",2,0,3,0,0,0,1,0,0,0
2,2,Data Scientist,Xpert Staffing,"GROWING COMPANY LOCATED IN THE ATLANTA, GA ARE...",,"Atlanta, GA","['growing company located in the atlanta', 'ar...","['GROWING', 'COMPANY', 'LOCATED', 'IN', 'THE',...",0,3,2,1,0,0,2,0,0,0
3,3,Data Analyst,Operation HOPE,DEPARTMENT: PROGRAM OPERATIONSPOSITION LOCATIO...,44.0,"Atlanta, GA 30303","['department', 'program operationsposition loc...","['DEPARTMENT', 'PROGRAM', 'OPERATIONSPOSITION'...",0,2,2,3,1,1,3,4,0,0
4,4,Others,Emory University,DESCRIPTION\r\nTHE EMORY UNIVERSITY DEPARTMENT...,550.0,"Atlanta, GA",['description the emory university department ...,"['DESCRIPTION', 'THE', 'EMORY', 'UNIVERSITY', ...",0,0,0,0,0,0,0,1,0,0


In [3]:
# Data Cleansing
X=data.description
Y=data.position

X=[re.sub(r"[^a-zA-Z0-9]+", ' ', k) for k in X]
X=[re.sub("[0-9]+",' ',k) for k in X]

#applying stemmer
ps =PorterStemmer()
X=[ps.stem(k) for k in X]

#Note: I have not removed stop words because there are important key words mentioned in job description which are of length 2, I feel they have weightage while classifing
tfidf=TfidfVectorizer()
label_enc=LabelEncoder()

X=tfidf.fit_transform(X)
Y=label_enc.fit_transform(Y)

x_train,x_test,y_train,y_test=train_test_split(X,Y,stratify=Y,test_size=0.3)

In [13]:
# first algorithm SVM
#SVM classification
svm=SVC(kernel='rbf')
svm.fit(x_train,y_train)

svm_y=svm.predict(x_test)

print('Accuracy of SVM :', accuracy_score(y_test,svm_y))
print ('Confusion Matrix of SVM : ', '\n\n', confusion_matrix(y_test,svm_y))

#crossfold Validation of 7 folds for SVM
cross_val_SVM=sklearn.model_selection.cross_validate(svm, x_train, y=y_train,cv=7)

print ('SVM Train fit score is : ', '\n\n', cross_val_SVM ['train_score'])
print ('SVM TEST score is : ', '\n\n', cross_val_SVM ['test_score'])



Accuracy of SVM : 0.38063279002876316
Confusion Matrix of SVM :  

 [[  0   0 254   0   0]
 [  0   0 234   0   0]
 [  0   0 794   0   0]
 [  0   0 415   0   0]
 [  0   0 389   0   0]]




SVM Train fit score is :  

 [0.38042696 0.38033573 0.38033573 0.38030194 0.38030194 0.38030194
 0.38021083]
SVM TEST score is :  

 [0.37965616 0.38020086 0.38020086 0.38040346 0.38040346 0.38040346
 0.38095238]




In [14]:
#Naive Bayes classification
NB=MultinomialNB()
NB.fit(x_train,y_train)
NB_y=NB.predict(x_test)

print('Accuracy of NB :', accuracy_score(y_test,NB_y))
print ('Confusion Matrix of NB : ', '\n\n', confusion_matrix(y_test,NB_y))

#crossfold Validation of 7 folds for NB
cross_val_NB=sklearn.model_selection.cross_validate(NB, x_train, y=y_train,cv=7)

print ('NB Train fit score is : ', '\n\n', cross_val_NB ['train_score'])
print ('NB TEST score is : ', '\n\n', cross_val_NB ['test_score'])

Accuracy of NB : 0.4103547459252157
Confusion Matrix of NB :  

 [[  0   0 254   0   0]
 [  0   0 231   3   0]
 [  0   0 794   0   0]
 [  0   0 358  57   0]
 [  0   0 369  15   5]]
NB Train fit score is :  

 [0.41352842 0.41438849 0.41438849 0.4128924  0.41409058 0.4121735
 0.41638716]
NB TEST score is :  

 [0.40544413 0.40602582 0.41606887 0.40634006 0.41066282 0.39913545
 0.3968254 ]


