In [1]:
# Import libraries
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import matplotlib.cm as cm
from sklearn import neighbors, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

In [39]:
data = pd.read_csv('z_train.csv')

In [3]:
data.head()

# target: 0 – Not looking for job change, 1 – Looking for a job change

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,21651,city_176,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11725 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15024 non-null  object 
 6   education_level         14957 non-null  object 
 7   major_discipline        13089 non-null  object 
 8   experience              15276 non-null  object 
 9   company_size            10592 non-null  object 
 10  company_type            10435 non-null  object 
 11  last_new_job            14987 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [5]:
data.relevent_experience.value_counts()

Has relevent experience    11068
No relevent experience      4258
Name: relevent_experience, dtype: int64

In [41]:
data['relevent_experience'] = data['relevent_experience'].apply(lambda x: 1 if x == 'Has relevent experience' else 0) 

In [6]:
data.gender.value_counts()

Male      10577
Female      998
Other       150
Name: gender, dtype: int64

In [42]:
company_sizes = list(data.company_size.unique())
comp_size_categories = {'small':['<10', '10/49','50-99'],
                        'medium':['100-500','500-999', '1000-4999'],
                        'large' : ['5000-9999','10000+']}

for cat, comp_size in comp_size_categories.items():
    data.loc[data['company_size'].isin(comp_size),'company_size'] = cat 

In [105]:
experience_ranges = {3:['<1', '1', '2', '3'],
                    7:['4','5','6','7'],
                    10:['8','9','10'],
                    15:['11','12','13','14','15'],
                    20:['16','17','18','19','20', '>20']}

In [40]:
for cat, comp_size in experience_ranges.items():
    data.loc[data['experience'].isin(comp_size),'experience'] = cat

data.experience.value_counts()

7     4058
20    3907
3     2837
15    2258
10    2216
Name: experience, dtype: int64

In [8]:
data.enrolled_university.value_counts()

no_enrollment       11071
Full time course     2990
Part time course      963
Name: enrolled_university, dtype: int64

In [43]:
data.drop(columns = ['company_size', 'company_type','gender'], inplace = True)
data.isnull().sum()

enrollee_id                  0
city                         0
city_development_index       0
relevent_experience          0
enrolled_university        302
education_level            369
major_discipline          2237
experience                  50
last_new_job               339
training_hours               0
target                       0
dtype: int64

In [71]:
test_drop = data.dropna()

In [48]:
test_drop.shape

(12653, 11)

In [55]:
test_drop.city.value_counts().nlargest(10)

other_city    6696
city_103      3113
city_21       1786
city_16       1058
Name: city, dtype: int64

In [75]:
big_cities = ['city_103', 'city_21', 'city_16']

test_drop.loc[~test_drop['city'].isin(big_cities),'city'] ='other_city'

In [56]:
test_drop.last_new_job.value_counts()

1        5594
>4       2420
2        2080
never    1058
4         755
3         746
Name: last_new_job, dtype: int64

In [60]:
test_drop.columns

Index(['city', 'city_development_index', 'relevent_experience',
       'enrolled_university', 'education_level', 'major_discipline',
       'experience', 'last_new_job', 'training_hours', 'target'],
      dtype='object')

In [76]:
test_drop.set_index('enrollee_id', inplace = True)

In [72]:
test_drop.loc[:,'city_development_index'] = pd.qcut(test_drop['city_development_index'], 3, labels = [0,1,2])

In [74]:
test_drop['city_development_index'].value_counts()

1    6837
0    4288
2    1528
Name: city_development_index, dtype: int64

In [77]:
dummy_df = pd.get_dummies(test_drop)

In [78]:
dummy_df.head()

Unnamed: 0_level_0,relevent_experience,training_hours,target,city_city_103,city_city_16,city_city_21,city_other_city,city_development_index_0,city_development_index_1,city_development_index_2,...,experience_7,experience_10,experience_15,experience_20,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8949,1,36,1.0,1,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
29725,0,47,0.0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
11561,0,83,0.0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
21651,1,24,1.0,0,0,0,1,1,0,0,...,0,0,1,0,1,0,0,0,0,0
402,1,18,1.0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0


In [79]:
X = dummy_df.drop(columns = 'target')
y = dummy_df['target']

In [80]:
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X,y)

KNeighborsClassifier()

In [81]:
y_pred = classifier.predict(X)

In [83]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.88      9445
         1.0       0.71      0.46      0.56      3208

    accuracy                           0.81     12653
   macro avg       0.77      0.70      0.72     12653
weighted avg       0.80      0.81      0.80     12653



In [95]:
data_test = pd.read_csv('z_test.csv')

In [96]:
data_test['relevent_experience'] = data_test['relevent_experience'].apply(lambda x: 1 if x == 'Has relevent experience' else 0) 

In [97]:
data_test.drop(columns = ['company_size', 'company_type','gender'], inplace = True)

In [107]:
for cat, comp_size in experience_ranges.items():
    data_test.loc[data_test['experience'].isin(comp_size),'experience'] = cat

In [99]:
data_test.loc[~data_test['city'].isin(big_cities),'city'] ='other_city'

In [100]:
data_test.set_index('enrollee_id', inplace = True)

In [101]:
data_test.loc[:,'city_development_index'] = pd.qcut(data_test['city_development_index'], 3, labels = [0,1,2])

In [109]:
data_test = pd.get_dummies(data_test)

In [113]:
data_test.columns

Index(['relevent_experience', 'training_hours', 'city_city_103',
       'city_city_16', 'city_city_21', 'city_other_city',
       'city_development_index_0', 'city_development_index_1',
       'city_development_index_2', 'enrolled_university_Full time course',
       'enrolled_university_Part time course',
       'enrolled_university_no_enrollment', 'education_level_Graduate',
       'education_level_High School', 'education_level_Masters',
       'education_level_Phd', 'education_level_Primary School',
       'major_discipline_Arts', 'major_discipline_Business Degree',
       'major_discipline_Humanities', 'major_discipline_No Major',
       'major_discipline_Other', 'major_discipline_STEM', 'experience_3',
       'experience_7', 'experience_10', 'experience_15', 'experience_20',
       'last_new_job_1', 'last_new_job_2', 'last_new_job_3', 'last_new_job_4',
       'last_new_job_>4', 'last_new_job_never'],
      dtype='object')

In [114]:
X.columns

Index(['relevent_experience', 'training_hours', 'city_city_103',
       'city_city_16', 'city_city_21', 'city_other_city',
       'city_development_index_0', 'city_development_index_1',
       'city_development_index_2', 'enrolled_university_Full time course',
       'enrolled_university_Part time course',
       'enrolled_university_no_enrollment', 'education_level_Graduate',
       'education_level_Masters', 'education_level_Phd',
       'major_discipline_Arts', 'major_discipline_Business Degree',
       'major_discipline_Humanities', 'major_discipline_No Major',
       'major_discipline_Other', 'major_discipline_STEM', 'experience_3',
       'experience_7', 'experience_10', 'experience_15', 'experience_20',
       'last_new_job_1', 'last_new_job_2', 'last_new_job_3', 'last_new_job_4',
       'last_new_job_>4', 'last_new_job_never'],
      dtype='object')

In [93]:
y_train = classifier.predict(data_test)

ValueError: query data dimension must match training data dimension

In [94]:
data_test.shape

(3832, 51)

In [115]:
def Diff(li1, li2):
    return (list(list(set(li1)-set(li2)) + list(set(li2)-set(li1))))

In [116]:
Diff(X.columns, data_test.columns)

['education_level_Primary School', 'education_level_High School']