In [2]:
# Import libraries
import pandas as pd
import numpy as np
%matplotlib inline

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
import matplotlib.cm as cm
from sklearn import neighbors, datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

In [49]:
data = pd.read_csv('z_train.csv')

In [4]:
data.head()

# target: 0 – Not looking for job change, 1 – Looking for a job change

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,21651,city_176,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15326 entries, 0 to 15325
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             15326 non-null  int64  
 1   city                    15326 non-null  object 
 2   city_development_index  15326 non-null  float64
 3   gender                  11725 non-null  object 
 4   relevent_experience     15326 non-null  object 
 5   enrolled_university     15024 non-null  object 
 6   education_level         14957 non-null  object 
 7   major_discipline        13089 non-null  object 
 8   experience              15276 non-null  object 
 9   company_size            10592 non-null  object 
 10  company_type            10435 non-null  object 
 11  last_new_job            14987 non-null  object 
 12  training_hours          15326 non-null  int64  
 13  target                  15326 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [6]:
data.relevent_experience.value_counts()

Has relevent experience    11068
No relevent experience      4258
Name: relevent_experience, dtype: int64

In [50]:
data['relevent_experience'] = data['relevent_experience'].apply(lambda x: 1 if x == 'Has relevent experience' else 0) 

In [51]:
data['gender'].fillna('Other', inplace = True)

In [52]:
data.gender.value_counts()

Male      10577
Other      3751
Female      998
Name: gender, dtype: int64

In [42]:
# company_sizes = list(data.company_size.unique())
# comp_size_categories = {'small':['<10', '10/49','50-99'],
#                         'medium':['100-500','500-999', '1000-4999'],
#                         'large' : ['5000-9999','10000+']}

# for cat, comp_size in comp_size_categories.items():
#     data.loc[data['company_size'].isin(comp_size),'company_size'] = cat 

In [53]:
experience_ranges = {3:['<1', '1', '2', '3'],
                    7:['4','5','6','7'],
                    10:['8','9','10'],
                    15:['11','12','13','14','15'],
                    20:['16','17','18','19','20', '>20']}

In [54]:
for cat, comp_size in experience_ranges.items():
    data.loc[data['experience'].isin(comp_size),'experience'] = cat

data.experience.value_counts()

7     4058
20    3907
3     2837
15    2258
10    2216
Name: experience, dtype: int64

In [8]:
data.enrolled_university.value_counts()

no_enrollment       11071
Full time course     2990
Part time course      963
Name: enrolled_university, dtype: int64

In [55]:
data.drop(columns = ['company_size', 'company_type'], inplace = True)
data.isnull().sum()

enrollee_id                  0
city                         0
city_development_index       0
gender                       0
relevent_experience          0
enrolled_university        302
education_level            369
major_discipline          2237
experience                  50
last_new_job               339
training_hours               0
target                       0
dtype: int64

In [56]:
data['major_discipline'].fillna('No Major', inplace = True)

In [57]:
data.isnull().sum()

enrollee_id                 0
city                        0
city_development_index      0
gender                      0
relevent_experience         0
enrolled_university       302
education_level           369
major_discipline            0
experience                 50
last_new_job              339
training_hours              0
target                      0
dtype: int64

In [58]:
data.dropna(inplace = True)

In [17]:
data.city.value_counts().nlargest(10)

city_103    3368
city_21     1975
city_16     1204
city_114     999
city_160     647
city_136     440
city_67      332
city_75      229
city_102     225
city_104     222
Name: city, dtype: int64

In [59]:
big_cities = ['city_103', 'city_21', 'city_16']

data.loc[~data['city'].isin(big_cities),'city'] ='other_city'

In [60]:
data.set_index('enrollee_id', inplace = True)

In [61]:
data.loc[:,'city_development_index'] = pd.qcut(data['city_development_index'], 3, labels = [0,1,2])

In [62]:
dummy_df = pd.get_dummies(data)

In [63]:
dummy_df.head()

Unnamed: 0_level_0,relevent_experience,training_hours,target,city_city_103,city_city_16,city_city_21,city_other_city,city_development_index_0,city_development_index_1,city_development_index_2,...,experience_7,experience_10,experience_15,experience_20,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8949,1,36,1.0,1,0,0,0,0,1,0,...,0,0,0,1,1,0,0,0,0,0
29725,0,47,0.0,0,0,0,1,1,0,0,...,0,0,1,0,0,0,0,0,1,0
11561,0,83,0.0,0,0,1,0,1,0,0,...,1,0,0,0,0,0,0,0,0,1
21651,1,24,1.0,0,0,0,1,1,0,0,...,0,0,1,0,1,0,0,0,0,0
28806,1,24,0.0,0,0,0,1,0,1,0,...,1,0,0,0,1,0,0,0,0,0


In [64]:
X = dummy_df.drop(columns = 'target')
y = dummy_df['target']

In [65]:
classifier = neighbors.KNeighborsClassifier()
classifier.fit(X,y)

KNeighborsClassifier()

In [66]:
y_pred = classifier.predict(X)

In [67]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

         0.0       0.84      0.94      0.88     10868
         1.0       0.70      0.44      0.54      3547

    accuracy                           0.82     14415
   macro avg       0.77      0.69      0.71     14415
weighted avg       0.80      0.82      0.80     14415



In [68]:
data_test = pd.read_csv('z_test.csv')

In [69]:
data_test['relevent_experience'] = data_test['relevent_experience'].apply(lambda x: 1 if x == 'Has relevent experience' else 0) 

In [70]:
data_test.drop(columns = ['company_size', 'company_type'], inplace = True)

In [71]:
for cat, comp_size in experience_ranges.items():
    data_test.loc[data_test['experience'].isin(comp_size),'experience'] = cat

In [72]:
data_test.loc[~data_test['city'].isin(big_cities),'city'] ='other_city'

In [73]:
data_test.set_index('enrollee_id', inplace = True)

In [74]:
data_test.loc[:,'city_development_index'] = pd.qcut(data_test['city_development_index'], 3, labels = [0,1,2])

In [75]:
data_test['major_discipline'].fillna('No Major', inplace = True)

In [76]:
data['gender'].fillna('Other', inplace = True)

In [77]:
data_test = pd.get_dummies(data_test)
data_test.head()

Unnamed: 0_level_0,relevent_experience,training_hours,city_city_103,city_city_16,city_city_21,city_other_city,city_development_index_0,city_development_index_1,city_development_index_2,gender_Female,...,experience_7,experience_10,experience_15,experience_20,last_new_job_1,last_new_job_2,last_new_job_3,last_new_job_4,last_new_job_>4,last_new_job_never
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23603,0,78,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
22499,1,36,0,0,0,1,0,1,0,0,...,1,0,0,0,1,0,0,0,0,0
10465,0,34,0,0,0,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,1
8293,1,149,0,0,0,1,0,1,0,1,...,0,1,0,0,1,0,0,0,0,0
4246,1,7,1,0,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,0,0


In [80]:
y_test = classifier.predict(data_test)

In [81]:
data_test['target'] = y_test

In [82]:
data_test['target'].to_csv('KNNv1.csv')

In [47]:
def Diff(li1, li2):
    return (list(list(set(li1)-set(li2)) + list(set(li2)-set(li1))))

In [48]:
Diff(X.columns, data_test.columns)

['gender_Unknown']