In [444]:
import numpy as np
import pandas as pd
import random
random.seed(666)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import coo_matrix, hstack
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

In [395]:
train = pd.read_csv('../../data/aug_train.csv')
test = pd.read_csv('../../data/aug_test.csv')

In [396]:
train.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


In [397]:
train.corr()

Unnamed: 0,enrollee_id,city_development_index,training_hours,target
enrollee_id,1.0,-0.040455,0.000998,0.049475
city_development_index,-0.040455,1.0,0.00192,-0.341665
training_hours,0.000998,0.00192,1.0,-0.021577
target,0.049475,-0.341665,-0.021577,1.0


In [398]:
test.corr()

Unnamed: 0,enrollee_id,city_development_index,training_hours
enrollee_id,1.0,-0.045087,0.003239
city_development_index,-0.045087,1.0,-0.022653
training_hours,0.003239,-0.022653,1.0


In [399]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [400]:
# y = train['target'].astype('int')
# train = train.drop('target', axis=1)

In [401]:
train.isna().sum()/train.shape[0]

enrollee_id               0.000000
city                      0.000000
city_development_index    0.000000
gender                    0.235306
relevent_experience       0.000000
enrolled_university       0.020148
education_level           0.024011
major_discipline          0.146832
experience                0.003393
company_size              0.309949
company_type              0.320493
last_new_job              0.022080
training_hours            0.000000
target                    0.000000
dtype: float64

In [402]:
test.isna().sum()/test.shape[0]

enrollee_id               0.000000
city                      0.000000
city_development_index    0.000000
gender                    0.238610
relevent_experience       0.000000
enrolled_university       0.014561
education_level           0.024425
major_discipline          0.146548
experience                0.002349
company_size              0.292156
company_type              0.297792
last_new_job              0.018788
training_hours            0.000000
dtype: float64

In [403]:
obj_column = train.dtypes[train.dtypes == 'object'].index
for obj in obj_column:
    print(train[obj].value_counts(normalize=True), '\n')

city_103    0.227320
city_21     0.141038
city_16     0.080019
city_114    0.069736
city_160    0.044107
              ...   
city_129    0.000157
city_111    0.000157
city_121    0.000157
city_140    0.000052
city_171    0.000052
Name: city, Length: 123, dtype: float64 

Male      0.902457
Female    0.084505
Other     0.013038
Name: gender, dtype: float64 

Has relevent experience    0.719908
No relevent experience     0.280092
Name: relevent_experience, dtype: float64 

no_enrollment       0.736043
Full time course    0.200139
Part time course    0.063818
Name: enrolled_university, dtype: float64 

Graduate          0.620280
Masters           0.233234
High School       0.107872
Phd               0.022141
Primary School    0.016472
Name: education_level, dtype: float64 

STEM               0.886632
Humanities         0.040930
Other              0.023310
Business Degree    0.020006
Arts               0.015479
No Major           0.013643
Name: major_discipline, dtype: float64 

>20    0

In [404]:
train.isna().sum()/train.shape[0] < 0.1

enrollee_id                True
city                       True
city_development_index     True
gender                    False
relevent_experience        True
enrolled_university        True
education_level            True
major_discipline          False
experience                 True
company_size              False
company_type              False
last_new_job               True
training_hours             True
target                     True
dtype: bool

In [405]:
def fill_NA(df, *cols):
    temp_df = df[df.columns[df.isna().sum()/df.shape[0] < 0.1]]
    temp_df = temp_df.fillna('NA').drop('enrollee_id', axis=1)
    oh_enc = OneHotEncoder()
    oh_enc.fit(temp_df)
    for col in cols:
        X_test = temp_df[df[col].isna()] #X_test, y_test will be predicted
        X_train = temp_df[~df[col].isna()] #X_train
        y_train = df[~df[col].isna()][col] #y_train
        lb_enc = LabelEncoder()
        y_train_enc = lb_enc.fit_transform(y_train)
        X_train_enc = oh_enc.transform(X_train)
        X_test_enc = oh_enc.transform(X_test)
#         clf = LogisticRegression(max_iter = 1000, n_jobs=-1)
        clf = KNeighborsClassifier(n_neighbors=5)
        clf.fit(X_train_enc, y_train_enc)
        y_pred = clf.predict(X_test_enc)
        print(np.unique(y_pred, return_counts=True))
        y_pred = lb_enc.inverse_transform(y_pred)
        df.loc[df.isna()[col], col] = y_pred

In [406]:
fill_NA(train, 'gender', 'major_discipline', 'company_size', 'company_type')

(array([0, 1, 2]), array([  37, 4470,    1], dtype=int64))
(array([0, 1, 2, 3, 4, 5]), array([  11,    8,   43,    8,    7, 2736], dtype=int64))
(array([0, 1, 2, 3, 4, 5, 6, 7]), array([1121, 1379,  473,  941, 1351,  148,   53,  472], dtype=int64))
(array([0, 1, 2, 3, 4, 5]), array([  84,  142,   92,    3,  408, 5411], dtype=int64))


In [407]:
fill_NA(test, 'gender', 'major_discipline', 'company_size', 'company_type')

(array([0, 1]), array([  9, 499], dtype=int64))
(array([2, 5]), array([  4, 308], dtype=int64))
(array([0, 1, 2, 3, 4, 5, 6, 7]), array([122, 170,  41, 105,  96,  13,  20,  55], dtype=int64))
(array([0, 1, 2, 4, 5]), array([ 14,   1,   2,  35, 582], dtype=int64))


In [408]:
global_enc = OneHotEncoder(handle_unknown='ignore')

In [420]:
y_train = train['target']
X_train = train.drop(['target', 'enrollee_id'], axis=1).fillna('NA')
X_test = test.drop(['enrollee_id'], axis=1).fillna('NA')

In [421]:
X_train_obj = X_train[X_train.columns[X_train.dtypes == 'object']]
X_test_obj = X_test[X_test.columns[X_test.dtypes == 'object']]

In [422]:
global_enc.fit(X_train_obj)
X_train_obj_sparse = global_enc.transform(X_train_obj)
X_test_obj_sparse = global_enc.transform(X_test_obj)

In [423]:
scl = MinMaxScaler()
X_train_th = scl.fit_transform(X_train[['training_hours', 'city_development_index']])
X_test_th = scl.fit_transform(X_test[['training_hours', 'city_development_index']])

In [424]:
X_train = hstack([X_train_obj_sparse, X_train_th])
X_test = hstack([X_test_obj_sparse, X_test_th])

In [425]:
X_train, X_holdout, y_train, y_holdout = train_test_split(X_train, y_train, train_size=0.7)

In [447]:
clf_1 = LogisticRegression(n_jobs=-1)
clf_2 = RandomForestClassifier(n_jobs=-1)
clf_3 = LinearSVC()

In [448]:
clf_1.fit(X_train, y_train)
clf_2.fit(X_train, y_train)
clf_3.fit(X_train, y_train)

LinearSVC()

In [449]:
# y_pred = clf_1.predict(X_test).astype('int')
y_pred = clf_2.predict(X_test).astype('int')

In [450]:
roc_auc_score(y_holdout, clf_1.predict(X_holdout))

0.6377448172577215

In [451]:
f1_score(y_holdout, clf_1.predict(X_holdout))

0.442367601246106

In [452]:
# pd.DataFrame(index=test['enrollee_id'], data=y_pred, columns=['target'])

In [453]:
roc_auc_score(y_holdout, clf_2.predict(X_holdout))

0.6598056117902334

In [454]:
f1_score(y_holdout, clf_2.predict(X_holdout))

0.48341232227488146

In [455]:
roc_auc_score(y_holdout, clf_3.predict(X_holdout))

0.6462178898698075

In [456]:
f1_score(y_holdout, clf_3.predict(X_holdout))

0.45842498900131984