In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn import metrics

In [37]:
#  Problem Statement: the agency wants you to design a model that uses the current credentials/demographics/experience 
#  to predict the probability of an enrollee to look for a new job.

df_raw= pd.read_csv('train_jqd04QH.csv')
# print(df.columns)
print(df_raw.head())

   enrollee_id      city  city_development_index gender  \
0        23798  city_149                   0.689   Male   
1        29166   city_83                   0.923   Male   
2           46   city_16                   0.910    NaN   
3        18527   city_64                   0.666   Male   
4        21751  city_100                   0.887    NaN   

       relevent_experience enrolled_university education_level  \
0  Has relevent experience       no_enrollment        Graduate   
1  Has relevent experience       no_enrollment        Graduate   
2  Has relevent experience       no_enrollment        Graduate   
3  Has relevent experience       no_enrollment        Graduate   
4   No relevent experience       no_enrollment         Masters   

  major_discipline experience company_size    company_type last_new_job  \
0             STEM          3      100-500         Pvt Ltd            1   
1             STEM         14          <10  Funded Startup            1   
2             STEM     

In [38]:
df_raw.dtypes

enrollee_id                 int64
city                       object
city_development_index    float64
gender                     object
relevent_experience        object
enrolled_university        object
education_level            object
major_discipline           object
experience                 object
company_size               object
company_type               object
last_new_job               object
training_hours              int64
target                      int64
dtype: object

In [39]:
for c in df_raw.columns:
    print(c, df_raw[c].isnull().sum())

enrollee_id 0
city 0
city_development_index 0
gender 4098
relevent_experience 0
enrolled_university 342
education_level 457
major_discipline 2838
experience 59
company_size 4779
company_type 5039
last_new_job 367
training_hours 0
target 0


In [40]:
obj_df = df_raw.select_dtypes(include=['object']).copy()
obj_df

Unnamed: 0,city,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job
0,city_149,Male,Has relevent experience,no_enrollment,Graduate,STEM,3,100-500,Pvt Ltd,1
1,city_83,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,<10,Funded Startup,1
2,city_16,,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Public Sector,2
3,city_64,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,50-99,Pvt Ltd,1
4,city_100,,No relevent experience,no_enrollment,Masters,STEM,8,,,2
...,...,...,...,...,...,...,...,...,...,...
18354,city_103,Male,Has relevent experience,Full time course,Graduate,STEM,5,<10,Pvt Ltd,1
18355,city_160,Male,No relevent experience,no_enrollment,Graduate,Humanities,15,50-99,Pvt Ltd,1
18356,city_114,Male,Has relevent experience,no_enrollment,Masters,STEM,11,50-99,Pvt Ltd,3
18357,city_75,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,10/49,Pvt Ltd,3


In [41]:
# Make strings into numbers 
def makeCategories(columnname, newcolumnname, dataframe, newdataframe):
#     newdataframe[columnname] = dataframe[columnname].astype('category')
    newdataframe[newcolumnname] = dataframe[columnname].astype('category').cat.codes
# def makeCategories(columnname, newcolumnname):
#     df_raw[columnname] = df_raw[columnname].astype('category')
#     df_raw[newcolumnname] = df_raw[columnname].cat.codes

df_cat = pd.DataFrame()
makeCategories('city', 'city_cat', df_raw, df_cat)
makeCategories('gender', 'gender_cat', df_raw, df_cat)
makeCategories('relevent_experience', 'relevent_experience_cat', df_raw, df_cat)
makeCategories('enrolled_university', 'enrolled_university_cat', df_raw, df_cat)
makeCategories('education_level', 'education_level_cat', df_raw, df_cat)
makeCategories('major_discipline', 'major_discipline_cat', df_raw, df_cat)
makeCategories('experience', 'experience_cat', df_raw, df_cat)
makeCategories('company_size', 'company_size_cat', df_raw, df_cat)
makeCategories('company_type', 'company_type_cat', df_raw, df_cat)
makeCategories('last_new_job', 'last_new_job_cat', df_raw, df_cat)
print(df_cat)

       city_cat  gender_cat  relevent_experience_cat  enrolled_university_cat  \
0            41           1                        0                        2   
1           112           1                        0                        2   
2            48          -1                        0                        2   
3            93           1                        0                        2   
4             2          -1                        1                        2   
...         ...         ...                      ...                      ...   
18354         5           1                        0                        0   
18355        49           1                        1                        2   
18356        13           1                        0                        2   
18357       103           1                        0                        2   
18358         7           0                        0                        2   

       education_level_cat 

In [42]:
# First method of filling in missing values -  put a number to indicate null
# for c in df_raw.columns:
#     print(c, df_raw[c].isnull().sum())

df_cat['enrollee_id'] = df_raw['enrollee_id']
df_cat['city_development_index'] = df_raw['city_development_index']
df_cat['training_hours'] = df_raw['training_hours']
df_cat['target'] = df_raw['target']

for c in df_cat.columns:
    print(c, df_cat[c].isnull().sum())
    print(df_cat[c].size)
    print(pd.unique(df_cat[c]))


city_cat 0
18359
[ 41 112  48  93   2  64  13   5 120  49  94 116 103  30  47  95  70   1
 101 104   6  69  71  91 122  78  36 115  15  24 102  96   0  57  77  21
  43  51 114  74  81  83  82  29 118  60  50 113  31  55  61  99  46  65
   4   7 117  11  26  89  63  33  75  14  85  53  18  37  22 105   3  42
  58   9  56  40 121  52  17  25  23  66 110  68  32 119  98  39  45  12
 106 109  73  97 100  80  38  76  90  84  27  20  35  79  16  88  87 108
  92  86   8  44  28  54  62  19  59 107  72  10  67 111  34]
gender_cat 0
18359
[ 1 -1  0  2]
relevent_experience_cat 0
18359
[0 1]
enrolled_university_cat 0
18359
[ 2  0  1 -1]
education_level_cat 0
18359
[ 0  2  1  3 -1  4]
major_discipline_cat 0
18359
[ 5  4  3 -1  1  0  2]
experience_cat 0
18359
[13  5 16 18 21 14 19  6  1  0 15  7  2  3 17 11  4 20 10  9  8 -1 12]
company_size_cat 0
18359
[ 1  7  4 -1  6  3  2  5  0]
company_type_cat 0
18359
[ 5  1  4 -1  0  2  3]
last_new_job_cat 0
18359
[ 0  1  2  4  5  3 -1]
enrollee_id 0
18359
[2

In [43]:
# Second method of filling in missing values - list wise deletion

In [44]:
# Third method of filling in missing values - pair wise deletion

In [45]:
m = RandomForestRegressor(n_jobs=-1)
m.fit(df_cat.drop('target', axis=1), df_cat.target)
m.score(df_cat.drop('target', axis=1), df_cat.target)



0.8003598705484867

In [None]:
# results = pd.DataFrame(data=m.predict(testingmodelready))
# print(results)