In [93]:
import os 
import csv 
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler  

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier 


In [94]:
dtypes = {
    's.no':'int64',
    'Timestamp':'object',
    'Age' : 'float64',
    'Gender' : 'category',
    'Country' : 'category',
    'state' : 'category',
    'self_employed' : 'category',
    'family_history' : 'category',
    'treatment' : 'category',
    'work_interfere' : 'category',
    'no_employees' : 'category',
    'remote_work' : 'category',
    'tech_company' : 'category',
    'benefits' : 'category',
    'care_options' : 'category',
    'wellness_program' : 'category',
    'seek_help' : 'category',
    'anonymity' : 'category',
    'leave' : 'category',
    'mental_health_consequence' : 'category',
    'phys_health_consequence' : 'category',
    'coworkers' : 'category',
    'supervisor' : 'category',
    'mental_health_interview' : 'category',
    'phys_health_interview' : 'category',
    'mental_vs_physical' : 'category',
    'obs_consequence' : 'category',
    'comments' : 'str',    
}

# Importing Data

In [115]:
train_df = pd.read_csv('trainms.csv',dtype = dtypes)
test_df = pd.read_csv('testms.csv',dtype = dtypes)
samdf = pd.read_csv('samplems.csv', dtype=dtypes)
test_df['treatment'] = samdf['treatment']

# Data Preprocessing

In [116]:
frames = [train_df, test_df]
result = pd.concat(frames)

#Male
result.loc[result['Gender'] == 'M','Gender'] = 'Male'
result.loc[result['Gender'] == 'male','Gender'] = 'Male'
result.loc[result['Gender'] == 'msle','Gender'] = 'Male'
result.loc[result['Gender'] == 'm','Gender'] = 'Male'
result.loc[result['Gender'] == 'maile','Gender'] = 'Male'
result.loc[result['Gender'] == 'mal','Gender'] = 'Male'
result.loc[result['Gender'] == 'Male-ish','Gender'] = 'Male'
result.loc[result['Gender'] == 'ostensibly male, unsure what that really means','Gender'] = 'Male'
result.loc[result['Gender'] == 'Cis Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'something kinda male?','Gender'] = 'Male'
result.loc[result['Gender'] == 'make','Gender'] = 'Male'
result.loc[result['Gender'] == 'Make','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'Cis Male','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis Male','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis male','Gender'] = 'Male'
result.loc[result['Gender'] == 'Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'man','Gender'] = 'Male'



#Female
result.loc[result['Gender'] == 'F','Gender'] = 'Female'
result.loc[result['Gender'] == 'female','Gender'] = 'Female'
result.loc[result['Gender'] == 'f','Gender'] = 'Female'
result.loc[result['Gender'] == 'Cis Female','Gender'] = 'Female'
result.loc[result['Gender'] == 'Femake','Gender'] = 'Female'
result.loc[result['Gender'] == 'cis-female/femme','Gender'] = 'Female'
result.loc[result['Gender'] == 'Female (cis)','Gender'] = 'Female'
result.loc[result['Gender'] == 'cis female','Gender'] = 'Female'
result.loc[result['Gender'] == 'Woman','Gender'] = 'Female'

#Transgender
result.loc[result['Gender'] == 'Trans woman','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Female (trans)','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Female (trans)','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Trans-female','Gender'] = 'Transgender'


#Others
result.loc[result['Gender'] == 'non-binary','Gender'] = 'Others'
result.loc[result['Gender'] == 'Nah','Gender'] = 'Others'
result.loc[result['Gender'] == 'Enby','Gender'] = 'Others'
result.loc[result['Gender'] == 'fluid','Gender'] = 'Others'
result.loc[result['Gender'] == 'Genderqueer','Gender'] = 'Others'
result.loc[result['Gender'] == 'Androgyne','Gender'] = 'Others'
result.loc[result['Gender'] == 'Agender','Gender'] = 'Others'
result.loc[result['Gender'] == 'Guy (-ish) ^_^','Gender'] = 'Others'
result.loc[result['Gender'] == 'male leaning androgynous','Gender'] = 'Others'
result.loc[result['Gender'] == 'Neuter','Gender'] = 'Others'
result.loc[result['Gender'] == 'queer','Gender'] = 'Others'
result.loc[result['Gender'] == 'A little about you','Gender'] = 'Others'
result.loc[result['Gender'] == 'p','Gender'] = 'Others'


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# Data Stacking

In [117]:
data = result.loc[:,['s.no','Age']]

data['Country'] = pd.factorize(result['Country'], sort=True)[0]
data['Gender'] = pd.factorize(result['Gender'], sort=True)[0]
data['Timestamp'] = pd.factorize(result['Timestamp'], sort=True)[0]
data['anonymity'] = pd.factorize(result['anonymity'], sort=True)[0]
data['benefits'] = pd.factorize(result['benefits'], sort=True)[0]
data['care_options'] = pd.factorize(result['care_options'], sort=True)[0]
data['comments'] = pd.factorize(result['comments'], sort=True)[0]
data['coworkers'] = pd.factorize(result['coworkers'], sort=True)[0]
data['family_history'] = pd.factorize(result['family_history'], sort=True)[0]
data['leave'] = pd.factorize(result['leave'], sort=True)[0]
data['mental_health_consequence'] = pd.factorize(result['mental_health_consequence'], sort=True)[0]
data['mental_health_interview'] = pd.factorize(result['mental_health_interview'], sort=True)[0]
data['mental_vs_physical'] = pd.factorize(result['mental_vs_physical'], sort=True)[0]
data['no_employees'] = pd.factorize(result['no_employees'], sort=True)[0]
data['obs_consequence'] = pd.factorize(result['obs_consequence'], sort=True)[0]
data['phys_health_consequence'] = pd.factorize(result['phys_health_consequence'], sort=True)[0]
data['phys_health_interview'] = pd.factorize(result['phys_health_interview'], sort=True)[0]
data['remote_work'] = pd.factorize(result['remote_work'], sort=True)[0]
data['seek_help'] = pd.factorize(result['seek_help'], sort=True)[0]
data['self_employed'] = pd.factorize(result['self_employed'], sort=True)[0]
data['state'] = pd.factorize(result['state'], sort=True)[0]
data['supervisor'] = pd.factorize(result['supervisor'], sort=True)[0]
data['tech_company'] = pd.factorize(result['tech_company'], sort=True)[0]
data['treatment'] = pd.factorize(result['treatment'], sort=True)[0]
data['wellness_program'] = pd.factorize(result['wellness_program'], sort=True)[0]
data['work_interfere'] = pd.factorize(result['work_interfere'], sort=True)[0]




In [118]:
treatment = data['treatment']
y = treatment                    #treatment is the predicting class


In [119]:
sno = data['s.no']
age = data['Age']
country = data['Country']
Gender = data['Gender'] 
Timestamp = data['Timestamp']
anonymity = data['anonymity'] 
benefits = data['benefits']
care_options = data['care_options']
comments = data['comments'] 
coworkers = data['coworkers']
family_history = data['family_history'] 
leave = data['leave'] 
mental_health_consequence = data['mental_health_consequence']
mental_health_interview = data['mental_health_interview'] 
mental_vs_physical = data['mental_vs_physical']
no_employees = data['no_employees'] 
obs_consequence = data['obs_consequence'] 
phys_health_consequence = data['phys_health_consequence'] 
phys_health_interview = data['phys_health_interview'] 
remote_work = data['remote_work'] 
seek_help = data['seek_help'] 
self_employed = data['self_employed']
state = data['state'] 
supervisor = data['supervisor'] 
tech_company = data['tech_company'] 
wellness_program = data['wellness_program'] 
work_interfere = data['work_interfere']

In [120]:
x = np.column_stack((age,country,Gender,Timestamp,anonymity,benefits,care_options,coworkers,family_history,leave,mental_health_consequence,mental_health_interview,mental_vs_physical,no_employees,obs_consequence,phys_health_consequence,phys_health_interview,remote_work,seek_help,self_employed,state,supervisor,tech_company,wellness_program,work_interfere))


# Model Training

In [121]:
etc_model = ExtraTreesClassifier(n_estimators=10)
etc_model.fit(x, y)
print(etc_model.feature_importances_)

[0.05176119 0.03509247 0.02263754 0.09460576 0.02522961 0.03182027
 0.03624065 0.0313689  0.06148293 0.03626774 0.03679671 0.01520048
 0.03173112 0.03820893 0.02041769 0.02874327 0.03324063 0.02496846
 0.02820681 0.01594751 0.03478054 0.02645434 0.01906991 0.02877961
 0.19094692]


In [122]:
x1_train = x[:len(train_df)]
x1_test = x[len(train_df):]
y1_train = y[:len(train_df)]
y1_test = y[len(train_df):]


logreg = LogisticRegression().fit(x1_train,y1_train)

print("Training set score: {:.3f}".format(logreg.score(x1_train,y1_train)))
print("Test set score: {:.3f}".format(logreg.score(x1_test,y1_test)))

Training set score: 0.831
Test set score: 0.429


