In [1]:
import os 
import csv 
import random
import statsmodels.api as sm
import statsmodels.formula.api as smf

import numpy as np
import pandas as pd
import matplotlib as pt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import MinMaxScaler  
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score




# Importing Data

In [2]:
dtypes = {
    's.no':'int64',
    'Timestamp':'object',
    'Age' : 'float64',
    'Gender' : 'category',
    'Country' : 'category',
    'state' : 'category',
    'self_employed' : 'category',
    'family_history' : 'category',
    'treatment' : 'category',
    'work_interfere' : 'category',
    'no_employees' : 'category',
    'remote_work' : 'category',
    'tech_company' : 'category',
    'benefits' : 'category',
    'care_options' : 'category',
    'wellness_program' : 'category',
    'seek_help' : 'category',
    'anonymity' : 'category',
    'leave' : 'category',
    'mental_health_consequence' : 'category',
    'phys_health_consequence' : 'category',
    'coworkers' : 'category',
    'supervisor' : 'category',
    'mental_health_interview' : 'category',
    'phys_health_interview' : 'category',
    'mental_vs_physical' : 'category',
    'obs_consequence' : 'category',
    'comments' : 'str'    
}

In [4]:
train_df = pd.read_csv('trainms.csv',dtype = dtypes)
test_df = pd.read_csv('testms.csv',dtype = dtypes)
samdf = pd.read_csv('samplems.csv', dtype=dtypes)
test_df['treatment'] = samdf['treatment']

In [14]:
print(train_df.isnull().sum())

s.no                           0
Timestamp                      0
Age                            0
Gender                         0
Country                        0
state                          0
self_employed                  0
family_history                 0
treatment                      0
work_interfere                 0
no_employees                   0
remote_work                    0
tech_company                   0
benefits                       0
care_options                   0
wellness_program               0
seek_help                      0
anonymity                      0
leave                          0
mental_health_consequence      0
phys_health_consequence        0
coworkers                      0
supervisor                     0
mental_health_interview        0
phys_health_interview          0
mental_vs_physical             0
obs_consequence                0
comments                     873
dtype: int64


In [5]:
train_df['state'] = train_df['state'].fillna('CA')


In [6]:
train_df['self_employed'] = train_df['self_employed'].fillna('No')

In [7]:
train_df['work_interfere'] = train_df['work_interfere'].fillna('Sometimes')

# Preprocessing and Cleaning

In [9]:
frames = [train_df, test_df]
result = pd.concat(frames)

#Male
result.loc[result['Gender'] == 'M','Gender'] = 'Male'
result.loc[result['Gender'] == 'male','Gender'] = 'Male'
result.loc[result['Gender'] == 'malr','Gender'] = 'Male'
result.loc[result['Gender'] == 'Malr','Gender'] = 'Male'
result.loc[result['Gender'] == 'mail','Gender'] = 'Male'
result.loc[result['Gender'] == 'Male ','Gender'] = 'Male'
result.loc[result['Gender'] == 'msle','Gender'] = 'Male'
result.loc[result['Gender'] == 'm','Gender'] = 'Male'
result.loc[result['Gender'] == 'maile','Gender'] = 'Male'
result.loc[result['Gender'] == 'mal','Gender'] = 'Male'
result.loc[result['Gender'] == 'Mal','Gender'] = 'Male'
result.loc[result['Gender'] == 'Male-ish','Gender'] = 'Male'
result.loc[result['Gender'] == 'ostensibly male, unsure what that really means','Gender'] = 'Male'
result.loc[result['Gender'] == 'Cis Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'something kinda male?','Gender'] = 'Male'
result.loc[result['Gender'] == 'make','Gender'] = 'Male'
result.loc[result['Gender'] == 'Make','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'Cis Male','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis Male','Gender'] = 'Male'
result.loc[result['Gender'] == 'cis male','Gender'] = 'Male'
result.loc[result['Gender'] == 'Man','Gender'] = 'Male'
result.loc[result['Gender'] == 'man','Gender'] = 'Male'



#Female
result.loc[result['Gender'] == 'F','Gender'] = 'Female'
result.loc[result['Gender'] == 'female','Gender'] = 'Female'
result.loc[result['Gender'] == 'femail','Gender'] = 'Female'
result.loc[result['Gender'] == 'Female ','Gender'] = 'Female'
result.loc[result['Gender'] == 'f','Gender'] = 'Female'
result.loc[result['Gender'] == 'Cis Female','Gender'] = 'Female'
result.loc[result['Gender'] == 'Femake','Gender'] = 'Female'
result.loc[result['Gender'] == 'cis-female/femme','Gender'] = 'Female'
result.loc[result['Gender'] == 'Female (cis)','Gender'] = 'Female'
result.loc[result['Gender'] == 'cis female','Gender'] = 'Female'
result.loc[result['Gender'] == 'Woman','Gender'] = 'Female'
result.loc[result['Gender'] == 'woman','Gender'] = 'Female'

#Transgender
result.loc[result['Gender'] == 'Trans woman','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Female (trans)','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Female (trans)','Gender'] = 'Transgender'
result.loc[result['Gender'] == 'Trans-female','Gender'] = 'Transgender'


#Others
result.loc[result['Gender'] == 'non-binary','Gender'] = 'Others'
result.loc[result['Gender'] == 'Nah','Gender'] = 'Others'
result.loc[result['Gender'] == 'Enby','Gender'] = 'Others'
result.loc[result['Gender'] == 'fluid','Gender'] = 'Others'
result.loc[result['Gender'] == 'Genderqueer','Gender'] = 'Others'
result.loc[result['Gender'] == 'Androgyne','Gender'] = 'Others'
result.loc[result['Gender'] == 'Agender','Gender'] = 'Others'
result.loc[result['Gender'] == 'Guy (-ish) ^_^','Gender'] = 'Others'
result.loc[result['Gender'] == 'male leaning androgynous','Gender'] = 'Others'
result.loc[result['Gender'] == 'Neuter','Gender'] = 'Others'
result.loc[result['Gender'] == 'queer','Gender'] = 'Others'
result.loc[result['Gender'] == 'A little about you','Gender'] = 'Others'
result.loc[result['Gender'] == 'p','Gender'] = 'Others'


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


# Feature Engineering and Selection

In [16]:
result['Gender'].unique()

array(['Female', 'Male', 'Transgender', 'Others'], dtype=object)

In [10]:
df_sex = pd.get_dummies(result['Gender'])

In [11]:
df_new = pd.concat([result, df_sex], axis=1)
result['Male'] = df_new['Male']
result['Female'] = df_new['Female']
result['Transgender'] = df_new['Transgender']
result['Others'] = df_new['Others']

#1 Result - No cange in Accuracy

In [27]:
result['no_employees'].unique()

[6-25, More than 1000, 26-100, 100-500, 1-5, 500-1000]
Categories (6, object): [6-25, More than 1000, 26-100, 100-500, 1-5, 500-1000]

# Data Combining

In [14]:
data = result.loc[:,['s.no','Age','Male','Female','Transgender','Others']]

data['Country'] = pd.factorize(result['Country'], sort=True)[0]
#1 - change gender with dummy variables--> male,female,Transgender,Others
data['Gender'] = pd.factorize(result['Gender'], sort=True)[0]

data['Timestamp'] = pd.factorize(result['Timestamp'], sort=True)[0]
data['anonymity'] = pd.factorize(result['anonymity'], sort=True)[0]
data['benefits'] = pd.factorize(result['benefits'], sort=True)[0]
data['care_options'] = pd.factorize(result['care_options'], sort=True)[0]
data['comments'] = pd.factorize(result['comments'], sort=True)[0]
data['coworkers'] = pd.factorize(result['coworkers'], sort=True)[0]
data['family_history'] = pd.factorize(result['family_history'], sort=True)[0]
data['leave'] = pd.factorize(result['leave'], sort=True)[0]
data['mental_health_consequence'] = pd.factorize(result['mental_health_consequence'], sort=True)[0]
data['mental_health_interview'] = pd.factorize(result['mental_health_interview'], sort=True)[0]
data['mental_vs_physical'] = pd.factorize(result['mental_vs_physical'], sort=True)[0]
data['no_employees'] = pd.factorize(result['no_employees'], sort=True)[0]
data['obs_consequence'] = pd.factorize(result['obs_consequence'], sort=True)[0]
data['phys_health_consequence'] = pd.factorize(result['phys_health_consequence'], sort=True)[0]
data['phys_health_interview'] = pd.factorize(result['phys_health_interview'], sort=True)[0]
data['remote_work'] = pd.factorize(result['remote_work'], sort=True)[0]
data['seek_help'] = pd.factorize(result['seek_help'], sort=True)[0]
data['self_employed'] = pd.factorize(result['self_employed'], sort=True)[0]
data['state'] = pd.factorize(result['state'], sort=True)[0]
data['supervisor'] = pd.factorize(result['supervisor'], sort=True)[0]
data['tech_company'] = pd.factorize(result['tech_company'], sort=True)[0]
data['treatment'] = pd.factorize(result['treatment'], sort=True)[0]
data['wellness_program'] = pd.factorize(result['wellness_program'], sort=True)[0]
data['work_interfere'] = pd.factorize(result['work_interfere'], sort=True)[0]




In [15]:
treatment = data['treatment']
y = treatment                    #treatment is the predicting class


In [16]:
age = data['Age']
country = data['Country']

#1 - newly added dummy variable

Male = data['Male']
Female = data['Female']
Transgender = data['Transgender']
Others = data['Others']

#--------------------------

anonymity = data['anonymity'] 
benefits = data['benefits']
care_options = data['care_options']
comments = data['comments'] 
coworkers = data['coworkers']
family_history = data['family_history'] 
leave = data['leave'] 
mental_health_consequence = data['mental_health_consequence']
mental_vs_physical = data['mental_vs_physical']
obs_consequence = data['obs_consequence'] 
phys_health_consequence = data['phys_health_consequence'] 
seek_help = data['seek_help'] 
state = data['state'] 
supervisor = data['supervisor'] 
wellness_program = data['wellness_program'] 
work_interfere = data['work_interfere']

In [21]:
x = np.column_stack((age,country,Male,Female,Transgender,Others,anonymity,benefits,care_options,family_history,leave,mental_health_consequence,mental_vs_physical,obs_consequence,phys_health_consequence,seek_help,seek_help,supervisor,state,wellness_program,work_interfere))


# Extra Tree Classifier

In [49]:
etc_model = ExtraTreesClassifier(n_estimators=10)
etc_model.fit(x, y)
print(etc_model.feature_importances_)

[0.07810634 0.04679768 0.01463305 0.01260081 0.03262006 0.04656871
 0.0470228  0.06280472 0.05783373 0.04197852 0.02706317 0.04176448
 0.05478107 0.02377242 0.0375697  0.04993054 0.03145559 0.04051454
 0.01706633 0.07119133 0.04223664 0.02562795 0.03302756 0.06303225]


# Model Training

In [27]:
x1_train = x[:len(train_df)]
x1_test = x[len(train_df):len(train_df)+1]
y1_train = y[:len(train_df)]
y1_test = y[len(train_df):len(train_df)+1]



logreg = LogisticRegression().fit(x1_train,y1_train)

print("Training set score: {:.3f}".format(logreg.score(x1_train,y1_train)))
print("Test set score: {:.3f}".format(logreg.score(x1_test,y1_test)))

Training set score: 0.726
Test set score: 0.000




In [36]:
y_pred = logreg.predict(x1_test)

a = y_pred[0]




In [37]:
if a == 1:
    return 1

SyntaxError: 'return' outside function (<ipython-input-37-a4882210525a>, line 2)

In [64]:
result_train = result[:len(train_df):]

result_test = result[len(train_df):]


result_train

Unnamed: 0,Age,Country,Gender,Timestamp,anonymity,benefits,care_options,comments,coworkers,family_history,...,tech_company,treatment,wellness_program,work_interfere,Male,Female,Transgender,Others,no_employees_lower,no_employees_upper
0,37.0,United States,Female,2014-08-27 11:29:31,Yes,Yes,Not sure,,Some of them,No,...,Yes,Yes,No,Often,0,1,0,0,,
1,44.0,United States,Male,2014-08-27 11:29:37,Don't know,Don't know,No,,No,No,...,No,No,Don't know,Rarely,1,0,0,0,,
2,32.0,Canada,Male,2014-08-27 11:29:44,Don't know,No,No,,Yes,No,...,Yes,No,No,Rarely,1,0,0,0,,
3,31.0,United Kingdom,Male,2014-08-27 11:29:46,No,No,Yes,,Some of them,Yes,...,Yes,Yes,No,Often,1,0,0,0,,
4,31.0,United States,Male,2014-08-27 11:30:22,Don't know,Yes,No,,Some of them,No,...,Yes,No,Don't know,Never,1,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,35.0,United Kingdom,Male,2014-08-29 09:19:25,Don't know,Don't know,Not sure,,Some of them,No,...,Yes,No,No,Sometimes,1,0,0,0,,
996,39.0,United States,Male,2014-08-29 09:24:34,Yes,Yes,Yes,,Some of them,No,...,Yes,Yes,No,Rarely,1,0,0,0,,
997,31.0,United States,Female,2014-08-29 09:23:22,Don't know,Yes,Not sure,,No,Yes,...,No,No,No,Never,0,1,0,0,,
998,32.0,United Kingdom,Male,2014-08-29 09:23:44,Don't know,No,No,,No,No,...,Yes,No,No,Rarely,1,0,0,0,,


In [65]:
result_train.to_csv('result_train.csv')

result_test.to_csv('result_test.csv')