In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
df = pd.read_csv('survey.csv')

In [4]:
df.shape

(1259, 27)

In [5]:
print(df.columns)
df

Index(['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed',
       'family_history', 'treatment', 'work_interfere', 'no_employees',
       'remote_work', 'tech_company', 'benefits', 'care_options',
       'wellness_program', 'seek_help', 'anonymity', 'leave',
       'mental_health_consequence', 'phys_health_consequence', 'coworkers',
       'supervisor', 'mental_health_interview', 'phys_health_interview',
       'mental_vs_physical', 'obs_consequence', 'comments'],
      dtype='object')


Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments
0,2014-08-27 11:29:31,37,Female,United States,IL,,No,Yes,Often,6-25,...,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No,
1,2014-08-27 11:29:37,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Maybe,No,No,No,No,No,Don't know,No,
2,2014-08-27 11:29:44,32,Male,Canada,,,No,No,Rarely,6-25,...,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No,
3,2014-08-27 11:29:46,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes,
4,2014-08-27 11:30:22,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No,
5,2014-08-27 11:31:22,33,Male,United States,TN,,Yes,No,Sometimes,6-25,...,Don't know,No,No,Yes,Yes,No,Maybe,Don't know,No,
6,2014-08-27 11:31:50,35,Female,United States,MI,,Yes,Yes,Sometimes,1-5,...,Somewhat difficult,Maybe,Maybe,Some of them,No,No,No,Don't know,No,
7,2014-08-27 11:32:05,39,M,Canada,,,No,No,Never,1-5,...,Don't know,No,No,No,No,No,No,No,No,
8,2014-08-27 11:32:39,42,Female,United States,IL,,Yes,Yes,Sometimes,100-500,...,Very difficult,Maybe,No,Yes,Yes,No,Maybe,No,No,
9,2014-08-27 11:32:43,23,Male,Canada,,,No,No,Never,26-100,...,Don't know,No,No,Yes,Yes,Maybe,Maybe,Yes,No,


In [6]:
df.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
dtype: int64

In [7]:
for col in df.columns:
    if col == 'Timestamp' or col == 'comments' :
        continue;
    print(col + ':\n' + str(df[col].unique().tolist()) + '\n')


Age:
[37, 44, 32, 31, 33, 35, 39, 42, 23, 29, 36, 27, 46, 41, 34, 30, 40, 38, 50, 24, 18, 28, 26, 22, 19, 25, 45, 21, -29, 43, 56, 60, 54, 329, 55, 99999999999, 48, 20, 57, 58, 47, 62, 51, 65, 49, -1726, 5, 53, 61, 8, 11, -1, 72]

Gender:
['Female', 'M', 'Male', 'male', 'female', 'm', 'Male-ish', 'maile', 'Trans-female', 'Cis Female', 'F', 'something kinda male?', 'Cis Male', 'Woman', 'f', 'Mal', 'Male (CIS)', 'queer/she/they', 'non-binary', 'Femake', 'woman', 'Make', 'Nah', 'All', 'Enby', 'fluid', 'Genderqueer', 'Female ', 'Androgyne', 'Agender', 'cis-female/femme', 'Guy (-ish) ^_^', 'male leaning androgynous', 'Male ', 'Man', 'Trans woman', 'msle', 'Neuter', 'Female (trans)', 'queer', 'Female (cis)', 'Mail', 'cis male', 'A little about you', 'Malr', 'p', 'femail', 'Cis Man', 'ostensibly male, unsure what that really means']

Country:
['United States', 'Canada', 'United Kingdom', 'Bulgaria', 'France', 'Portugal', 'Netherlands', 'Switzerland', 'Poland', 'Australia', 'Germany', 'Russia'

In [8]:
# Genderの名寄せ
col = "Gender"
df[col].replace("All","all", inplace=True)
df_gender = pd.crosstab(index=df["treatment"], columns=df[col], margins=True)
display(df_gender.T.sort_values("All",ascending=False))

treatment,No,Yes,All
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
All,622,637,1259
Male,330,285,615
male,112,94,206
Female,37,84,121
M,69,47,116
female,16,46,62
F,12,26,38
m,20,14,34
f,6,9,15
Make,1,3,4


In [9]:
encode_rule = {
    "Male":"Male",
    "Male ": "Male",
    "Female":"Female",
    "Female ": "Female",
    "male":"Male",
    "female":"Female",
    "M":"Male",
    "m": "Male",
    "F":"Female",
    "f": "Female",
    "Make": "Male",
    "Mail": "Male",
    "Malr": "Male",
    "Mal": "Male",
    "Femake": "Female",
    "msle": "Male",
    "woman": "Female",
    "femail": "Female",
    "maile": "Male",
    "Woman": "Female",
    "Man": "Male",
    "Female (trans)": "non-binary",
    "Cis Male": "non-binary",
    "non-binary": "non-binary",
    "cis-female/femme": "non-binary",
    "fluid": "non-binary",
    "ostensibly male, unsure what that really means": "non-binary",
    "something kinda male?": "non-binary",
    "queer": "non-binary",
    "male leaning androgynous": "non-binary",
    "cis male": "non-binary",
    "queer/she/they": "non-binary",
    "all": "non-binary",
    "Guy (-ish) ^_^": "non-binary",
    "Androgyne": "non-binary",
    "Cis Female": "non-binary",
    "Cis Man": "non-binary",
    "Female (cis)": "non-binary",
    "Genderqueer": "non-binary",
    "Trans-female": "non-binary",
    "Male (CIS)": "non-binary",
    "Male-ish": "non-binary",
    "Agender": "non-binary",
    "Neuter": "non-binary",
    "Trans woman": "non-binary",
    "Nah": "non-binary",
    "Enby": "non-binary",
    "p": "non-binary",
    "A little about you": "non-binary"
}
df["Gender2"] = df["Gender"].map(encode_rule)

In [10]:
col = "Gender2"
df_gender = pd.crosstab(index=df["treatment"], columns=df[col], margins=True)
display(df_gender)

Gender2,Female,Male,non-binary,All
treatment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,76,539,7,622
Yes,168,446,23,637
All,244,985,30,1259


In [11]:
# ageの外れ値
df.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                         515
self_employed                  18
family_history                  0
treatment                       0
work_interfere                264
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1095
Gender2                         0
dtype: int64

In [12]:
df = df[(df["Age"]>=0) & (df["Age"] <= 100)]

In [13]:
def name_identification(df, columns):
    for col in columns:
        df[col] = df[col].replace("Don't know", "No")
    return df

columns = ["mental_vs_physical", "anonymity", "seek_help", "wellness_program", "benefits"]

df = name_identification(df, columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
df["self_employed"].fillna("No", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [15]:
df["work_interfere"].fillna("Never", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [16]:
df["care_options"] = df["care_options"].replace("Not sure", "No")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
df["state"].fillna("nothing", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [18]:
df["comment_flag"] = df["comments"].map(lambda x:1 if not pd.isnull(x) else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
for col in df.columns:
    if col == 'Timestamp' or col == 'comments' :
        continue;
    print(col + ':\n' + str(df[col].unique().tolist()) + '\n')


Age:
[37, 44, 32, 31, 33, 35, 39, 42, 23, 29, 36, 27, 46, 41, 34, 30, 40, 38, 50, 24, 18, 28, 26, 22, 19, 25, 45, 21, 43, 56, 60, 54, 55, 48, 20, 57, 58, 47, 62, 51, 65, 49, 5, 53, 61, 8, 11, 72]

Gender:
['Female', 'M', 'Male', 'male', 'female', 'm', 'Male-ish', 'maile', 'Trans-female', 'Cis Female', 'F', 'something kinda male?', 'Cis Male', 'Woman', 'f', 'Mal', 'Male (CIS)', 'queer/she/they', 'non-binary', 'Femake', 'woman', 'Make', 'Nah', 'Enby', 'fluid', 'Genderqueer', 'Female ', 'Androgyne', 'Agender', 'cis-female/femme', 'Guy (-ish) ^_^', 'male leaning androgynous', 'Male ', 'Man', 'Trans woman', 'msle', 'Neuter', 'Female (trans)', 'queer', 'Female (cis)', 'Mail', 'cis male', 'A little about you', 'Malr', 'femail', 'Cis Man', 'ostensibly male, unsure what that really means']

Country:
['United States', 'Canada', 'United Kingdom', 'Bulgaria', 'France', 'Portugal', 'Netherlands', 'Switzerland', 'Poland', 'Australia', 'Germany', 'Russia', 'Mexico', 'Brazil', 'Slovenia', 'Costa Rica'

In [20]:
df.isnull().sum()

Timestamp                       0
Age                             0
Gender                          0
Country                         0
state                           0
self_employed                   0
family_history                  0
treatment                       0
work_interfere                  0
no_employees                    0
remote_work                     0
tech_company                    0
benefits                        0
care_options                    0
wellness_program                0
seek_help                       0
anonymity                       0
leave                           0
mental_health_consequence       0
phys_health_consequence         0
coworkers                       0
supervisor                      0
mental_health_interview         0
phys_health_interview           0
mental_vs_physical              0
obs_consequence                 0
comments                     1091
Gender2                         0
comment_flag                    0
dtype: int64

In [21]:
encode_rule = {
    "Yes": 1,
    "No": 0,
}
columns = [
    "self_employed",
    "family_history",
    "treatment",
    "remote_work",
    "tech_company",
    "benefits",
    "care_options",
    "wellness_program",
    "seek_help",
    "anonymity",
    "mental_vs_physical",
    "obs_consequence",
]

for col in columns:
    df[col] = df[col].map(encode_rule)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
df.head()

Unnamed: 0,Timestamp,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence,comments,Gender2,comment_flag
0,2014-08-27 11:29:31,37,Female,United States,IL,0,0,1,Often,6-25,...,No,Some of them,Yes,No,Maybe,1,0,,Female,0
1,2014-08-27 11:29:37,44,M,United States,IN,0,0,0,Rarely,More than 1000,...,No,No,No,No,No,0,0,,Male,0
2,2014-08-27 11:29:44,32,Male,Canada,nothing,0,0,0,Rarely,6-25,...,No,Yes,Yes,Yes,Yes,0,0,,Male,0
3,2014-08-27 11:29:46,31,Male,United Kingdom,nothing,0,1,1,Often,26-100,...,Yes,Some of them,No,Maybe,Maybe,0,1,,Male,0
4,2014-08-27 11:30:22,31,Male,United States,TX,0,0,0,Never,100-500,...,No,Some of them,Yes,Yes,Yes,0,0,,Male,0


In [23]:
columns = [
    "Gender2",
    "Country",
    "state",
    "work_interfere",
    "no_employees",
    "leave",
    "mental_health_consequence",
    "phys_health_consequence",
    "coworkers",
    "supervisor",
    "mental_health_interview",
    "phys_health_interview",
]

df = pd.concat([df, pd.get_dummies(df[columns])], axis=1)
df = df.drop(columns, axis=1)
df = df.drop(['Gender', 'comments', 'Timestamp'], axis=1)
df.head()

Unnamed: 0,Age,self_employed,family_history,treatment,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,...,coworkers_Yes,supervisor_No,supervisor_Some of them,supervisor_Yes,mental_health_interview_Maybe,mental_health_interview_No,mental_health_interview_Yes,phys_health_interview_Maybe,phys_health_interview_No,phys_health_interview_Yes
0,37,0,0,1,0,1,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0
1,44,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,1,0
2,32,0,0,0,0,1,0,0,0,0,...,1,0,0,1,0,0,1,0,0,1
3,31,0,1,1,0,1,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
4,31,0,0,0,1,1,1,0,0,0,...,0,0,0,1,0,0,1,0,0,1


In [24]:
df.corr().style.background_gradient().format('{:.2f}')

Unnamed: 0,Age,self_employed,family_history,treatment,remote_work,tech_company,benefits,care_options,wellness_program,seek_help,anonymity,mental_vs_physical,obs_consequence,comment_flag,Gender2_Female,Gender2_Male,Gender2_non-binary,Country_Australia,Country_Austria,"Country_Bahamas, The",Country_Belgium,Country_Bosnia and Herzegovina,Country_Brazil,Country_Bulgaria,Country_Canada,Country_China,Country_Colombia,Country_Costa Rica,Country_Croatia,Country_Czech Republic,Country_Denmark,Country_Finland,Country_France,Country_Georgia,Country_Germany,Country_Greece,Country_Hungary,Country_India,Country_Ireland,Country_Israel,Country_Italy,Country_Japan,Country_Latvia,Country_Mexico,Country_Moldova,Country_Netherlands,Country_New Zealand,Country_Nigeria,Country_Norway,Country_Philippines,Country_Poland,Country_Portugal,Country_Romania,Country_Russia,Country_Singapore,Country_Slovenia,Country_South Africa,Country_Spain,Country_Sweden,Country_Switzerland,Country_Thailand,Country_United Kingdom,Country_United States,Country_Uruguay,state_AL,state_AZ,state_CA,state_CO,state_CT,state_DC,state_FL,state_GA,state_IA,state_ID,state_IL,state_IN,state_KS,state_KY,state_LA,state_MA,state_MD,state_ME,state_MI,state_MN,state_MO,state_MS,state_NC,state_NE,state_NH,state_NJ,state_NM,state_NV,state_NY,state_OH,state_OK,state_OR,state_PA,state_RI,state_SC,state_SD,state_TN,state_TX,state_UT,state_VA,state_VT,state_WA,state_WI,state_WV,state_WY,state_nothing,work_interfere_Never,work_interfere_Often,work_interfere_Rarely,work_interfere_Sometimes,no_employees_1-5,no_employees_100-500,no_employees_26-100,no_employees_500-1000,no_employees_6-25,no_employees_More than 1000,leave_Don't know,leave_Somewhat difficult,leave_Somewhat easy,leave_Very difficult,leave_Very easy,mental_health_consequence_Maybe,mental_health_consequence_No,mental_health_consequence_Yes,phys_health_consequence_Maybe,phys_health_consequence_No,phys_health_consequence_Yes,coworkers_No,coworkers_Some of them,coworkers_Yes,supervisor_No,supervisor_Some of them,supervisor_Yes,mental_health_interview_Maybe,mental_health_interview_No,mental_health_interview_Yes,phys_health_interview_Maybe,phys_health_interview_No,phys_health_interview_Yes
Age,1.0,0.06,0.01,0.08,0.14,-0.06,0.15,0.12,0.12,0.17,0.03,-0.04,0.07,0.02,-0.08,0.09,-0.04,-0.05,-0.04,-0.09,-0.02,-0.03,-0.04,-0.03,-0.09,0.03,-0.03,0.02,0.03,0.0,0.02,-0.02,-0.01,-0.05,-0.04,0.02,-0.02,-0.09,-0.0,-0.06,0.01,0.07,-0.01,0.02,-0.02,-0.0,0.03,-0.03,0.0,-0.0,-0.02,0.0,-0.01,-0.05,0.02,-0.05,0.04,-0.01,-0.05,-0.02,0.03,-0.06,0.17,-0.02,0.01,0.01,0.05,0.01,0.02,0.02,0.08,0.01,0.06,0.09,-0.02,0.06,0.02,-0.04,0.01,0.03,0.01,0.03,0.02,-0.0,0.01,0.0,0.02,-0.03,0.02,-0.0,-0.01,-0.02,-0.03,-0.01,-0.03,0.04,0.02,-0.03,-0.02,0.02,0.05,-0.01,-0.03,0.09,0.0,0.03,0.03,-0.03,0.05,-0.15,-0.03,-0.02,0.03,0.02,0.02,-0.0,-0.07,0.06,-0.1,0.12,0.01,0.0,-0.02,0.05,-0.04,0.01,-0.06,0.06,0.09,-0.1,0.03,0.02,-0.01,-0.01,0.0,-0.02,0.01,-0.06,0.05,0.0,-0.0,0.05,-0.06
self_employed,0.06,1.0,0.0,0.02,0.32,0.08,-0.18,0.05,-0.01,-0.04,0.09,0.16,0.07,0.03,-0.05,0.03,0.05,-0.01,0.03,0.08,-0.02,-0.01,0.05,-0.02,0.03,0.08,-0.01,-0.01,0.05,-0.01,0.11,-0.02,0.01,-0.01,0.09,-0.01,0.08,0.05,0.05,-0.02,0.01,0.08,-0.01,0.03,-0.01,0.07,-0.03,-0.01,-0.01,-0.01,0.04,-0.01,-0.01,-0.02,-0.02,-0.01,0.05,-0.01,-0.03,0.01,0.08,0.05,-0.16,-0.01,0.01,0.01,-0.05,-0.0,-0.02,-0.02,0.03,-0.01,-0.02,-0.01,-0.01,-0.04,-0.02,-0.02,-0.01,-0.03,0.0,-0.01,-0.01,-0.05,-0.04,0.08,-0.01,-0.01,-0.02,-0.02,-0.01,-0.02,-0.03,-0.02,0.01,-0.04,-0.01,-0.01,-0.02,-0.02,-0.03,-0.04,-0.03,0.08,-0.02,-0.04,0.02,-0.01,-0.01,0.16,-0.05,0.08,-0.04,0.02,0.58,-0.11,-0.15,-0.08,-0.01,-0.17,-0.19,0.06,0.03,0.12,0.08,-0.06,0.08,-0.02,-0.0,-0.02,0.04,-0.05,-0.01,0.07,-0.03,-0.01,0.04,0.05,-0.07,0.06,0.04,-0.05,0.01
family_history,0.01,0.0,1.0,0.38,0.02,-0.05,0.15,0.12,0.05,0.06,0.05,-0.01,0.12,0.12,0.14,-0.15,0.02,0.04,-0.01,0.04,-0.03,0.04,-0.01,-0.05,-0.01,-0.02,-0.03,-0.02,0.01,0.04,0.05,-0.04,-0.05,-0.02,-0.03,-0.03,0.04,-0.05,-0.02,-0.02,-0.02,0.04,-0.02,-0.01,-0.02,-0.03,-0.02,-0.02,0.04,-0.02,-0.02,-0.03,-0.02,-0.04,-0.02,-0.02,0.02,0.04,-0.02,0.01,-0.02,-0.06,0.12,-0.02,0.03,0.01,0.07,-0.03,-0.02,0.01,-0.03,0.02,-0.02,0.04,0.04,0.02,-0.04,-0.02,-0.02,-0.04,-0.0,0.04,0.01,0.02,0.04,-0.02,0.02,0.01,0.03,-0.01,-0.03,-0.04,-0.0,-0.03,-0.01,0.03,0.05,-0.02,-0.02,-0.01,0.08,-0.04,-0.01,-0.01,-0.01,0.05,0.04,-0.02,0.01,-0.11,-0.35,0.12,0.06,0.23,0.03,0.05,-0.01,0.03,-0.07,0.0,-0.05,0.06,0.01,0.03,-0.01,0.02,-0.09,0.07,0.03,-0.05,0.05,0.01,-0.01,0.01,-0.0,-0.0,0.0,-0.06,0.07,-0.03,-0.06,0.06,-0.0
treatment,0.08,0.02,0.38,1.0,0.03,-0.03,0.21,0.27,0.08,0.09,0.13,0.0,0.15,0.11,0.18,-0.2,0.07,0.03,-0.05,0.03,-0.05,-0.03,-0.02,-0.0,0.0,-0.03,-0.04,-0.03,0.04,-0.03,0.04,-0.02,-0.07,-0.03,-0.01,-0.04,-0.03,-0.04,-0.01,-0.06,-0.05,0.03,-0.03,-0.02,0.03,-0.05,0.02,-0.03,-0.03,-0.03,0.01,-0.04,-0.03,-0.05,-0.03,0.03,0.02,-0.03,-0.03,-0.01,-0.03,-0.0,0.1,-0.03,0.05,0.05,0.08,-0.01,-0.0,-0.03,0.01,-0.0,0.03,0.03,0.06,-0.01,-0.05,-0.04,0.03,-0.0,-0.04,0.03,-0.01,0.02,-0.03,0.03,-0.0,-0.0,0.02,-0.0,-0.04,0.02,0.01,0.05,-0.02,0.03,-0.01,-0.03,-0.01,-0.02,-0.04,0.02,0.02,-0.02,-0.05,0.04,0.05,-0.03,-0.0,-0.1,-0.68,0.25,0.16,0.41,0.04,0.03,0.01,-0.02,-0.07,0.02,-0.1,0.1,-0.01,0.1,-0.01,0.04,-0.12,0.1,0.03,-0.04,0.03,-0.05,-0.0,0.06,0.02,0.01,-0.03,-0.1,0.08,0.02,-0.05,0.02,0.04
remote_work,0.14,0.32,0.02,0.03,1.0,0.13,-0.1,0.02,-0.07,-0.05,-0.0,0.07,-0.04,0.06,-0.01,0.01,0.01,0.02,0.04,0.04,-0.05,0.04,-0.02,0.06,0.0,0.04,-0.03,0.04,0.06,-0.02,-0.03,0.0,-0.01,-0.02,0.02,0.06,0.04,-0.04,-0.04,-0.04,-0.05,0.04,-0.02,0.04,-0.02,-0.0,0.01,-0.02,-0.02,-0.02,0.04,-0.03,0.04,0.0,0.03,-0.02,0.03,-0.02,-0.05,-0.05,-0.02,-0.07,0.04,0.04,0.02,-0.05,-0.12,0.01,0.03,0.03,0.09,0.01,-0.04,0.04,-0.02,-0.0,0.0,0.01,-0.02,0.0,0.04,0.04,0.02,-0.03,0.01,0.04,0.03,-0.03,0.08,0.03,0.06,0.0,-0.02,0.05,0.03,0.03,0.0,-0.02,0.01,0.0,0.02,0.04,0.05,0.03,0.0,-0.03,0.1,-0.02,0.02,-0.04,-0.05,0.04,0.01,0.02,0.27,-0.04,-0.02,-0.01,0.01,-0.17,-0.08,-0.0,-0.02,0.06,0.08,-0.06,0.04,0.03,0.02,-0.02,0.01,-0.05,-0.02,0.07,-0.0,-0.04,0.04,0.06,-0.08,0.05,0.03,-0.04,0.01
tech_company,-0.06,0.08,-0.05,-0.03,0.13,1.0,-0.08,-0.04,-0.14,-0.12,-0.04,0.08,-0.07,-0.0,-0.07,0.08,-0.01,-0.0,0.02,0.01,-0.06,0.01,0.03,0.03,-0.03,0.01,0.02,0.01,0.02,0.01,0.02,-0.02,0.03,-0.06,0.03,0.02,0.01,0.02,0.06,-0.0,-0.02,0.01,0.01,0.02,0.01,0.04,0.01,0.01,0.01,0.01,0.04,0.02,-0.06,0.02,0.03,0.01,-0.03,0.01,0.04,0.04,0.01,-0.07,-0.02,0.01,-0.05,0.01,0.11,0.04,-0.01,-0.01,-0.01,-0.02,-0.05,0.01,-0.08,-0.06,-0.02,-0.07,-0.06,0.03,-0.04,0.01,-0.02,-0.06,-0.04,0.01,-0.01,-0.03,0.02,0.0,-0.03,0.02,0.02,0.03,-0.09,0.06,0.0,0.01,0.03,0.02,-0.04,-0.02,-0.02,0.01,-0.02,0.04,-0.06,-0.06,-0.03,0.02,0.02,-0.04,-0.03,0.03,0.12,-0.05,0.03,-0.08,0.15,-0.19,-0.02,-0.03,0.0,-0.01,0.06,-0.07,0.12,-0.07,-0.07,0.07,0.01,-0.05,-0.01,0.07,-0.04,-0.01,0.05,0.07,-0.08,0.04,0.03,-0.02,-0.02
benefits,0.15,-0.18,0.15,0.21,-0.1,-0.08,1.0,0.43,0.41,0.45,0.31,0.1,0.02,0.01,0.14,-0.15,0.04,-0.05,-0.0,0.04,-0.01,-0.02,-0.05,-0.04,0.01,-0.02,-0.03,-0.02,0.01,-0.02,-0.03,-0.04,-0.05,-0.02,-0.11,-0.03,-0.02,-0.07,-0.08,-0.05,-0.06,-0.02,-0.02,-0.04,-0.02,-0.07,-0.04,-0.02,0.04,-0.02,-0.06,-0.03,-0.02,-0.0,-0.04,-0.02,-0.05,-0.02,-0.01,-0.01,-0.02,-0.23,0.38,-0.02,0.01,0.05,0.17,0.03,0.01,0.01,0.0,0.08,0.01,-0.02,0.07,0.02,-0.0,0.03,0.04,0.11,0.02,-0.02,0.01,0.01,-0.01,-0.02,0.04,0.01,0.03,0.07,-0.03,-0.04,0.09,-0.02,-0.01,0.06,0.08,-0.02,0.0,0.03,0.04,0.0,-0.02,0.04,-0.0,0.13,0.02,-0.02,-0.03,-0.37,-0.12,0.01,0.1,0.04,-0.21,0.06,-0.03,0.09,-0.2,0.3,0.03,-0.05,0.03,-0.1,0.03,0.06,-0.04,-0.03,-0.01,0.02,-0.03,-0.02,0.03,-0.02,-0.05,0.03,0.02,-0.05,0.05,-0.02,-0.07,0.11,-0.05
care_options,0.12,0.05,0.12,0.27,0.02,-0.04,0.43,1.0,0.31,0.33,0.39,0.14,0.09,0.01,0.09,-0.11,0.08,0.02,-0.0,0.04,-0.03,-0.02,-0.0,-0.04,0.02,0.04,0.01,0.04,0.01,0.04,0.05,-0.04,-0.03,-0.02,-0.07,-0.03,-0.02,-0.07,-0.02,-0.02,-0.06,0.04,0.04,-0.04,0.04,-0.06,0.05,-0.02,0.04,-0.02,-0.03,0.01,0.04,-0.0,-0.04,-0.02,-0.0,-0.02,0.01,-0.01,-0.02,-0.12,0.16,-0.02,0.01,0.08,0.03,0.06,-0.01,0.02,-0.0,0.05,-0.04,0.04,0.05,0.01,-0.0,0.01,-0.02,0.05,-0.02,-0.02,-0.02,0.01,-0.06,-0.02,0.02,0.01,0.03,0.02,-0.03,0.03,0.06,-0.0,-0.03,-0.0,0.05,-0.02,0.03,-0.0,-0.02,-0.0,0.02,0.0,-0.0,0.07,0.08,-0.02,0.01,-0.16,-0.22,0.06,0.1,0.11,0.08,-0.03,-0.02,-0.01,-0.12,0.1,-0.17,0.01,0.08,0.08,0.08,-0.02,0.0,0.02,-0.03,0.02,0.02,-0.02,-0.01,0.03,-0.08,0.02,0.05,-0.03,0.01,0.02,-0.03,0.03,-0.0
wellness_program,0.12,-0.01,0.05,0.08,-0.07,-0.14,0.41,0.31,1.0,0.6,0.29,0.22,0.08,0.04,0.05,-0.06,0.04,0.07,-0.02,0.06,-0.03,-0.01,-0.03,-0.03,-0.0,-0.01,-0.02,-0.01,-0.02,-0.01,0.08,-0.02,-0.01,-0.01,-0.05,-0.02,-0.01,-0.04,-0.03,0.0,-0.04,0.06,-0.01,-0.02,-0.01,-0.03,0.01,-0.01,-0.01,-0.01,-0.01,-0.02,-0.01,-0.02,-0.03,-0.01,-0.03,-0.01,-0.01,0.02,-0.01,-0.09,0.13,-0.01,-0.04,0.02,0.07,-0.02,0.01,0.01,-0.01,0.04,-0.03,-0.01,0.04,0.02,0.02,0.04,-0.01,-0.03,0.04,-0.01,-0.02,0.01,-0.03,-0.01,0.03,0.03,0.02,-0.0,-0.02,-0.02,0.04,-0.02,-0.0,-0.0,0.01,-0.01,0.04,0.02,0.02,-0.06,0.0,0.01,-0.02,0.14,0.02,-0.01,-0.02,-0.13,-0.06,-0.01,0.06,0.02,-0.06,-0.04,-0.06,0.06,-0.17,0.29,-0.07,-0.04,0.09,-0.06,0.08,-0.05,0.07,-0.02,-0.0,0.01,-0.01,-0.06,0.04,0.01,-0.09,0.0,0.09,-0.03,0.03,-0.01,-0.04,0.05,-0.02
seek_help,0.17,-0.04,0.06,0.09,-0.05,-0.12,0.45,0.33,0.6,1.0,0.32,0.22,0.09,0.06,0.03,-0.04,0.03,0.06,-0.02,0.06,-0.03,-0.01,-0.03,-0.03,-0.05,-0.01,-0.02,-0.01,-0.02,-0.01,-0.02,-0.02,-0.05,-0.01,-0.05,-0.02,-0.01,-0.04,-0.03,0.0,-0.04,-0.01,-0.01,-0.02,-0.01,-0.03,0.01,-0.01,-0.01,-0.01,-0.01,-0.02,-0.01,-0.02,-0.03,-0.01,-0.03,-0.01,-0.01,0.02,-0.01,-0.06,0.16,-0.01,-0.04,-0.01,0.1,0.01,0.01,0.01,0.02,0.05,-0.03,-0.01,0.03,0.01,-0.02,0.0,-0.01,-0.03,-0.04,-0.01,-0.04,0.0,-0.03,-0.01,0.0,0.03,0.02,-0.01,-0.02,-0.02,0.05,-0.01,-0.01,0.08,0.02,-0.01,0.0,0.06,0.0,-0.03,0.04,0.02,-0.02,0.12,0.05,-0.01,-0.02,-0.16,-0.07,-0.02,0.09,0.02,-0.07,-0.02,-0.11,0.05,-0.15,0.32,-0.07,-0.01,0.08,-0.05,0.05,-0.05,0.09,-0.05,-0.02,0.03,-0.02,-0.05,0.02,0.03,-0.1,-0.01,0.1,-0.01,-0.0,0.03,-0.05,0.04,0.01


In [25]:
X = df.drop("treatment", axis=1)
y = df["treatment"]

In [26]:
lr = LogisticRegression()

lr.fit(X,y)

print(lr.coef_,lr.intercept_)
y_pred = lr.predict(X)

print(classification_report(y,y_pred))

[[ 0.02976373 -0.11349338  0.96666125 -0.09138126 -0.05672257  0.70288707
   0.65026433 -0.09811783 -0.42249938  0.418545    0.08920237  0.19029255
   0.15260171  0.23748985 -0.45169721 -0.07817561  0.23190024 -0.52143254
   0.02548302 -0.6207435  -0.42912983  0.10906975  0.56120443  0.2335361
  -0.30569643 -0.60066324 -0.08542011  0.29809158 -0.69365784  0.32194197
   0.23722951  0.11459245 -0.26700694  0.25324993 -0.0529143  -0.51090326
   0.50331445  0.51817541 -0.44979847 -0.3788487   0.1537037  -0.05054149
   0.304468    0.06939478 -0.06636708  0.18300629 -0.03506546 -0.17125847
  -0.34867124  0.21650208 -0.06002963 -0.02185763 -0.43441205 -0.45204726
   0.59645314  0.50062147 -0.06939318 -0.5082942   0.45749863 -0.02244402
   0.91069945  0.08882837 -0.02475086  0.47153428  0.79017298  0.24477114
   0.56220922  0.28632188 -0.17065067  0.17718652 -0.03103505  0.92525281
   0.10067507  0.07800329  0.12236873 -0.23273658 -0.27275137  0.06903658
  -0.40117969 -0.99248574  0.25877545 -