### *Modeling & Accuracy of Models*
---

In [19]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

In [20]:
df = pd.read_csv('../datasets/cleandata.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,What is your age?,Does your employer provide mental health benefits as part of healthcare coverage?_No,Does your employer provide mental health benefits as part of healthcare coverage?_Not eligible for coverage / N/A,Does your employer provide mental health benefits as part of healthcare coverage?_Yes,Do you know the options for mental health care available under your employer-provided coverage?_No,Do you know the options for mental health care available under your employer-provided coverage?_Yes,"Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?_No","Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?_Yes",Does your employer offer resources to learn more about mental health concerns and options for seeking help?_No,...,"Do you feel that being identified as a person with a mental health issue would hurt your career?_No, it has not","Do you feel that being identified as a person with a mental health issue would hurt your career?_Yes, I think it would","Do you feel that being identified as a person with a mental health issue would hurt your career?_Yes, it has","Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?_No, I don't think they would","Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?_No, they do not","Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?_Yes, I think they would","Do you think that team members/co-workers would view you more negatively if they knew you suffered from a mental health issue?_Yes, they do",Have you ever sought treatment for a mental health issue from a mental health professional?_1,What is your gender?_male,What is your gender?_other
0,0,39.0,0,1,0,1,0,1,0,1,...,0,0,0,1,0,0,0,0,1,0
1,1,29.0,1,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,1,1,0
2,2,38.0,1,0,0,1,0,1,0,1,...,0,0,0,0,0,0,0,1,1,0
3,3,43.0,0,0,1,1,0,1,0,1,...,0,1,0,0,0,0,0,1,1,0
4,4,43.0,0,0,1,0,1,1,0,1,...,0,1,0,0,0,0,0,1,0,0


### Drop Unnamed: 0 Column

In [21]:
df.drop(columns = 'Unnamed: 0', inplace=True)

### Set X and y Variable

In [22]:
X = df.drop(columns = 'Have you ever sought treatment for a mental health issue from a mental health professional?_1')
y = df['Have you ever sought treatment for a mental health issue from a mental health professional?_1']

### Train Test Split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

### Grid-search: Random Forests

In [24]:
rf = RandomForestClassifier(random_state=42)

rf_params = {
    'n_estimators': [150, 200, 250],
    'max_depth': [6, 7, 8]
}

gs = GridSearchCV(rf, rf_params, cv = 5)

# model fit
gs.fit(X_train, y_train)

### Scores

In [25]:
# best score
print(f"Best score: {gs.best_score_}")

# best params
print(f"Best params: {gs.best_params_}")

Best score: 0.6359660943273202
Best params: {'max_depth': 8, 'n_estimators': 250}


In [26]:
#training score
gs.score(X_train, y_train)

0.8361266294227188

In [27]:
#testing score
gs.score(X_test, y_test)

0.724233983286908