# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("data_for_visualization.csv") # loading data
data.head()

Unnamed: 0,self_employed,n_employees,tech_company,tech_job,employer_MH_benefits,knows_coverage,formal_discussion,learning_resources,anonymity_protected,ask_MH_leave,...,overall_support,followup_talk,age,gender,country_live,us_state_live,country_work,us_state_work,race_complete,historic_disorder
0,0,6-25,1.0,1.0,I don't know,No,I don't know,No,Yes,Difficult,...,1.0,1.0,Millennials,Male,United States of America,Missouri,United States of America,Missouri,White,Yes
1,0,100-500,1.0,0.0,Yes,No,No,I don't know,Yes,Somewhat easy,...,2.0,0.0,Millennials,Male,United States of America,California,United States of America,California,White,No
2,0,6-25,1.0,1.0,Yes,Yes,No,No,Yes,Very easy,...,2.0,1.0,Millennials,Female,United States of America,Washington,United States of America,Washington,Asian,Yes
3,0,26-100,1.0,1.0,Yes,No,No,No,I don't know,Somewhat easy,...,2.0,1.0,Gen X,Female,United States of America,Georgia,United States of America,Georgia,White,No
4,0,100-500,0.0,1.0,Yes,Yes,No,No,I don't know,I don't know,...,1.0,0.0,Gen X,Male,United States of America,Pennsylvania,United States of America,Pennsylvania,White,Yes


In [3]:
data = data.drop(["country_live", "country_work"], axis = 1) 

# dropping these two columns since they won't be useful for our analysis

In [5]:
# too few entries for certain classes makes it difficult for models to predict

data['overall_support'] = data['overall_support'].astype(str)

data['overall_support'] = data['overall_support'].replace(['1.0','2.0'], 'Unsupportive')
data['overall_support'] = data['overall_support'].replace(['4.0','5.0'], 'Supportive')
data['overall_support'] = data['overall_support'].replace(['3.0'], 'Neutral')

In [6]:
data.overall_support.value_counts() # sanity check

Neutral         218
Unsupportive    213
Supportive       77
Name: overall_support, dtype: int64

In [7]:
data.overall_support.info()

<class 'pandas.core.series.Series'>
RangeIndex: 508 entries, 0 to 507
Series name: overall_support
Non-Null Count  Dtype 
--------------  ----- 
508 non-null    object
dtypes: object(1)
memory usage: 4.1+ KB


## X/y Split

In [8]:
# separation of X (explanatory variables) & y (target variable)

X = data.drop("overall_support", axis = 1)
y = data["overall_support"]

## Continuous Variables

In [9]:
X_num = X.select_dtypes(include = 'number') 
X_num

Unnamed: 0,self_employed,tech_company,tech_job,ever_discuss_employer,ever_discuss_coworker,coworker_discuss_coworker,importance_employer_PH,importance_employer_MH,previous_employers,previous_tech_company,...,adjustment_disorder,dissociative_disorder,substance_use_disorders,addictive_disorders,other_disorders,sought_treatment,share_with_family,identified_MH,expected_reaction,followup_talk
0,0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1,1.0,...,0,0,1,0,0,1,5,1.0,5.0,1.0
1,0,1.0,0.0,0.0,0.0,0.0,9.0,5.0,1,1.0,...,0,0,0,0,0,0,5,0.0,4.0,0.0
2,0,1.0,1.0,0.0,1.0,1.0,10.0,10.0,1,1.0,...,0,0,0,0,0,1,8,1.0,5.0,1.0
3,0,1.0,1.0,0.0,0.0,1.0,10.0,8.0,1,1.0,...,0,0,0,0,0,1,3,0.0,7.0,1.0
4,0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1,1.0,...,0,0,0,0,0,1,2,0.0,2.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,0,1.0,1.0,0.0,0.0,1.0,0.0,2.0,1,1.0,...,0,0,0,0,0,0,5,0.0,5.0,0.0
504,0,0.0,1.0,1.0,1.0,1.0,9.0,0.0,1,1.0,...,0,0,0,0,0,1,0,0.0,6.0,1.0
505,0,1.0,1.0,1.0,1.0,1.0,6.0,5.0,1,1.0,...,0,0,0,0,0,1,10,0.0,10.0,0.0
506,0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1,0.0,...,0,0,0,0,0,0,7,0.0,5.0,0.0


In [10]:
# we do notice that some features are been read as float, we'll turn them to int next

X_num.tech_company.dtypes

dtype('float64')

In [11]:
X_num = X_num.astype(int) # turning all continuous variables into int

# iterating through the continuous variables to see value counts and their percentage per feature

for feature in X_num:
    print('\033[1m', feature)
    print('\033[0m', X_num[feature].value_counts(dropna=False, normalize=True))

[1m self_employed
[0m 0    1.0
Name: self_employed, dtype: float64
[1m tech_company
[0m 1    0.748031
0    0.251969
Name: tech_company, dtype: float64
[1m tech_job
[0m 1    0.929134
0    0.070866
Name: tech_job, dtype: float64
[1m ever_discuss_employer
[0m 0    0.647638
1    0.352362
Name: ever_discuss_employer, dtype: float64
[1m ever_discuss_coworker
[0m 1    0.507874
0    0.492126
Name: ever_discuss_coworker, dtype: float64
[1m coworker_discuss_coworker
[0m 1    0.568898
0    0.431102
Name: coworker_discuss_coworker, dtype: float64
[1m importance_employer_PH
[0m 7     0.222441
5     0.188976
8     0.163386
6     0.129921
9     0.078740
10    0.072835
3     0.043307
4     0.031496
0     0.027559
2     0.027559
1     0.013780
Name: importance_employer_PH, dtype: float64
[1m importance_employer_MH
[0m 5     0.238189
7     0.147638
8     0.108268
6     0.106299
3     0.102362
4     0.084646
2     0.059055
0     0.057087
9     0.037402
1     0.031496
10    0.027559
Name: 

In [12]:
X_num = X_num.drop("self_employed", axis = 1) # dropping columns since 100% of it is 0s

In [13]:
X_num # sanity check

Unnamed: 0,tech_company,tech_job,ever_discuss_employer,ever_discuss_coworker,coworker_discuss_coworker,importance_employer_PH,importance_employer_MH,previous_employers,previous_tech_company,ever_discuss_previous_employer,...,adjustment_disorder,dissociative_disorder,substance_use_disorders,addictive_disorders,other_disorders,sought_treatment,share_with_family,identified_MH,expected_reaction,followup_talk
0,1,1,1,1,0,0,1,1,1,0,...,0,0,1,0,0,1,5,1,5,1
1,1,0,0,0,0,9,5,1,1,0,...,0,0,0,0,0,0,5,0,4,0
2,1,1,0,1,1,10,10,1,1,1,...,0,0,0,0,0,1,8,1,5,1
3,1,1,0,0,1,10,8,1,1,0,...,0,0,0,0,0,1,3,0,7,1
4,0,1,1,1,1,1,0,1,1,0,...,0,0,0,0,0,1,2,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1,1,0,0,1,0,2,1,1,0,...,0,0,0,0,0,0,5,0,5,0
504,0,1,1,1,1,9,0,1,1,1,...,0,0,0,0,0,1,0,0,6,1
505,1,1,1,1,1,6,5,1,1,0,...,0,0,0,0,0,1,10,0,10,0
506,1,1,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,7,0,5,0


## Categorical Variables

In [14]:
X_cat = X.select_dtypes(exclude='number') # creating slice with all categorical 
X_cat

Unnamed: 0,n_employees,employer_MH_benefits,knows_coverage,formal_discussion,learning_resources,anonymity_protected,ask_MH_leave,talk_about_health,discuss_MH_supervisor,discuss_MH_coworker,...,PH_during_interview,MH_during_interview,observed_negative_reaction,observed_positive_reaction,age,gender,us_state_live,us_state_work,race_complete,historic_disorder
0,6-25,I don't know,No,I don't know,No,Yes,Difficult,Same level of comfort for each,Yes,Maybe,...,Maybe,No,"Yes, I experienced","Yes, I experienced",Millennials,Male,Missouri,Missouri,White,Yes
1,100-500,Yes,No,No,I don't know,Yes,Somewhat easy,Physical health,Maybe,Maybe,...,Maybe,No,"Yes, I observed","Yes, I observed",Millennials,Male,California,California,White,No
2,6-25,Yes,Yes,No,No,Yes,Very easy,Same level of comfort for each,Yes,No,...,No,No,No,"Yes, I observed",Millennials,Female,Washington,Washington,Asian,Yes
3,26-100,Yes,No,No,No,I don't know,Somewhat easy,Physical health,Yes,Maybe,...,No,No,No,"Yes, I observed",Gen X,Female,Georgia,Georgia,White,No
4,100-500,Yes,Yes,No,No,I don't know,I don't know,Physical health,No,No,...,No,No,"Yes, I experienced",No,Gen X,Male,Pennsylvania,Pennsylvania,White,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,26-100,Yes,Yes,Yes,Yes,Yes,Somewhat difficult,Physical health,Maybe,Maybe,...,No,No,No,No,Millennials,Male,Illinois,Illinois,White,No
504,26-100,No,No,No,No,I don't know,Difficult,Same level of comfort for each,Yes,Yes,...,No,Maybe,Maybe/Not sure,Maybe/Not sure,Millennials,Female,Minnesota,Minnesota,White,Yes
505,6-25,Yes,No,No,No,Yes,Somewhat easy,Same level of comfort for each,Yes,Yes,...,Yes,Maybe,Maybe/Not sure,Maybe/Not sure,Millennials,Male,Tennessee,Tennessee,White,Yes
506,6-25,Yes,Yes,I don't know,Yes,I don't know,Somewhat easy,Same level of comfort for each,Yes,Yes,...,Yes,Yes,Maybe/Not sure,Maybe/Not sure,Millennials,Male,Tennessee,Tennessee,White,Possibly


In [15]:
# iterating through the continuous variables to see value counts and their percentage per feature

for feature in X_cat:
    print('\033[1m', feature)
    print('\033[0m', X_cat[feature].value_counts(dropna=False, normalize=True))

[1m n_employees
[0m 100-500           0.334646
More than 1000    0.261811
26-100            0.165354
6-25              0.127953
500-1000          0.098425
1-5               0.011811
Name: n_employees, dtype: float64
[1m employer_MH_benefits
[0m Yes                               0.742126
I don't know                      0.216535
No                                0.029528
Not eligible for coverage / NA    0.011811
Name: employer_MH_benefits, dtype: float64
[1m knows_coverage
[0m Yes    0.547244
No     0.452756
Name: knows_coverage, dtype: float64
[1m formal_discussion
[0m No              0.590551
Yes             0.291339
I don't know    0.118110
Name: formal_discussion, dtype: float64
[1m learning_resources
[0m Yes             0.374016
No              0.332677
I don't know    0.293307
Name: learning_resources, dtype: float64
[1m anonymity_protected
[0m I don't know    0.553150
Yes             0.411417
No              0.035433
Name: anonymity_protected, dtype: float64
[1m as

In [16]:
X_cat.info() # checking dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 508 entries, 0 to 507
Data columns (total 33 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   n_employees                     508 non-null    object
 1   employer_MH_benefits            508 non-null    object
 2   knows_coverage                  508 non-null    object
 3   formal_discussion               508 non-null    object
 4   learning_resources              508 non-null    object
 5   anonymity_protected             508 non-null    object
 6   ask_MH_leave                    508 non-null    object
 7   talk_about_health               508 non-null    object
 8   discuss_MH_supervisor           508 non-null    object
 9   discuss_MH_coworker             508 non-null    object
 10  previous_employer_MH_benefits   508 non-null    object
 11  previous_aware_coverage         508 non-null    object
 12  previous_formal_discussion      508 non-null    ob

In [17]:
from sklearn.preprocessing import OneHotEncoder

**Note:** Encoding was particularly necessarily for the models that I've used in my initial notebook. Most (i.e. SVM) could not handled not encoded data.

In [18]:
# function to encode categorical variable from different df and create the final X

def encoding_categoricals(X_categorical, X_numerical):
    encoder = OneHotEncoder(handle_unknown='error', drop='first')
    
    encoder.fit(X_categorical)
    
    encoded = encoder.transform(X_categorical).toarray() # extracting the encoded array from the encoder
    
    cat_one_hot_encoder = pd.DataFrame(encoded) # transforming the numpy array to a Pandas dataframe
    
    cat_one_hot_encoder.columns = encoder.get_feature_names_out() # adding column names to the dataframe
    
    X_encoded = pd.concat([X_numerical, cat_one_hot_encoder], axis=1) # concatenating X_num and X_cat
    
    return X_encoded

In [19]:
encoding_categoricals(X_cat, X_num)

Unnamed: 0,tech_company,tech_job,ever_discuss_employer,ever_discuss_coworker,coworker_discuss_coworker,importance_employer_PH,importance_employer_MH,previous_employers,previous_tech_company,ever_discuss_previous_employer,...,race_complete_Black or African American,race_complete_Hispanic/Latino,race_complete_I prefer not to answer,race_complete_Jewish,race_complete_Mixed,race_complete_Native American,race_complete_White,historic_disorder_No,historic_disorder_Possibly,historic_disorder_Yes
0,1,1,1,1,0,0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,0,0,0,0,9,5,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,1,0,1,1,10,10,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,0,0,1,10,8,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0,1,1,1,1,1,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1,1,0,0,1,0,2,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
504,0,1,1,1,1,9,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
505,1,1,1,1,1,6,5,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
506,1,1,1,1,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [20]:
# finalizing the encoded X

X = encoding_categoricals(X_cat, X_num)

In [23]:
X # sanity check 

Unnamed: 0,tech_company,tech_job,ever_discuss_employer,ever_discuss_coworker,coworker_discuss_coworker,importance_employer_PH,importance_employer_MH,previous_employers,previous_tech_company,ever_discuss_previous_employer,...,race_complete_Black or African American,race_complete_Hispanic/Latino,race_complete_I prefer not to answer,race_complete_Jewish,race_complete_Mixed,race_complete_Native American,race_complete_White,historic_disorder_No,historic_disorder_Possibly,historic_disorder_Yes
0,1,1,1,1,0,0,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,0,0,0,0,9,5,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,1,1,0,1,1,10,10,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1,1,0,0,1,10,8,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0,1,1,1,1,1,0,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
503,1,1,0,0,1,0,2,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
504,0,1,1,1,1,9,0,1,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
505,1,1,1,1,1,6,5,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
506,1,1,1,1,1,0,0,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


## 'y' Label Encoder / Ordinal Encoding

In [24]:
y.value_counts() # sanity check

Neutral         218
Unsupportive    213
Supportive       77
Name: overall_support, dtype: int64

In [25]:
y = y.replace(['Unsupportive'], '0')
y = y.replace(['Neutral'], '1')
y = y.replace(['Supportive'], '2')

## Train/Test/Val Split

In [214]:
from sklearn.model_selection import train_test_split # importing traintest split

X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size = 0.2, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size = 0.25)

In [213]:
y_val

140    2
419    0
188    1
325    2
0      0
      ..
458    0
314    1
407    2
226    2
374    0
Name: overall_support, Length: 102, dtype: object

## Scaling

In [112]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures, RobustScaler

In [113]:
#defining different scalers

scaler1 = StandardScaler()
scaler2 = MinMaxScaler()
scaler3 = PolynomialFeatures(degree=2, interaction_only=True)
scaler4 = RobustScaler()

# Modeling

###  Univariate Selection

Statistical tests can be used to select those features that have the strongest relationship with the output variable.

A chi-square test is used in statistics to test the independence of two events. In feature selection, we aim to select the features which are highly dependent on the response.

In [202]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# apply SelectKBest class to extract top 10 best features

bestfeatures = SelectKBest(score_func=chi2, k=25)
fit = bestfeatures.fit(X_train, y_train)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X_train.columns)

# concat two dataframes for better visualization 

featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Features','Score']  # naming the dataframe columns
print(featureScores.nlargest(25,'Score'))  # print 25 best features

                                              Features      Score
42                                   expected_reaction  24.339341
13                     importance_previous_employer_MH  23.867225
6                               importance_employer_MH  22.730866
107                            observed_neg_impact_Yes  14.207921
67                           discuss_MH_supervisor_Yes  12.678204
66                            discuss_MH_supervisor_No  11.213903
68                              discuss_MH_coworker_No  10.027126
76   previous_aware_coverage_Yes, I was aware of al...   9.924002
81       previous_learning_resources_Yes, they all did   8.965674
79        previous_formal_discussion_Yes, they all did   6.922050
99                   work_interference_treatment_Often   6.395898
116      observed_positive_reaction_Yes, I experienced   6.375139
33                                                ptsd   6.302992
41                                       identified_MH   6.028780
113      o

In [None]:
# this is just to give us a general idea of potentially important features.

## Random Forest (Baseline / Feature Importance)

### Classification

In [61]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

### Model Function

In [None]:
def model_application_and_evaluation(scaler, classifier, X_train, X_test, y_train, y_test): 
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    classifier.fit(X_train, y_train)
    print("Train", classification_report(y_train, classifier.predict(X_train)))
    print("Val/Test", classification_report(y_test, classifier.predict(X_test)))

In [217]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler1, rf_vanilla, X_train, X_test, y_train, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       128
           2       1.00      1.00      1.00        46

    accuracy                           1.00       304
   macro avg       1.00      1.00      1.00       304
weighted avg       1.00      1.00      1.00       304

Test               precision    recall  f1-score   support

           0       0.55      0.63      0.59        43
           1       0.53      0.64      0.58        44
           2       0.00      0.00      0.00        15

    accuracy                           0.54       102
   macro avg       0.36      0.42      0.39       102
weighted avg       0.46      0.54      0.50       102



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# the model is performing poorly but more importantly there's a noticable imbalance between the classes.

# Addressing Imbalance

In [68]:
y_train

# sanity check

347    1
137    1
486    2
451    1
411    1
      ..
106    1
270    1
348    1
435    1
102    1
Name: overall_support, Length: 406, dtype: object

In [69]:
y.value_counts() # imbalance confirmed

1    218
0    213
2     77
Name: overall_support, dtype: int64

## Oversampling

In [65]:
# balancing techniques
from imblearn.over_sampling import SMOTE

sm = SMOTE(random_state = 2)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

y_train_smote.value_counts()

# this produces the folowing errors: Expected n_neighbors <= n_samples,  but n_samples = 3, n_neighbors = 6

2    130
1    130
0    130
Name: overall_support, dtype: int64

## Random Forest with SMOTE

In [194]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler1, rf_vanilla, X_train_smote, X_test, y_train_smote, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.52      0.53      0.53        43
           1       0.47      0.57      0.52        44
           2       0.40      0.13      0.20        15

    accuracy                           0.49       102
   macro avg       0.46      0.41      0.41       102
weighted avg       0.48      0.49      0.47       102



In [195]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler1, rf_vanilla, X_train_smote, X_test, y_train_smote, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.52      0.60      0.56        43
           1       0.50      0.57      0.53        44
           2       0.50      0.07      0.12        15

    accuracy                           0.51       102
   macro avg       0.51      0.41      0.40       102
weighted avg       0.51      0.51      0.48       102



## RandomOversampler

In [86]:
# RandomOverSampler is over-sampling by duplicating some of the original samples of the minority class
#SMOTE and ADASYN generate new samples in by interpolation 
#the samples used to interpolate/generate new synthetic samples differ.

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)

X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [87]:
y_train_ros.value_counts()

0    130
2    130
1    130
Name: overall_support, dtype: int64

## Random Forest with RandomOversampler

In [188]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler1, rf_vanilla, X_train_ros, X_test, y_train_ros, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.49      0.56      0.52        43
           1       0.44      0.50      0.47        44
           2       0.33      0.07      0.11        15

    accuracy                           0.46       102
   macro avg       0.42      0.37      0.37       102
weighted avg       0.45      0.46      0.44       102



In [189]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler2, rf_vanilla, X_train_ros, X_test, y_train_ros, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.53      0.58      0.56        43
           1       0.46      0.55      0.50        44
           2       0.33      0.07      0.11        15

    accuracy                           0.49       102
   macro avg       0.44      0.40      0.39       102
weighted avg       0.47      0.49      0.47       102



In [299]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler3, rf_vanilla, X_train_ros, X_val, y_train_ros, y_val)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.75      0.82      0.79        40
           1       0.78      0.78      0.78        46
           2       0.92      0.69      0.79        16

    accuracy                           0.78       102
   macro avg       0.82      0.77      0.78       102
weighted avg       0.79      0.78      0.78       102



In [191]:
rf_vanilla = RandomForestClassifier()

model_application_and_evaluation(scaler4, rf_vanilla, X_train_ros, X_test, y_train_ros, y_test)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.52      0.63      0.57        43
           1       0.47      0.50      0.48        44
           2       0.33      0.07      0.11        15

    accuracy                           0.49       102
   macro avg       0.44      0.40      0.39       102
weighted avg       0.47      0.49      0.46       102



**Comment:** The model that actually produces results I can work with is the one with the Polynomial Scaler and the RandomOversampler. I'm using this model next for hyperparameter tuning.

# Hypermeter Tuning

In [140]:
param_grid = { 
    'n_estimators': [200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8,9,10],
    'criterion' :['gini', 'entropy', 'log_loss']
}

In [141]:
%%time

CV_rfc = GridSearchCV(estimator=rf_vanilla, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train_ros, y_train_ros)

CPU times: total: 4min 32s
Wall time: 5min 30s


In [142]:
CV_rfc.best_params_

{'criterion': 'gini',
 'max_depth': 10,
 'max_features': 'sqrt',
 'n_estimators': 400}

In [143]:
rf_optimized = RandomForestClassifier(max_features='sqrt', n_estimators= 400, max_depth= 10, criterion='gini')

In [145]:
# applying function again

model_application_and_evaluation(scaler3, rf_optimized, X_train_ros, X_val, y_train_ros, y_val)

Train               precision    recall  f1-score   support

           0       1.00      1.00      1.00       130
           1       1.00      1.00      1.00       130
           2       1.00      1.00      1.00       130

    accuracy                           1.00       390
   macro avg       1.00      1.00      1.00       390
weighted avg       1.00      1.00      1.00       390

Test               precision    recall  f1-score   support

           0       0.94      0.85      0.89        40
           1       0.83      0.96      0.89        47
           2       0.83      0.67      0.74        15

    accuracy                           0.87       102
   macro avg       0.87      0.82      0.84       102
weighted avg       0.88      0.87      0.87       102



# Cross Validation

In [312]:
%%time

# import k-folder
from sklearn.model_selection import cross_val_score

# use the same model as before
rf_optimized = RandomForestClassifier(max_features='sqrt', n_estimators= 400, max_depth= 10, criterion='gini')

# X,y will automatically devided by 5 folder, the scoring I will still use the accuracy
scores = cross_val_score(rf_optimized, X_train_ros, y_train_ros, cv=20, scoring='precision_macro')

# print all 5 times scores 
print(scores)

# then I will do the average about these five scores to get more accuracy score.
print(scores.mean())

[0.79365079 0.61481481 0.69047619 0.80238095 0.66666667 0.73809524
 0.69047619 0.76666667 0.78333333 0.71851852 0.55555556 0.72380952
 0.67592593 0.64351852 0.72380952 0.72380952 0.52777778 0.80092593
 0.7797619  0.77777778]
0.7098875661375661
CPU times: total: 7.52 s
Wall time: 9.81 s


## Confidence Intervals

In [300]:
import math as m
from scipy import stats

In [302]:
# confidence intervals for precision / class 0

n = 40 
p = 0.94
t = stats.t.ppf(0.975, df=n-1) # alpha = 0.05, so confidence level = 0.975 since confidence level = 1-alpha+alpha/2
error = t*m.sqrt( (p*(1-p)/n))
confidence_interval_class_0 = [p - error, p + error]
confidence_interval_class_0

[0.8640480235976434, 1.0159519764023563]

In [303]:
# confidence intervals for precision / class 1

n = 47
p = 0.83
t = stats.t.ppf(0.975, df=n-1) # alpha = 0.05, so confidence level = 0.975 since confidence level = 1-alpha+alpha/2
error = t*m.sqrt( (p*(1-p)/n))
confidence_interval_class_1 = [p - error, p + error]
confidence_interval_class_1

[0.7197100785450627, 0.9402899214549372]

In [304]:
# confidence intervals for precision / class 2

n = 15 
p = 0.83
t = stats.t.ppf(0.975, df=n-1) # alpha = 0.05, so confidence level = 0.975 since confidence level = 1-alpha+alpha/2
error = t*m.sqrt( (p*(1-p)/n))
confidence_interval_class_2 = [p - error, p + error]
confidence_interval_class_2

[0.6219814892750286, 1.0380185107249713]

**Comment 1:** Other approaches I tried, not included in this notebook:

   1. SVM + Oversampling (SMOTE, RandomOversampler)  & Undersampling Techniques (TomekLinks)
   2. KNN + Oversampling (SMOTE, RandomOversampler)  & Undersampling Techniques (TomekLinks)
   3. Ada Boost + Oversampling (SMOTE, RandomOversampler)  & Undersampling Techniques (TomekLinks)
   4. Feature selection: Importance features based on the Random Forest above. Re-running all aforementioned features did not yield better scores.
   
The code for all aformentioned models is not included in this notebook as the results it yielded were inferior to the Random Forests above. If you want to access these model reach out to the author (see readme.txt)

**Comment 2:** After some consideration, the author of this notebook has decided to continue working on this project by:
   1. building a Neural Network architecture that address the issues of overfitting/variance.
   2. Adding data from more years. Due to time restrictions only data from the 2017 - 2018 annual surveys were used. That could address the use of having low amount of data points for certain classes. 
   3. Use additional strategies to address nan values.