In [297]:
#import dependencies
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

In [302]:
class RandomForest:
    """Random-forest classification workflow for a categorical survey CSV.

    Construction loads the CSV, label-encodes every column (the target
    included), and splits the encoded rows into train and test partitions.
    The remaining methods fit and score a forest, list its feature
    importances, and repeat the fit on PCA-reduced features.

    Attributes set by ``__init__``:
        data: the frame exactly as read from the CSV (never mutated).
        columns: column index of the loaded frame.
        updated: label-encoded copy of ``data``.
        feature_labels: names of all columns except the target.
        features, target: encoded feature matrix and target vector.
        train_features, test_features, train_target, test_target: the split.
    """

    # Shared by the plain and the PCA workflow so both stay comparable.
    TEST_SIZE = 0.25
    RANDOM_STATE = 42
    N_ESTIMATORS = 10

    def __init__(self, csv, target_column="mental_vs_physical"):
        """Load ``csv``, encode it, and prepare the train/test split.

        Args:
            csv: path or file-like object accepted by ``pandas.read_csv``.
            target_column: name of the column to predict; every other
                column is used as a feature.

        Raises:
            KeyError: if ``target_column`` is not a column of the CSV.
        """
        raw = pd.read_csv(csv)
        self.data = raw
        self.columns = raw.columns

        # Encode into a copy: the original code read from an undefined
        # global named `data` and overwrote the loaded frame in place, so it
        # failed on a fresh kernel and lost the raw values.
        encoded = raw.copy()
        for column in self.columns:
            encoded[column] = preprocessing.LabelEncoder().fit_transform(raw[column])
        self.updated = encoded

        feature_frame = encoded.loc[:, encoded.columns != target_column]
        self.feature_labels = list(feature_frame)
        self.features = np.array(feature_frame)
        self.target = np.array(encoded[target_column])

        # Use sklearn to create train and test data
        (self.train_features, self.test_features,
         self.train_target, self.test_target) = train_test_split(
            self.features, self.target,
            test_size=self.TEST_SIZE, random_state=self.RANDOM_STATE)

    def predict(self):
        """Fit a forest on the training split and print its test accuracy.

        Stores the fitted estimator in ``self.model`` and the accuracy on
        the test split in ``self.score``.
        """
        model = RandomForestClassifier(n_estimators=self.N_ESTIMATORS,
                                       random_state=self.RANDOM_STATE)
        self.model = model.fit(self.train_features, self.train_target)
        self.score = model.score(self.test_features, self.test_target)
        print('Summary Results')
        print("------------------------------------------------")
        print(f'The accuracy score is of the data: {self.score}')

    def importance(self):
        """Print each feature with its importance, highest first.

        Reads ``self.model``, so ``predict()`` must have been called first.
        Importances are rounded to two decimals before sorting.
        """
        ranked = sorted(
            ((label, round(weight, 2))
             for label, weight in zip(self.feature_labels,
                                      self.model.feature_importances_)),
            key=lambda pair: pair[1],
            reverse=True,
        )
        for pair in ranked:
            print('Variable: {:20} Importance: {}'.format(*pair))

    def pca(self, n_components=4):
        """Reduce the features with PCA, then fit and score a forest on them.

        For every principal component, prints the original feature with the
        largest absolute loading. Stores the reducer, the reduced data, the
        PCA train/test split, the fitted model and its score on ``self``
        (``reducer``, ``pca_data``, ``pca_*_features``, ``pca_*_target``,
        ``pca_model``, ``pca_score``).

        Args:
            n_components: number of principal components to keep.
        """
        self.reducer = PCA(n_components=n_components)
        self.pca_data = self.reducer.fit_transform(self.features)
        print(f'The shape of the PCA Data is: {self.pca_data.shape}')

        # Number the components with enumerate: looking the position up via
        # list.index() reported the wrong component whenever two components
        # shared the same dominant feature.
        for number, loadings in enumerate(self.reducer.components_, start=1):
            dominant = int(np.argmax(np.abs(loadings)))
            print(f'For component {number}, the feature is: {self.feature_labels[dominant]}')

        (self.pca_train_features, self.pca_test_features,
         self.pca_train_target, self.pca_test_target) = train_test_split(
            self.pca_data, self.target,
            test_size=self.TEST_SIZE, random_state=self.RANDOM_STATE)

        model = RandomForestClassifier(n_estimators=self.N_ESTIMATORS,
                                       random_state=self.RANDOM_STATE)
        self.pca_model = model.fit(self.pca_train_features, self.pca_train_target)
        self.pca_score = model.score(self.pca_test_features, self.pca_test_target)
        print(" ")
        print(f'The accuracy score for the data reduced with PCA: {self.pca_score}')
        importances = list(self.pca_model.feature_importances_)
        print(" ")
        print(f'The importances for the PCA data: {importances}')
    

            

# Driver: run the full workflow on the survey file and print each stage,
# separated by dashed rules.
if __name__ == '__main__': 
    # Loads and encodes the CSV and builds the train/test split.
    randomforest = RandomForest("clean_survey.csv")
    # Fits the forest and prints its accuracy; must run before importance(),
    # which reads the fitted model stored by predict().
    randomforest.predict()
    print("--------------------------------------------")
    # Per-feature importances of the fitted forest, highest first.
    randomforest.importance()
    print("---------------------------------------------")
    # Refits on PCA-reduced features and prints the resulting accuracy.
    randomforest.pca()
  


        
  
    

Summary Results
------------------------------------------------
The accuracy score is of the data: 0.5454545454545454
--------------------------------------------
Variable: Country              Importance: 0.08
Variable: no_employees         Importance: 0.08
Variable: leave                Importance: 0.08
Variable: mental_health_consequence Importance: 0.08
Variable: supervisor           Importance: 0.06
Variable: care_options         Importance: 0.05
Variable: anonymity            Importance: 0.05
Variable: coworkers            Importance: 0.05
Variable: phys_health_interview Importance: 0.05
Variable: Age_bin              Importance: 0.04
Variable: family_history       Importance: 0.04
Variable: benefits             Importance: 0.04
Variable: wellness_program     Importance: 0.04
Variable: seek_help            Importance: 0.04
Variable: phys_health_consequence Importance: 0.04
Variable: Gender               Importance: 0.03
Variable: treatment            Importance: 0.03
Variable: r