# Importing libraries

In [84]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import warnings
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')

# Loading .csv

In [50]:
who = pd.read_csv('who')

In [51]:
who.columns

Index(['Unnamed: 0', 'country', 'status', 'life expectancy', 'infant deaths',
       'alcohol', 'percentage expenditure', 'hepatitis b', 'measles', 'bmi',
       'polio', 'total expenditure', 'hiv/aids', 'thinness 1-19 years',
       'income composition of resources', 'schooling'],
      dtype='object')

In [52]:
who.drop(['Unnamed: 0'], axis=1, inplace=True)

In [69]:
who['status'].value_counts()

1.0    160
0.0     32
Name: status, dtype: int64

In [53]:
X = who.drop(['status'], axis = 1)

In [54]:
y = who['status']

# Hot Encoding

In [55]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

In [56]:
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

# X y Split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [58]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Fitting the model

In [59]:
clf = RandomForestClassifier(max_depth=6,min_samples_leaf=20,max_features=None,n_estimators=100,
                             bootstrap=True,oob_score=True, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.9305555555555556
0.8541666666666666


# Cross validation

In [61]:
clf = RandomForestClassifier(max_depth=6,min_samples_leaf=20,max_features=None,n_estimators=100,
                             bootstrap=True,oob_score=True, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
cross_val_scores

array([0.96551724, 0.89655172, 0.96551724, 0.89655172, 0.78571429])

In [62]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (cross_val_scores.mean(), cross_val_scores.std()))

0.90 accuracy with a standard deviation of 0.07


# Upsampling using smote

In [73]:
smote = SMOTE()


In [74]:
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

1.0    160
0.0    160
Name: status, dtype: int64

# Y y Split Smote

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

# Fitting the model Smote

In [77]:
clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.975
0.95


# Cross validation Smote

In [79]:
clf = RandomForestClassifier(max_depth=3, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
cross_val_scores

array([0.9375    , 0.97916667, 0.91666667, 0.97916667, 0.97916667])

In [80]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (cross_val_scores.mean(), cross_val_scores.std()))

0.96 accuracy with a standard deviation of 0.03


# Feature Importance

In [81]:
clf.fit( X_train, y_train)

In [82]:
feature_names = X_train.columns
feature_names = list(feature_names)

In [83]:
df = pd.DataFrame(list(zip(feature_names, clf.feature_importances_)))
df.columns = ['columns_name', 'score_feature_importance']
df.sort_values(by=['score_feature_importance'], ascending = False)

Unnamed: 0,columns_name,score_feature_importance
0,life expectancy,0.139324
11,income composition of resources,0.108580
2,alcohol,0.092470
12,schooling,0.091815
10,thinness 1-19 years,0.082444
...,...,...
82,69,0.000000
81,68,0.000000
80,67,0.000000
79,66,0.000000


# Hyper Parameter Tuning: Grid Search (Feat. Import., Smote)

In [92]:
X2 = who[['life expectancy', 'income composition of resources', 'alcohol', 'schooling']].copy()

In [93]:
y2 = who['status']

In [94]:
X2_sm, y2_sm = smote.fit_resample(X2, y2)
y2_sm.value_counts()

1.0    160
0.0    160
Name: status, dtype: int64

In [97]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2_sm, y2_sm, test_size=0.20, random_state=0)

In [98]:
X2_train = pd.DataFrame(X2_train)
X2_test = pd.DataFrame(X2_test)

In [99]:
param_grid = {
    'n_estimators': [50, 100,500],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt']
    ##'max_samples' : ['None', 0.5],
    ##'max_depth':[3,5,10],
    ## 'bootstrap':[True,False] 
    }
clf = RandomForestClassifier(random_state=100)

In [100]:
grid_search = GridSearchCV(clf, param_grid, cv=5,return_train_score=True,n_jobs=-1,)

In [101]:
grid_search.fit(X2_train,y2_train)

In [102]:
grid_search.best_params_

{'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 500}

In [103]:
# With Random Forest (incl. smote, grid) only using features 
# 'life expectancy', 'income composition of resources', 'alcohol', 'schooling'
# one can predict whether a country is 
# developing or not with 96% certainty 

clf = RandomForestClassifier(random_state=0, max_features='sqrt', 
                             min_samples_leaf=1, min_samples_split=2, n_estimators=500)
cross_val_scores2 = cross_val_score(clf, X2_train, y2_train, cv=10)
print(np.mean(cross_val_scores2))

0.9612307692307691
