# Lab | Random Forests

For this lab, you will be using the CSV files provided in the files_for_lab folder.

Instructions
* Apply the Random Forests algorithm, but this time only by upsampling the data using SMOTE.
* Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.colors as colors
from pandas.plotting import scatter_matrix
import seaborn as sns
from IPython.display import set_matplotlib_formats, HTML
from matplotlib.dates import DateFormatter
import matplotlib_inline 
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
from matplotlib import colors as mcolors
from pandas.plotting import register_matplotlib_converters
import plotly.express as px
%matplotlib inline
%config InlineBackend.figure_format = 'png'
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore') 

In [2]:
# Plot formatting: notebook-wide default styles
def set_sns_format(width=14, height=8):
    """Apply the default plotting style used throughout the notebook.

    Sets the seaborn theme (pastel palette, notebook context, 300 dpi for
    saved figures), switches inline figure output to the 'retina' format and
    makes ``(width, height)`` the default matplotlib figure size.

    Returns None.
    """
    theme_rc = {'savefig.dpi': 300}
    sns.set_theme(palette='pastel', context='notebook', rc=theme_rc)
    matplotlib_inline.backend_inline.set_matplotlib_formats('retina')
    # Set after set_theme so the requested size is the one left in rcParams.
    matplotlib.rcParams['figure.figsize'] = (width, height)
set_sns_format(14, 8)

In [3]:
def add_value_labels(ax, typ, spacing=5):
    """Annotate every bar or line point on ``ax`` with its rounded value.

    Parameters
    ----------
    ax : object exposing ``patches``, ``lines`` and ``annotate``
        The axes to label (typically a matplotlib Axes).
    typ : str
        ``'bar'`` labels each entry of ``ax.patches`` at the horizontal centre
        of the patch, at the patch height. ``'line'`` labels every (x, y)
        point of each entry in ``ax.lines``. Any other value leaves the axes
        untouched.
    spacing : int, default 5
        Distance, in points, between the data point and its label.

    Returns
    -------
    None

    Notes
    -----
    Labels are formatted with ``"{:.0f}"``. Non-negative values are labelled
    above the point; negative values are labelled below it so the text does
    not overlap the bar or marker.
    """

    def _annotate(x_value, y_value):
        # Flip offset and vertical alignment for negative values so the label
        # sits outside the bar/marker instead of on top of it.
        below = y_value < 0
        ax.annotate(
            "{:.0f}".format(y_value),
            (x_value, y_value),
            xytext=(0, -spacing if below else spacing),
            textcoords="offset points",
            ha='center',
            va='top' if below else 'bottom',
            fontsize=10,
        )

    if typ == 'bar':
        for patch in ax.patches:
            _annotate(patch.get_x() + patch.get_width() / 2, patch.get_height())
    elif typ == 'line':
        for line in ax.lines:
            for x_value, y_value in zip(line.get_xdata(), line.get_ydata()):
                _annotate(x_value, y_value)

In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from imblearn.over_sampling import SMOTE

In [5]:
# Read the three lab inputs: numerical features, categorical features, targets.
numerical, categorical, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        "files_for_lab/numerical.csv",
        "files_for_lab/categorical.csv",
        "files_for_lab/target.csv",
    )
)

In [6]:
# Work on an independent copy of the object-dtype columns only, so the
# encoding below does not touch `categorical` until it is copied back.
obj_df = categorical.select_dtypes(include='object').copy()

In [7]:
# Frequency-encode every object column: each category is replaced by the
# fraction of all rows in which it occurs.
for col in obj_df:
    freq = obj_df[col].value_counts() / len(obj_df)
    # Vectorised lookup instead of a Python-level lambda per row. A missing
    # value (not counted by value_counts) maps to NaN rather than raising
    # KeyError, so check for NaNs afterwards if the data may contain them.
    obj_df[col] = obj_df[col].map(freq)

In [8]:
# Write the frequency-encoded columns back over the originals in `categorical`.
for encoded_name, encoded_values in obj_df.items():
    categorical[encoded_name] = encoded_values

In [9]:
# Place the numerical and the (now encoded) categorical features side by side.
df = pd.concat((numerical, categorical), axis="columns")

In [10]:
# Show the combined feature table built by the concat above.
df

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0,60.000000,5,9,0,0,39,34,18,10,...,37,12,92,8,94,2,95,12,89,11
1,1,46.000000,6,9,16,0,15,55,11,6,...,52,2,93,10,95,12,95,12,93,10
2,1,61.611649,3,1,2,0,20,29,33,6,...,0,2,91,11,92,7,95,12,90,1
3,0,70.000000,1,4,2,0,23,14,31,3,...,28,1,87,11,94,11,95,12,87,2
4,0,78.000000,3,2,60,1,28,9,53,26,...,20,1,93,10,96,1,96,1,79,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,0,2,96,2,96,2,96,2,96,2
95408,1,48.000000,7,9,1,0,31,43,19,4,...,50,1,96,3,96,3,96,3,96,3
95409,1,60.000000,5,9,0,0,18,46,20,7,...,38,1,96,3,95,1,96,10,94,10
95410,0,58.000000,7,9,0,0,28,35,20,9,...,40,5,90,11,96,8,97,1,86,12


# Classification

In [11]:
# Classification target: the first column of `target`
# (shown as TARGET_B when the series is displayed further down).
target_B = target.iloc[:,0]

In [12]:
# Show the feature table once more before it is scaled (unchanged since the concat).
df

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM
0,0,60.000000,5,9,0,0,39,34,18,10,...,37,12,92,8,94,2,95,12,89,11
1,1,46.000000,6,9,16,0,15,55,11,6,...,52,2,93,10,95,12,95,12,93,10
2,1,61.611649,3,1,2,0,20,29,33,6,...,0,2,91,11,92,7,95,12,90,1
3,0,70.000000,1,4,2,0,23,14,31,3,...,28,1,87,11,94,11,95,12,87,2
4,0,78.000000,3,2,60,1,28,9,53,26,...,20,1,93,10,96,1,96,1,79,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,...,0,2,96,2,96,2,96,2,96,2
95408,1,48.000000,7,9,1,0,31,43,19,4,...,50,1,96,3,96,3,96,3,96,3
95409,1,60.000000,5,9,0,0,18,46,20,7,...,38,1,96,3,95,1,96,10,94,10
95410,0,58.000000,7,9,0,0,28,35,20,9,...,40,5,90,11,96,8,97,1,86,12


In [13]:
# Show the classification labels.
target_B

0        0
1        0
2        0
3        0
4        0
        ..
95407    0
95408    0
95409    0
95410    1
95411    0
Name: TARGET_B, Length: 95412, dtype: int64

In [14]:
# Scaler used to standardise all features before splitting and resampling.
sc = StandardScaler()

In [15]:
# Standardise every feature column; the scaled result replaces `df`.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so the held-out rows contribute to the fitted
# statistics. Consider fitting on the training split only.
# NOTE(review): `df` is rebound to the scaler's output; the later cells wrap it
# in pd.DataFrame and show integer column labels, i.e. the column names are lost.
df = sc.fit_transform(df)

In [16]:
# Show the scaled features as a table (columns are labelled by position).
pd.DataFrame(df)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,327,328,329,330,331,332,333,334,335,336
0,-0.056847,-1.115292e-01,0.509147,0.717411,-0.356881,-0.206977,0.745798,0.284659,-0.826571,0.719013,...,0.370442,3.012879,-0.172698,0.419080,-0.201169,-1.236357,-0.789204,1.257673,-0.714736,1.237697
1,-0.055799,-1.080356e+00,1.097566,0.717411,1.362283,-0.206977,-1.346527,1.675602,-1.221851,-0.191801,...,1.049599,-0.290923,0.200433,0.932504,0.374914,1.266242,-0.789204,1.257673,0.530642,0.985761
2,-0.055799,1.267017e-10,-0.667692,-2.239375,-0.141985,-0.206977,-0.910626,-0.046518,0.020458,-0.191801,...,-1.304814,-0.290923,-0.545828,1.189216,-1.353334,0.014943,-0.789204,1.257673,-0.403392,-1.281661
3,-0.056847,5.804901e-01,-1.844530,-1.130581,-0.141985,-0.206977,-0.649086,-1.040049,-0.092480,-0.874911,...,-0.037053,-0.621303,-2.038350,1.189216,-0.201169,1.015982,-0.789204,1.257673,-1.337425,-1.029725
4,-0.056847,1.134106e+00,-0.667692,-1.869777,6.089983,-0.009563,-0.213185,-1.371225,1.149829,4.362269,...,-0.399270,-0.621303,0.200433,0.932504,0.950996,-1.486617,1.115522,-1.336135,-3.828181,-0.777790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,-0.055799,1.267017e-10,0.509147,0.717411,-0.356881,2.556819,0.484257,1.145719,-1.221851,0.035903,...,-1.304814,-0.290923,1.319824,-1.121193,0.950996,-1.236357,1.115522,-1.100334,1.464675,-1.029725
95408,-0.055799,-9.419524e-01,1.685985,0.717411,-0.249433,-0.206977,0.048356,0.880777,-0.770102,-0.647208,...,0.959045,-0.621303,1.319824,-0.864481,0.950996,-0.986097,1.115522,-0.864533,1.464675,-0.777790
95409,-0.055799,-1.115292e-01,0.509147,0.717411,-0.356881,-0.206977,-1.084987,1.079483,-0.713634,0.035903,...,0.415719,-0.621303,1.319824,-0.864481,0.374914,-1.486617,1.115522,0.786072,0.841986,0.985761
95410,-0.056847,-2.499331e-01,1.685985,0.717411,-0.356881,-0.206977,-0.213185,0.350894,-0.713634,0.491310,...,0.506273,0.700217,-0.918959,1.189216,0.950996,0.265203,3.020249,-1.336135,-1.648770,1.489632


In [17]:
# Hold out 30% of the rows for testing the classifier; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    pd.DataFrame(df),
    target_B,
    test_size=0.30,
    random_state=42,
)

In [18]:
# Class counts in the training split, before any resampling.
y_train.value_counts()

0    63369
1     3419
Name: TARGET_B, dtype: int64

In [23]:
# Oversample with SMOTE (3 nearest neighbours, fixed seed).
# Only the training split is resampled; X_test / y_test are left as they are.
sm = SMOTE(random_state=42, k_neighbors=3)
X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)


In [24]:
# Class counts in the training labels after SMOTE resampling.
y_train_SMOTE.value_counts()

0    63369
1    63369
Name: TARGET_B, dtype: int64

In [21]:
# Random forest classifier with a fixed seed; all other settings are left at their defaults.
clf = RandomForestClassifier(random_state=0)

In [25]:
# Train on the SMOTE-resampled training data.
clf.fit(X_train_SMOTE, y_train_SMOTE)

In [26]:
# Report the classifier's score on the resampled training data and on the
# untouched test split.
for split_label, split_features, split_labels in (
    ("train:", X_train_SMOTE, y_train_SMOTE),
    ("test:", X_test, y_test),
):
    print(split_label, clf.score(split_features, split_labels))

train: 1.0
test: 0.9499021799888205


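* This model has a high test accuracy (≈0.95), but the figure is misleading: class 0 makes up about 95% of the training labels (63,369 of 66,788 before SMOTE), so a model that always predicts 0 would be expected to score about the same. The train accuracy of 1.0 also suggests overfitting; per-class metrics (e.g. recall or a confusion matrix) are needed to judge the model.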

# Regression

In [27]:
# Regression target: the second column of `target`
# (shown as TARGET_D in the value counts further down).
target_D = target.iloc[:,1]

In [28]:
# Random forest regressor with a fixed seed and trees limited to depth 4.
reg = RandomForestRegressor(random_state=0, max_depth=4)

In [29]:
# Same 70/30 split settings as for the classifier, now against the regression
# target. This rebinds X_train / X_test / y_train / y_test, replacing the
# classification split held under the same names.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    target_D,
    test_size=0.30,
    random_state=42,
)

In [30]:
# Train the regressor on the training split (no resampling is applied here).
reg.fit(X_train, y_train)

In [31]:
# Report the regressor's score on the training and test splits.
for split_label, split_features, split_values in (
    ("train:", X_train, y_train),
    ("test:", X_test, y_test),
):
    print(split_label, reg.score(split_features, split_values))

train: 0.04875613649783517
test: 0.004507287953963046


In [32]:
# How often each value of the regression target occurs.
target_D.value_counts()

0.00     90569
10.00      941
15.00      591
20.00      577
5.00       503
         ...  
18.25        1
10.70        1
2.50         1
16.87        1
44.21        1
Name: TARGET_D, Length: 71, dtype: int64

* The regression scores are very low, presumably because the vast majority of TARGET_D values are 0 (90,569 of 95,412 rows).