# Lab | Random Forests

For this lab, you will be using the CSV files provided in the files_for_lab folder.

## Instructions

- Apply the Random Forests algorithm but this time only by upscaling the data using SMOTE.
- Note that since SMOTE works on numerical data only, we will first encode the categorical variables in this case.


### Loading the dataset

In [1]:
import pandas as pd
import numpy as np

from sklearn.datasets import load_boston
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

from scipy.stats import t, norm

# Categories

In [2]:
categorical = pd.read_csv("C:/Users/jw156/Ironhack/Day39/Afternoon/Lab/lab-random-forests/files_for_lab/categorical.csv")

In [3]:
to_drop=['STATE', 'CLUSTER', 'ODATEW_MM', 'DOB_MM', 'MINRDATE_MM',
       'MAXRDATE_MM', 'LASTDATE_MM','FIRSTDATE_MM', 'RFA_2R']


In [4]:
to_keep = categorical[['STATE', 'CLUSTER', 'HOMEOWNR', 'GENDER', 'DATASRCE', 'RFA_2R',
       'RFA_2A', 'GEOCODE2', 'DOMAIN_A', 'DOMAIN_B', 'ODATEW_YR', 'ODATEW_MM',
       'DOB_YR', 'DOB_MM', 'MINRDATE_YR', 'MINRDATE_MM', 'MAXRDATE_YR',
       'MAXRDATE_MM', 'LASTDATE_YR', 'LASTDATE_MM', 'FIRSTDATE_YR',
       'FIRSTDATE_MM']].nunique()

In [5]:
categorical.drop(to_drop, axis=1, inplace=True)

In [6]:
years_df = categorical[["ODATEW_YR","DOB_YR", "MINRDATE_YR", "MAXRDATE_YR","LASTDATE_YR", "FIRSTDATE_YR"]]

In [7]:
categorical.drop(years_df, axis=1, inplace=True)

In [8]:
df_dummies = pd.get_dummies(categorical, drop_first = True)

In [9]:
df_dummies

Unnamed: 0,DATASRCE,DOMAIN_B,HOMEOWNR_U,GENDER_M,GENDER_other,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,3,2,0,0,0,1,0,0,0,1,0,0,0,1,0
1,3,1,0,1,0,0,0,1,0,0,0,0,1,0,0
2,3,2,1,1,0,1,0,0,0,1,0,1,0,0,0
3,3,2,1,0,0,1,0,0,0,1,0,1,0,0,0
4,3,2,0,0,0,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,3,2,0,1,0,0,0,1,0,1,0,0,0,0,0
95408,3,1,0,1,0,0,1,0,0,0,0,0,0,0,0
95409,3,3,0,1,0,1,0,0,1,0,0,0,0,0,0
95410,2,1,0,0,0,0,1,0,0,0,0,0,0,0,0


In [10]:
for year in years_df:
    years_df[year] = pd.to_numeric(years_df[year], errors="coerce")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  years_df[year] = pd.to_numeric(years_df[year], errors="coerce")


In [11]:
# df = pd.concat([numerical, df_dummies, years_df], axis=1)

In [12]:
# df

# Numerical Columns

In [13]:
numerical = pd.read_csv("C:/Users/jw156/Ironhack/Day39/Afternoon/Lab/lab-random-forests/files_for_lab/numerical.csv")

In [14]:
nums_colums = numerical.columns

In [15]:
target = pd.read_csv("C:/Users/jw156/Ironhack/Day39/Afternoon/Lab/lab-random-forests/files_for_lab/target.csv")

## Feature selection based on variance.

In [18]:
from sklearn.feature_selection import VarianceThreshold

In [19]:
sel = VarianceThreshold(threshold=(.8))
Nums = sel.fit_transform(numerical)
new_numerical = pd.DataFrame(Nums)
print(numerical.shape)
print(new_numerical.shape)

(95412, 315)
(95412, 305)


In [20]:
def variance_threshold_selector(numerical, threshold=0.8):
    selector = VarianceThreshold(threshold)
    selector.fit(numerical)
    return numerical[numerical.columns[selector.get_support(indices=True)]]


In [21]:
Num = variance_threshold_selector(numerical)

In [22]:
Num.head()

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,...,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,RFA_2F,CLUSTER2
0,0,60.0,5,9,0,0,39,34,18,10,...,31,14,5.0,12.0,10.0,4,7.741935,95515,4,39
1,1,46.0,6,9,16,0,15,55,11,6,...,3,1,10.0,25.0,25.0,18,15.666667,148535,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,...,27,14,2.0,16.0,5.0,12,7.481481,15078,4,60
3,0,70.0,1,4,2,0,23,14,31,3,...,16,7,2.0,11.0,10.0,9,6.8125,172556,4,41
4,0,78.0,3,2,60,1,28,9,53,26,...,37,8,3.0,15.0,15.0,14,6.864865,7112,2,26


## Select K-Best

In [23]:
target['TARGET_D']

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
         ... 
95407     0.0
95408     0.0
95409     0.0
95410    18.0
95411     0.0
Name: TARGET_D, Length: 95412, dtype: float64

In [25]:
X = Num
y = target['TARGET_B']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
kbest = SelectKBest(chi2, k=20).fit_transform(X, y)
# Here we chose 10 so that is easier to analyze results later, as we will see
selected = pd.DataFrame(kbest)
selected.head()

# To check the scores
model = SelectKBest(chi2, k=20).fit(X, y)
df =pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = Num.columns
# Sorting data
print(df.sort_values(by = ['score'], ascending = False).head(10))


             score    Column
302  527716.426176  CONTROLN
139  187983.976667       IC5
82    49855.611718       HV1
83    49561.067003       HV2
0     39087.069814     TCODE
132   26891.429352       MSA
13    17167.230879    POP901
136    2921.367106       IC2
14     2811.233301    POP902
294    2756.199364  RAMNTALL


In [26]:
list_to_keep = list(df.sort_values(by = ['score'], ascending = False).head(20)['Column'].values)

In [27]:
Num = Num[list_to_keep]

In [28]:
my_list_of_tuples = [(index,value) for (index,value) in enumerate(list(sel.variances_)) if value < 0.9]

In [29]:
cols_nul_variance =  [list(numerical.columns)[index] for index in [x[0] for x in my_list_of_tuples]]
cols_nul_variance

['ETH6',
 'TPE6',
 'TPE7',
 'ANC5',
 'ANC6',
 'ANC11',
 'ANC15',
 'HC15',
 'MHUC2',
 'HPHONE_D']

# Concat

In [30]:
df = pd.concat([Num, df_dummies, years_df], axis=1)

In [31]:
df

Unnamed: 0,CONTROLN,IC5,HV1,HV2,TCODE,MSA,POP901,IC2,POP902,RAMNTALL,...,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U,ODATEW_YR,DOB_YR,MINRDATE_YR,MAXRDATE_YR,LASTDATE_YR,FIRSTDATE_YR
0,95515,12883,479,635,0,0.0,992,318,264,240.0,...,0,0,1,0,89,37,92,94,95,89
1,148535,36175,5468,5218,1,4480.0,3611,1096,940,47.0,...,0,1,0,0,94,52,93,95,95,93
2,15078,11576,497,546,1,0.0,7001,292,2040,202.0,...,1,0,0,0,90,0,91,92,95,90
3,172556,15130,1000,1263,0,9340.0,640,388,160,109.0,...,1,0,0,0,87,28,87,94,95,87
4,7112,9836,576,594,0,5000.0,2520,250,627,254.0,...,0,1,0,0,86,20,93,96,96,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,184568,18807,988,1025,1,380.0,27380,481,7252,25.0,...,0,0,0,0,96,0,96,96,96,96
95408,122706,26538,1679,1723,1,3360.0,1254,836,322,20.0,...,0,0,0,0,96,50,96,96,96,96
95409,189641,12178,376,377,1,4040.0,552,264,131,58.0,...,0,0,0,0,95,38,96,95,96,94
95410,4693,15948,2421,2459,0,8735.0,1746,544,432,498.0,...,0,0,0,0,86,40,90,96,97,86


## Random Forests algorithm

In [32]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier


In [33]:
#-- 1 --
#X-Y-SPLIT
X = df
Y = target['TARGET_B']


#-- 2 --
#TRAIN-TEST-SPLIT
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=100)


In [34]:
#-- 4.2 --
#BALANCING
from imblearn.over_sampling import SMOTE

sa = SMOTE()
X_s, Y_s = sa.fit_sample(X_train, Y_train)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_s, Y_s)
print("The accuracy of the Random forest is: {:4.2f}".format(clf.score(X_test, Y_test)))
print()

alpha = 0.05
K = 10
# For cross validation
clf = RandomForestClassifier(max_depth=2, random_state=0)
cross_val_scores = cross_val_score(clf, X_s, Y_s, cv=K)

if (K < 30):
    t_critical = abs(t.ppf(1-alpha/2, K-1))
    interval = t_critical*(np.std(cross_val_score(clf,  X_s, Y_s, cv=10))/np.sqrt(K))
else:
    z_critical = abs(norm.ppf(1-alpha/2))
    interval = z_critical*(np.std(cross_val_score(clf,  X_s, Y_s, cv=10))/np.sqrt(K)) 
print("The accuracy of the Random Forest model (CV witk K={}) is: {:4.2f} +/- {:4.2f}".format(K,np.mean(cross_val_scores),interval))

The accuracy of the Random forest is: 0.73

