# Lab | Random Forests

### Importing the datasets

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

In [2]:
numerical=pd.read_csv('./files_for_lab/numerical.csv')
categorical=pd.read_csv('./files_for_lab/categorical.csv')
targets=pd.read_csv('./files_for_lab/target.csv')

In [3]:
all_data= pd.concat([numerical, categorical, targets], axis = 1)

In [4]:
all_data['TARGET_B'].value_counts()

0    90569
1     4843
Name: TARGET_B, dtype: int64

In [5]:
all_data.shape

(95412, 339)

In [6]:
all_data.isna().sum().sum() #no nans in the data

0

### X,y split and Train-Test Split

In [7]:
y_class= all_data['TARGET_B'] #target for classification
X= all_data.drop(['TARGET_B','TARGET_D'],axis=1) #TARGET_D will be used for regression

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=0)

X_train_num   = X_train.select_dtypes(np.number)
X_test_num    = X_test.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object) 
X_test_cat  = X_test.select_dtypes(object) 

### Scaling and Encoding

In [9]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first').fit(X_train_cat)
encoded_cat_train = encoder.transform(X_train_cat).toarray()
X_train_encoded = pd.DataFrame(encoded_cat_train, columns=encoder.get_feature_names_out())

encoded_cat_test = encoder.transform(X_test_cat).toarray()
X_test_encoded = pd.DataFrame(encoded_cat_test, columns=encoder.get_feature_names_out())

In [10]:
from sklearn.preprocessing import MinMaxScaler

transformer = MinMaxScaler().fit(X_train_num)
scaled_train = transformer.transform(X_train_num)
X_train_scaled = pd.DataFrame(scaled_train, columns=X_train_num.columns)

scaled_test  = transformer.transform(X_test_num)
X_test_scaled  = pd.DataFrame(scaled_test, columns=X_train_num.columns)

X_train_trans = pd.concat([X_train_scaled, X_train_encoded], axis = 1)
X_test_trans  = pd.concat([X_test_scaled,  X_test_encoded],  axis = 1)

In [11]:
display(X_train_trans.head())
display(y_train) #need to reset_index on y_train

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2,CLUSTER,DATASRCE,DOMAIN_B,ODATEW_YR,ODATEW_MM,DOB_YR,DOB_MM,MINRDATE_YR,MINRDATE_MM,MAXRDATE_YR,MAXRDATE_MM,LASTDATE_YR,LASTDATE_MM,FIRSTDATE_YR,FIRSTDATE_MM,STATE_FL,STATE_GA,STATE_IL,STATE_IN,STATE_MI,STATE_MO,STATE_NC,STATE_TX,STATE_WA,STATE_WI,STATE_other,HOMEOWNR_U,GENDER_M,GENDER_other,RFA_2A_E,RFA_2A_F,RFA_2A_G,GEOCODE2_B,GEOCODE2_C,GEOCODE2_D,DOMAIN_A_R,DOMAIN_A_S,DOMAIN_A_T,DOMAIN_A_U
0,1.7e-05,0.762887,0.5,0.666667,0.008299,0.0,0.313131,0.10101,0.686869,0.060606,0.121212,0.034483,1.0,0.007153,0.009129,0.007683,1.0,0.0,0.0,0.474747,0.535354,0.757576,0.212121,0.0,0.020202,0.020202,0.0,0.0,0.0,0.0,0.0,0.021277,0.0,0.0,0.017544,0.0,0.023256,0.404762,0.5,0.547619,0.428571,0.535714,0.583333,0.333333,0.363636,0.474747,0.171717,0.131313,0.212121,0.191919,0.131313,0.151515,0.121212,0.060606,0.141414,0.141414,0.353535,0.212121,0.161616,0.282828,0.090909,0.252525,0.212121,0.383838,0.424242,0.232323,0.10101,0.040404,0.616162,0.10101,0.09589,0.222222,0.272308,0.371429,0.777778,0.757576,0.028571,0.222222,0.20202,0.10101,0.0,0.0,0.0,0.105167,0.107333,0.230769,0.230769,0.717172,0.292929,0.939394,0.070707,0.0,0.343434,0.757576,0.59596,0.242424,0.888889,0.121212,0.10101,0.04,0.080808,0.090909,0.212121,0.040404,0.186667,0.434343,0.191919,0.162791,0.141414,0.012346,0.0,0.010101,0.040404,0.232323,0.757576,0.0,0.0,0.525253,0.635294,0.633333,0.229508,0.1,0.161616,0.060606,0.0,0.111111,0.090909,0.090909,0.0,0.030303,0.080808,0.808081,0.929293,0.337607,0.327189,0.643587,0.225333,0.220667,0.238,0.228667,0.068232,0.10101,0.313131,0.131313,0.212121,0.212121,0.050505,0.0,0.0,0.0,0.080808,0.333333,0.161616,0.212121,0.161616,0.060606,0.0,0.0,0.0,0.414141,0.070707,0.575758,0.070707,0.343434,0.666667,0.111111,0.757576,0.20202,0.0,0.0,0.0,0.0,0.0,0.0,0.050505,0.0,0.050505,0.155556,0.25,0.050505,0.454545,0.676768,0.666667,0.686869,0.636364,0.686869,0.909091,0.59596,1.0,1.0,0.0,0.272727,0.20202,0.020202,0.10101,0.171717,0.0,0.0,0.070707,0.0,0.040404,0.080808,0.028571,0.030303,0.0,0.0,0.020202,0.141414,0.1,0.078125,0.020202,0.222222,0.030303,0.0,0.0,0.0,0.030303,0.282828,0.070707,0.090909,0.060606,0.121212,0.030303,0.040404,0.59596,0.161616,0.0,0.823529,0.050505,0.10101,0.292929,0.121212,0.162162,0.252525,0.141414,0.072165,0.222222,0.033333,0.263889,0.080808,0.0,0.0,0.0,0.211268,0.313131,0.066667,0.10101,0.30303,0.686869,0.0,0.0,0.060606,0.0,0.012048,0.0,0.0,0.122449,0.018182,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011111,0.676768,0.989899,0.020202,0.0,0.0,0.969697,0.666667,0.131313,0.322581,0.615385,0.0,0.050505,0.050505,0.141414,0.464646,0.545455,0.0,0.0,0.212121,0.030303,0.262626,0.474747,0.0,0.030303,1.0,0.0,1.0,1.0,0.909091,0.285714,0.4,0.090909,0.080808,0.366667,0.230366,0.263158,0.12987,0.008658,0.063559,0.195122,0.001,0.001001,0.007,0.003676,0.00472,0.127215,1.0,0.666667,0.508197,0.442308,1.0,0.0,0.214286,0.0,0.237113,0.0,0.636364,0.090909,0.681818,1.0,0.5,0.0,0.916667,0.181818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.536082,0.666667,1.0,0.0,0.0,0.292929,0.242424,0.383838,0.070707,0.080808,0.045977,1.0,0.045856,0.055403,0.044968,0.0,0.0,1.0,0.494949,0.515152,0.757576,0.161616,0.090909,0.0,0.010101,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.010309,0.0,0.0,0.0,0.416667,0.559524,0.607143,0.452381,0.583333,0.630952,0.373333,0.393939,0.40404,0.20202,0.111111,0.181818,0.161616,0.151515,0.141414,0.121212,0.141414,0.141414,0.181818,0.343434,0.191919,0.151515,0.353535,0.141414,0.323232,0.212121,0.333333,0.464646,0.30303,0.141414,0.050505,0.626263,0.080808,0.150685,0.181818,0.290769,0.395714,0.787879,0.777778,0.0,0.030303,0.030303,0.010101,0.020202,0.020202,0.0,0.042833,0.052667,0.153846,0.076923,0.79798,0.212121,0.89899,0.111111,0.070707,0.393939,0.777778,0.636364,0.30303,0.888889,0.121212,0.090909,0.04,0.070707,0.10101,0.181818,0.020202,0.213333,0.40404,0.181818,0.093023,0.080808,0.049383,0.0,0.010101,0.020202,0.040404,0.161616,0.0,0.010101,0.373737,0.576471,0.577778,0.229508,0.125,0.030303,0.0,0.191919,0.141414,0.020202,0.010101,0.040404,0.0,0.010101,0.050505,0.30303,0.0,0.41321,0.761635,0.117333,0.142667,0.148667,0.166,0.044796,0.454545,0.20202,0.141414,0.141414,0.050505,0.010101,0.0,0.0,0.0,0.363636,0.222222,0.171717,0.171717,0.060606,0.010101,0.0,0.0,0.0,0.414141,0.141414,0.252525,0.232323,0.353535,0.656566,0.040404,0.767677,0.212121,0.0,0.0,0.0,0.0,0.0,0.010101,0.020202,0.0,0.444444,0.344444,0.447368,0.131313,0.59596,0.484848,0.585859,0.40404,0.555556,0.363636,0.555556,0.444444,0.575758,0.909091,0.10101,0.070707,0.060606,0.030303,0.121212,0.141414,0.0,0.036364,0.10101,0.050505,0.131313,0.131313,0.1,0.070707,0.040404,0.04918,0.080808,0.242424,0.083333,0.03125,0.030303,0.151515,0.050505,0.030303,0.020202,0.016393,0.080808,0.080808,0.030303,0.060606,0.070707,0.080808,0.040404,0.080808,0.69697,0.040404,0.0,0.705882,0.20202,0.212121,0.393939,0.111111,0.081081,0.030303,0.030303,0.010309,0.242424,0.033333,0.277778,0.040404,0.0,0.0,0.0,0.197183,0.292929,0.033333,0.242424,0.171717,0.383838,0.10101,0.0,0.030303,0.0,0.048193,0.0,0.0,0.061224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.808081,0.989899,0.010101,0.0,0.010101,0.888889,0.606061,0.232323,0.290323,0.442308,0.010101,0.050505,0.20202,0.454545,0.59596,0.414141,0.0,0.016129,0.666667,0.121212,0.10101,0.010101,0.0,0.111111,0.969697,0.010101,0.565657,0.979798,0.868687,0.380952,0.4,0.070707,0.070707,0.083333,0.052356,0.263158,0.142857,0.000634,0.0,0.02439,0.02,0.003003,0.02,0.008272,0.018738,0.599688,0.0,0.0,0.967213,1.0,0.5,0.666667,0.928571,0.0,0.463918,0.0,0.954545,0.090909,0.954545,0.090909,0.5,0.090909,1.0,0.090909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
2,1.7e-05,0.608247,0.666667,0.111111,0.020747,0.0,0.424242,0.161616,0.626263,0.10101,0.020202,0.011494,0.111111,0.067304,0.091154,0.087309,1.0,0.0,0.0,0.454545,0.555556,0.989899,0.0,0.0,0.010101,0.020202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017544,0.0,0.011628,0.595238,0.702381,0.738095,0.559524,0.642857,0.678571,0.2,0.505051,0.353535,0.151515,0.10101,0.161616,0.111111,0.080808,0.151515,0.252525,0.151515,0.232323,0.191919,0.30303,0.151515,0.131313,0.464646,0.141414,0.444444,0.292929,0.494949,0.232323,0.10101,0.040404,0.010101,0.606061,0.111111,0.178082,0.151515,0.221538,0.298571,0.454545,0.383838,0.042857,0.40404,0.373737,0.323232,0.020202,0.020202,0.0,0.102667,0.108167,0.307692,0.230769,0.69697,0.313131,0.838384,0.171717,0.575758,0.20202,0.666667,0.545455,0.131313,0.777778,0.232323,0.070707,0.02,0.060606,0.131313,0.262626,0.060606,0.173333,0.434343,0.424242,0.0,0.0,0.0,0.010101,0.010101,0.060606,0.242424,0.757576,0.0,0.030303,0.191919,0.494118,0.511111,0.196721,0.1,0.171717,0.232323,0.141414,0.040404,0.080808,0.181818,0.010101,0.020202,0.292929,0.777778,0.969697,0.884615,0.201229,0.611805,0.141333,0.156,0.165333,0.188,0.068255,0.333333,0.272727,0.191919,0.131313,0.050505,0.010101,0.02,0.0,0.0,0.242424,0.282828,0.222222,0.161616,0.070707,0.020202,0.02,0.0,0.0,0.454545,0.050505,0.474747,0.151515,0.676768,0.333333,0.323232,0.868687,0.111111,0.0,0.0,0.0,0.0,0.0,0.010101,0.010101,0.012048,0.353535,0.2,0.328947,0.080808,0.555556,0.464646,0.525253,0.414141,0.484848,0.40404,0.656566,0.484848,0.676768,1.0,0.070707,0.151515,0.070707,0.060606,0.141414,0.151515,0.0,0.036364,0.151515,0.040404,0.111111,0.010101,0.071429,0.060606,0.040404,0.0,0.090909,0.060606,0.083333,0.03125,0.020202,0.232323,0.070707,0.040404,0.050505,0.032787,0.131313,0.070707,0.080808,0.050505,0.10101,0.020202,0.010101,0.060606,0.757576,0.060606,0.0,0.705882,0.080808,0.191919,0.383838,0.181818,0.162162,0.070707,0.030303,0.030928,0.161616,0.033333,0.152778,0.070707,0.0,0.0,0.0,0.28169,0.424242,0.066667,0.161616,0.141414,0.626263,0.090909,0.012048,0.050505,0.066667,0.13253,0.0,0.0,0.122449,0.181818,0.014706,0.050505,0.0,0.0,0.052632,0.037037,0.03125,0.055556,0.161616,0.909091,0.020202,0.0,0.080808,0.959596,0.333333,0.060606,0.096774,0.153846,0.070707,0.343434,0.59596,0.848485,0.929293,0.080808,0.1,0.064516,0.0,0.020202,0.969697,0.010101,0.0,0.010101,0.949495,0.060606,0.909091,1.0,0.979798,0.285714,0.4,0.040404,0.090909,0.416667,0.314136,0.315789,0.155844,0.006652,0.059322,0.146341,0.002,0.001001,0.01,0.004596,0.003853,0.237264,1.0,0.666667,0.786885,0.615385,1.0,0.666667,0.357143,0.0,0.381443,0.727273,0.727273,0.818182,0.909091,0.909091,0.0,0.909091,0.916667,0.727273,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.7e-05,0.783505,0.833333,0.666667,0.037344,0.010101,0.40404,0.232323,0.414141,0.080808,0.030303,0.045977,0.666667,0.014063,0.017853,0.013728,1.0,0.0,0.0,0.494949,0.515152,0.787879,0.010101,0.010101,0.151515,0.111111,0.0,0.013889,0.050505,0.089552,0.021739,0.021277,0.013889,0.072165,0.017544,0.0,0.034884,0.47619,0.583333,0.630952,0.47619,0.571429,0.619048,0.28,0.343434,0.444444,0.232323,0.10101,0.161616,0.171717,0.141414,0.222222,0.141414,0.060606,0.151515,0.131313,0.353535,0.212121,0.161616,0.343434,0.080808,0.292929,0.151515,0.383838,0.474747,0.313131,0.141414,0.060606,0.636364,0.080808,0.109589,0.212121,0.295385,0.407143,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.342,0.340333,0.692308,0.615385,0.888889,0.121212,1.0,0.010101,0.0,0.323232,0.828283,0.717172,0.272727,0.909091,0.10101,0.050505,0.02,0.030303,0.090909,0.151515,0.030303,0.16,0.444444,0.212121,0.023256,0.010101,0.0,0.545455,0.89899,0.979798,1.0,1.0,0.010101,0.0,0.606061,0.635294,0.644444,0.245902,0.125,0.0,0.0,0.0,0.121212,0.0,0.0,0.0,0.89899,0.919192,0.939394,0.939394,0.616987,0.099846,0.916005,0.292667,0.31,0.333333,0.350667,0.114323,0.050505,0.050505,0.232323,0.272727,0.262626,0.10101,0.04,0.0,0.020202,0.010101,0.050505,0.222222,0.272727,0.323232,0.090909,0.06,0.0,0.030303,0.323232,0.060606,0.575758,0.020202,0.313131,0.69697,0.060606,0.818182,0.10101,0.040404,0.020202,0.028169,0.0,0.08,0.010101,0.010101,0.0,0.131313,0.233333,0.315789,0.030303,0.666667,0.646465,0.707071,0.585859,0.686869,0.585859,0.727273,0.636364,0.767677,1.0,0.050505,0.121212,0.141414,0.050505,0.070707,0.252525,0.0,0.072727,0.060606,0.010101,0.121212,0.040404,0.028571,0.070707,0.020202,0.0,0.060606,0.141414,0.133333,0.09375,0.090909,0.191919,0.060606,0.020202,0.040404,0.032787,0.080808,0.070707,0.0,0.070707,0.080808,0.030303,0.040404,0.030303,0.787879,0.040404,0.0,0.705882,0.060606,0.131313,0.363636,0.212121,0.297297,0.090909,0.030303,0.030928,0.212121,0.033333,0.208333,0.080808,0.010309,0.010101,0.0,0.28169,0.40404,0.0,0.232323,0.272727,0.414141,0.070707,0.0,0.020202,0.0,0.060241,0.0,0.0,0.122449,0.145455,0.0,0.010101,0.219512,0.0,0.052632,0.074074,0.0,0.188889,0.545455,0.828283,0.040404,0.090909,0.050505,1.0,0.777778,0.393939,0.612903,0.673077,0.0,0.0,0.0,0.010101,0.141414,0.868687,0.0,0.0,0.888889,0.030303,0.050505,0.0,0.0,0.030303,1.0,0.0,1.0,1.0,1.0,0.428571,0.4,0.141414,0.090909,0.433333,0.314136,0.315789,0.155844,0.015099,0.033898,0.170732,0.01,0.004004,0.02,0.003676,0.01618,0.863723,1.0,0.0,0.131148,0.076923,1.0,0.333333,0.428571,0.0,0.216495,0.0,0.590909,0.909091,0.818182,0.727273,0.0,1.0,0.916667,0.909091,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5.2e-05,0.556701,0.666667,0.222222,0.087137,0.333333,0.272727,0.292929,0.181818,0.121212,0.020202,0.08046,1.0,0.01004,0.012052,0.010988,0.939394,0.0,0.070707,0.505051,0.505051,0.888889,0.080808,0.0,0.030303,0.040404,0.0,0.0,0.0,0.014925,0.021739,0.0,0.0,0.020619,0.017544,0.0,0.011628,0.309524,0.380952,0.416667,0.321429,0.428571,0.47619,0.4,0.545455,0.343434,0.121212,0.20202,0.383838,0.161616,0.131313,0.080808,0.040404,0.010101,0.222222,0.242424,0.343434,0.131313,0.070707,0.060606,0.020202,0.060606,0.242424,0.313131,0.454545,0.252525,0.080808,0.010101,0.626263,0.131313,0.041096,0.222222,0.281538,0.364286,0.222222,0.222222,0.014286,0.272727,0.262626,0.252525,0.0,0.0,0.0,0.1025,0.100167,0.307692,0.230769,0.686869,0.323232,0.969697,0.040404,0.191919,0.444444,0.69697,0.555556,0.333333,0.838384,0.171717,0.111111,0.02,0.10101,0.212121,0.131313,0.070707,0.32,0.59596,0.050505,0.069767,0.050505,0.0,0.0,0.0,0.0,0.070707,0.838384,0.0,0.030303,0.222222,0.517647,0.533333,0.229508,0.125,0.060606,0.20202,0.515152,0.040404,0.020202,0.242424,0.030303,0.010101,0.161616,0.919192,0.959596,0.632479,0.462366,0.740068,0.164667,0.172,0.179333,0.182667,0.064267,0.222222,0.292929,0.282828,0.131313,0.050505,0.010101,0.02,0.0,0.0,0.212121,0.252525,0.363636,0.080808,0.080808,0.0,0.04,0.0,0.0,0.10101,0.040404,0.434343,0.151515,0.777778,0.232323,0.353535,0.808081,0.151515,0.0,0.0,0.0,0.0,0.0,0.0,0.060606,0.0,0.272727,0.155556,0.25,0.0,0.505051,0.868687,0.929293,0.79798,0.878788,0.717172,0.636364,0.555556,0.919192,1.0,0.020202,0.111111,0.040404,0.050505,0.10101,0.232323,0.0,0.054545,0.20202,0.020202,0.090909,0.030303,0.128571,0.020202,0.020202,0.0,0.040404,0.070707,0.116667,0.046875,0.060606,0.181818,0.070707,0.10101,0.0,0.032787,0.111111,0.10101,0.090909,0.050505,0.121212,0.020202,0.070707,0.090909,0.646465,0.070707,0.0,0.723529,0.050505,0.121212,0.313131,0.343434,0.108108,0.141414,0.0,0.041237,0.424242,0.1,0.277778,0.232323,0.195876,0.333333,0.051282,0.225352,0.272727,0.2,0.292929,0.111111,0.181818,0.545455,0.024096,0.030303,0.0,0.216867,0.0,0.0,0.102041,0.054545,0.0,0.020202,0.0,0.0,0.0,0.0,0.0,0.066667,0.393939,0.89899,0.010101,0.030303,0.060606,0.979798,0.484848,0.090909,0.096774,0.307692,0.030303,0.171717,0.505051,0.79798,0.919192,0.090909,0.0,0.0,0.89899,0.020202,0.10101,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.989899,0.285714,0.0,0.060606,0.040404,0.183333,0.141361,0.210526,0.103896,0.001795,0.008475,0.02439,0.005,0.002002,0.015,0.004596,0.009059,0.555564,1.0,0.0,0.57377,0.5,1.0,0.333333,0.785714,0.0,0.443299,0.0,0.863636,0.0,0.909091,0.272727,0.0,0.272727,0.979167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


79401    0
86429    0
76729    1
38838    0
83012    0
        ..
21243    0
45891    0
42613    1
43567    0
68268    0
Name: TARGET_B, Length: 76329, dtype: int64

### Treating the imbalance

In [12]:
trainset = pd.concat([X_train_trans, y_train.reset_index(drop=True)], axis=1)

In [13]:
category_0 = trainset[trainset['TARGET_B']==0]
print(category_0.shape)

category_1_upsampled = trainset[trainset['TARGET_B']== 1 ].sample(len(trainset[trainset['TARGET_B']==0]),replace=True)
print(category_1_upsampled.shape)

(72486, 355)
(72486, 355)


In [14]:
trainset_upsampled= pd.concat([category_0, category_1_upsampled], axis = 0) 
trainset_upsampled = trainset_upsampled.sample(frac=1) #randomize the rows

In [15]:
X_train = trainset_upsampled.drop(['TARGET_B'], axis=1)
y_train = trainset_upsampled['TARGET_B']

### Random Forest Classifier

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             #max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test_trans, y_test))

y_pred = clf.predict(X_test_trans)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6259829484314212
0.6071896452339779


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[11042,  7041],
       [  455,   545]], dtype=int64)

### Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                            # max_samples=0.8
                            )
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5,scoring='f1')
print(np.mean(cross_val_scores))

0.6245545310657385


In [18]:
print(cross_val_scores)

[0.61914626 0.62776304 0.62731325 0.62429035 0.62425975]


## Feature Selection

### Variance Threshold

#### Scaling the numericals

In [19]:
num_scaled=MinMaxScaler().fit_transform(numerical)
num_scaled=pd.DataFrame(num_scaled,columns=numerical.columns)

In [20]:
from sklearn.feature_selection import VarianceThreshold 

var_threshold= 0.02
sel = VarianceThreshold(threshold=(var_threshold))

sel= sel.fit(num_scaled)

In [21]:
var_list= list(sel.get_support()) #gives false for columns below the threshold

In [22]:
cols=[col[0] for col in zip(numerical.columns, var_list) if col[1] == False]
len(cols) # no of columns with low variance

236

In [23]:
# I want to keep columns with low variance that have correlation with the target > 0.1

corr=[]
for col in cols:
    if abs(num_scaled[col].corr(y_class)) >= 0.1:
        corr.append(col)
corr #  empty list means no columns have any good correaltion with the target

[]

In [24]:
#dropping columns with low variance

selected_cols=[col[0] for col in zip(numerical.columns, var_list) if col[1] == True]
num_data=pd.DataFrame(sel.transform(num_scaled),columns=selected_cols)
num_data.shape

(95412, 79)

#### Checking correlation between the remaining columns with each other

In [25]:
corr_matrix=num_data.corr()
corr_matrix.shape

(79, 79)

In [26]:
#finding columns that are highly correlated with each other

def high_corr(matrix):
    highly_corr=pd.DataFrame()
    
    for col in matrix.columns:
        correlated_cols = matrix[(abs(matrix[col]) >= 0.95)].index   
        correlated_pairs = pd.DataFrame({'Column 1': col, 'Column 2': correlated_cols})  
        highly_corr = pd.concat([highly_corr, correlated_pairs], ignore_index=True)
    highly_corr = highly_corr[highly_corr['Column 1'] < highly_corr['Column 2']]    
    return highly_corr

In [27]:
high_corr(corr_matrix)

Unnamed: 0,Column 1,Column 2
13,DW1,DW2
17,DW4,DW5
20,DW5,DW6
23,DW6,HUPA2
25,HV1,HV2
61,LFC2,LFC4


In [28]:
#need drop columns 'DW2', 'DW5','DW6','HV2','LFC4' 

num_data=num_data.drop(['DW2', 'DW5','DW6','HV2','LFC4'],axis=1)

In [29]:
#joining the numerical and categorical data

X=pd.concat([num_data,categorical],axis=1)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y_class, test_size=0.2, random_state=0)


#numerical is already scaled
X_train_num   = X_train.select_dtypes(np.number)
X_test_num    = X_test.select_dtypes(np.number)
X_train_cat = X_train.select_dtypes(object) 
X_test_cat  = X_test.select_dtypes(object) 

#### Encoding the categorical

In [31]:
encoder = OneHotEncoder(drop='first').fit(X_train_cat)
encoded_cat_train = encoder.transform(X_train_cat).toarray()
X_train_encoded = pd.DataFrame(encoded_cat_train, columns=encoder.get_feature_names_out())

encoded_cat_test = encoder.transform(X_test_cat).toarray()
X_test_encoded = pd.DataFrame(encoded_cat_test, columns=encoder.get_feature_names_out())

In [32]:
X_train=pd.concat([X_train_num.reset_index(drop=True), X_train_encoded],axis=1)
X_test=pd.concat([X_test_num.reset_index(drop=True),X_test_encoded],axis=1)

#### Need to deal with imbalance again

In [33]:
trainset=pd.concat([X_train,y_train.reset_index(drop=True)],axis=1)

In [34]:
trainset['TARGET_B'].value_counts()

0    72486
1     3843
Name: TARGET_B, dtype: int64

In [35]:
category_0 = trainset[trainset['TARGET_B']==0]
print(category_0.shape)

category_1_upsampled = trainset[trainset['TARGET_B']== 1 ].sample(len(trainset[trainset['TARGET_B']==0]),replace=True)
print(category_1_upsampled.shape)

(72486, 114)
(72486, 114)


In [36]:
trainset_upsampled= pd.concat([category_0, category_1_upsampled], axis = 0) 
trainset_upsampled = trainset_upsampled.sample(frac=1) #randomize the rows

In [37]:
X_train = trainset_upsampled.drop(['TARGET_B'], axis=1)
y_train = trainset_upsampled['TARGET_B']

#### Random Forest Classifier

In [38]:
clf = RandomForestClassifier(max_depth=5, # max number of questions to ask
                             min_samples_split=20, # amount of rows still considered at every question
                             min_samples_leaf =20, # ultimate answer based on at least this many rows
                             #max_samples=0.8, # fraction of X-train to use in each tree
                             random_state=42)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

y_pred = clf.predict(X_test)
display(y_test.value_counts())
display(confusion_matrix(y_test, y_pred))

0.6248930828021962
0.586909815018603


0    18083
1     1000
Name: TARGET_B, dtype: int64

array([[10627,  7456],
       [  427,   573]], dtype=int64)

In [39]:
#no of actual donors predicted as non-donors
#avg.donation=15

427*15

6405

In [40]:
#no .of non_donors predicted as donors
#cost per post=0.68

7456*(.68)

5070.08

- depending on if we want get more donation, we should reduce false negatives i.e. (427) by recall
- if we want to reduce money spent on post to non-donors, we should reduce false positives i.e (7456) by precision
- here we can see that both have almost same impact, so we can try to improve overall results by using 'f1'

#### Cross Validation

In [41]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20,
                            # max_samples=0.8
                            )
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5,scoring='f1')
print(np.mean(cross_val_scores))

0.6332907177727294


In [42]:
print(cross_val_scores)

[0.63020676 0.63845127 0.63305527 0.62998077 0.63475952]


Features selection improves the model slightly.