In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
%load_ext watermark
%watermark --iversions

numpy : 1.18.5
pandas: 1.0.1



In [95]:
data= pd.read_csv("Data/data3.csv")
data.shape

(18545, 23)

In [4]:
data.dtypes

DISAPPROVALS_BL        float64
DISAPPROVALS_EN        float64
DISAPPROVALS_HUMAN       int64
DISAPPROVALS_FM        float64
DISAPPROVALS_OM        float64
DISAPPROVALS_EXACTS      int64
NOTE_BL                float64
NOTE_EN                float64
NOTE_FM                float64
NOTE_OM                float64
NOTE_BL_CAT             object
NOTE_EN_CAT             object
NOTE_FM_CAT             object
NOTE_OM_CAT             object
NOTE_BL_CAT_N            int64
NOTE_EN_CAT_N            int64
NOTE_FM_CAT_N            int64
NOTE_OM_CAT_N            int64
ENGLISH                float64
H_CLASS_PRES           float64
ONLINE_TASKS           float64
ABSENCES               float64
PROFILE_N              float64
dtype: object

In [5]:
df= data.drop(["NOTE_BL_CAT", "NOTE_EN_CAT", "NOTE_FM_CAT", "NOTE_OM_CAT"], axis= 1)
df.dtypes

DISAPPROVALS_BL        float64
DISAPPROVALS_EN        float64
DISAPPROVALS_HUMAN       int64
DISAPPROVALS_FM        float64
DISAPPROVALS_OM        float64
DISAPPROVALS_EXACTS      int64
NOTE_BL                float64
NOTE_EN                float64
NOTE_FM                float64
NOTE_OM                float64
NOTE_BL_CAT_N            int64
NOTE_EN_CAT_N            int64
NOTE_FM_CAT_N            int64
NOTE_OM_CAT_N            int64
ENGLISH                float64
H_CLASS_PRES           float64
ONLINE_TASKS           float64
ABSENCES               float64
PROFILE_N              float64
dtype: object

In [96]:
x = df.iloc[:, 0:18]
y = df.PROFILE_N

In [97]:
smote = SMOTE(random_state= 100)
x, y= smote.fit_resample(x, y)

In [8]:
y.value_counts()

3.0    8229
5.0    8229
4.0    8229
1.0    8229
2.0    8229
Name: PROFILE_N, dtype: int64

In [9]:
xtrain, xtest, ytrain, ytest= train_test_split(x, y, test_size=.25, random_state= 100)

In [10]:
xtrain

Unnamed: 0,DISAPPROVALS_BL,DISAPPROVALS_EN,DISAPPROVALS_HUMAN,DISAPPROVALS_FM,DISAPPROVALS_OM,DISAPPROVALS_EXACTS,NOTE_BL,NOTE_EN,NOTE_FM,NOTE_OM,NOTE_BL_CAT_N,NOTE_EN_CAT_N,NOTE_FM_CAT_N,NOTE_OM_CAT_N,ENGLISH,H_CLASS_PRES,ONLINE_TASKS,ABSENCES
12028,0.0,0.0,0,0.0,0.0,0,5.200000,6.400000,4.600000,5.000000,6,8,6,6,0.0,13.000000,7.0,1.000000
39882,0.0,0.0,0,0.0,0.0,0,7.201905,7.396191,7.003809,6.901905,8,8,8,8,0.0,5.000000,5.0,3.000000
23068,0.0,0.0,0,0.0,0.0,0,7.180958,7.326986,7.288903,6.538083,8,8,8,8,1.0,3.000000,1.0,6.269861
21313,0.0,0.0,0,0.0,0.0,0,7.077455,6.922545,7.369939,5.352606,8,8,8,6,1.0,2.000000,2.0,5.000000
34160,0.0,0.0,0,0.0,0.0,0,7.160756,7.660756,7.143022,7.039244,8,8,8,8,1.0,4.196222,0.0,5.607556
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16304,0.0,0.0,0,0.0,0.0,0,7.200000,6.100000,6.600000,5.200000,8,8,8,6,1.0,3.000000,2.0,8.000000
79,0.0,0.0,0,0.0,0.0,0,5.700000,5.600000,5.200000,5.700000,6,6,6,6,1.0,2.000000,5.0,3.000000
12119,0.0,0.0,0,0.0,0.0,0,5.400000,5.100000,5.000000,5.400000,6,6,6,6,0.0,5.000000,0.0,3.000000
14147,0.0,0.0,0,0.0,0.0,0,6.600000,6.000000,6.600000,6.300000,8,6,8,8,0.0,9.000000,4.0,4.000000


In [11]:
rf = RandomForestClassifier(max_depth= 15, random_state= 100, max_features= "log2")
model = rf.fit(xtrain, ytrain)

In [12]:
predict= model.predict(xtrain)
accuracy_score(ytrain, predict)

0.9460107589603992

In [13]:
pred= model.predict(xtest)
accuracy_score(ytest, pred)

0.854087683484009

In [78]:
parameters= {"n_estimators":[10, 50, 100, 200], "max_depth":[8, 10, 15, 20]}
rf1= RandomForestClassifier(random_state= 100)

In [79]:
grid= GridSearchCV(rf1, parameters, cv=5, refit=True, n_jobs=4, scoring="accuracy")

In [16]:
model2= grid.fit(xtrain, ytrain)

In [17]:
model2.best_score_

0.8696284641235273

In [18]:
model2.best_params_

{'max_depth': 20, 'n_estimators': 100}

In [19]:
pred2= model2.predict(xtest)
accuracy_score(ytest, pred2)

0.8834451249149412

In [20]:
parameters2= {"n_estimators":[50, 100, 200], "max_depth":[10, 15, 20],
            "min_samples_split":list(range(2,7))}
grid2= GridSearchCV(rf1, parameters2, cv=5, refit=True, n_jobs=4, scoring="accuracy")

In [21]:
model3= grid2.fit(xtrain, ytrain)



In [22]:
model3.best_score_

0.8696284641235273

In [23]:
model3.best_params_

{'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}

In [24]:
pred3= model3.predict(xtest)
accuracy_score(ytest, pred3)

0.8834451249149412

<font color="darkpink">loading standardized data</font> 

In [25]:
xtrain_norm= pd.read_csv("Data/x_train_norm.csv")
xtest_norm= pd.read_csv("Data/x_test_norm.csv")
y_train= pd.read_csv("Data/y_train.csv")
y_test= pd.read_csv("Data/y_test.csv")

In [26]:
xtrain_norm

Unnamed: 0,DISAPPROVALS_BL,DISAPPROVALS_EN,DISAPPROVALS_FM,DISAPPROVALS_OM,NOTE_BL,NOTE_EN,NOTE_FM,NOTE_OM,ENGLISH,H_CLASS_PRES,ONLINE_TASKS,ABSENCES
0,-0.250308,-0.250198,-0.293156,-0.293808,-0.212294,-0.877855,-0.551592,-0.135847,0.800015,-0.557012,-0.886794,1.512318
1,-0.250308,-0.250198,-0.293156,-0.293808,-0.313401,-0.251804,-0.631016,-0.292352,0.800015,-1.117204,-0.886794,1.512318
2,1.917495,1.927500,-0.293156,-0.293808,-3.144408,-2.996797,-0.273605,-0.448858,0.800015,-0.930473,1.180295,1.512318
3,1.917495,1.927500,1.631678,1.701063,-3.144408,-2.996797,-2.497497,-2.952943,-1.385902,-1.117204,-1.920339,-0.330399
4,-0.250308,-0.250198,-0.293156,-0.293808,0.444904,0.470562,0.004381,0.698848,-1.385902,-0.743743,1.180295,-0.330399
...,...,...,...,...,...,...,...,...,...,...,...,...
33080,-0.250308,-0.250198,-0.293156,-0.293808,0.784377,0.655919,-0.341944,0.714010,0.177260,-0.503813,1.180295,-0.330399
33081,-0.250308,-0.250198,-0.293156,-0.293808,0.863443,1.192929,1.276570,-0.135847,-0.597996,1.377602,1.180295,-0.944638
33082,-0.250308,-0.250198,-0.293156,-0.293808,0.494578,0.423242,0.878399,0.491536,-1.385902,1.683757,0.663522,-0.536926
33083,-0.250308,-0.250198,-0.293156,-0.293808,0.882537,0.907402,1.056171,1.236893,0.800015,-0.292937,-0.672747,-0.944638


In [37]:
xtest_norm

Unnamed: 0,DISAPPROVALS_BL,DISAPPROVALS_EN,DISAPPROVALS_FM,DISAPPROVALS_OM,NOTE_BL,NOTE_EN,NOTE_FM,NOTE_OM,ENGLISH,H_CLASS_PRES,ONLINE_TASKS,ABSENCES
0,-0.250308,-0.250198,-0.293156,-0.293808,-0.262848,-0.059173,-0.591304,-0.396689,0.800015,-0.743743,-0.886794,-0.330399
1,-0.250308,-0.250198,-0.293156,-0.293808,0.394350,0.037142,0.361792,-0.396689,-1.385902,0.750103,0.146750,-0.944638
2,-0.250308,-0.250198,-0.293156,-0.293808,0.849333,0.855824,0.798628,-0.135847,0.800015,0.003180,0.663522,-0.330399
3,-0.250308,-0.250198,-0.293156,-0.293808,0.091028,-0.059173,-0.591304,0.072827,0.800015,-0.930473,-0.886794,0.283840
4,-0.250308,-0.250198,-0.293156,-0.293808,0.242689,0.181616,-0.114756,-0.135847,0.800015,-0.930473,-0.370022,-0.330399
...,...,...,...,...,...,...,...,...,...,...,...,...
3704,-0.250308,-0.250198,-0.293156,-0.293808,-0.414509,-0.637066,-0.114756,0.385837,0.800015,-0.557012,0.663522,-0.330399
3705,-0.250308,-0.250198,-0.293156,-0.293808,-0.717831,-0.251804,-0.194180,-0.553194,0.800015,-0.557012,0.663522,-0.944638
3706,-0.250308,-0.250198,-0.293156,-0.293808,0.343797,-0.396278,0.600067,-0.135847,0.800015,1.497026,-1.920339,-0.330399
3707,-0.250308,-0.250198,-0.293156,-0.293808,0.546011,0.326089,0.679491,0.490174,0.800015,0.563373,1.180295,-0.330399


In [27]:
#y_train = y_train.to_numpy()
y_train = np.ravel(y_train)

In [28]:
xtest_norm.shape

(3709, 12)

In [29]:
y_test= np.ravel(y_test)

In [30]:
model4 = rf.fit(xtrain_norm, y_train)

In [31]:
predict4= model4.predict(xtrain_norm)
accuracy_score(y_train, predict4)

0.9517303914160495

In [32]:
pred4= model4.predict(xtest_norm)
accuracy_score(y_test, pred4)

0.7710973308169318

In [33]:
model5= grid.fit(xtrain_norm, y_train)



In [34]:
model5.best_score_

0.8900710291672965

In [35]:
model5.best_params_

{'max_depth': 20, 'n_estimators': 200}

In [36]:
pred5= model5.predict(xtest_norm)
accuracy_score(y_test, pred5)

0.772984631976274

Standardizing data

In [38]:
minmax= MinMaxScaler()
train_norm= minmax.fit_transform(xtrain)

In [39]:
test_norm= minmax.transform(xtest)

In [40]:
model6= rf.fit(train_norm, ytrain)

In [41]:
predict6= model6.predict(train_norm)
accuracy_score(ytrain, predict6)

0.9459459459459459

In [42]:
pred6= model6.predict(test_norm)
accuracy_score(ytest, pred6)

0.8533100029163021

In [43]:
model7= grid.fit(train_norm, ytrain)

In [44]:
model7.best_score_

0.8700821730812269

In [45]:
pred7= model7.predict(test_norm)
accuracy_score(ytest, pred7)

0.8844172256245747

Loading feature selection data

In [46]:
selection= pd.read_csv("Data/selection_variables.csv")

In [47]:
selection.shape

(18545, 11)

In [48]:
selection

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,TARGET
0,0.0,0.0,0.0,0.0,0.0,6.2,5.8,4.6,6.0,2.0,2.0
1,0.0,0.0,0.0,0.0,0.0,6.0,6.2,5.2,6.0,2.0,2.0
2,0.0,0.0,0.0,0.0,0.0,7.3,6.7,7.1,8.0,5.0,3.0
3,1.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,4.0,1.0
4,1.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,5.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
18540,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,2.0,20.0,1.0
18541,0.0,0.0,0.0,0.0,0.0,7.0,7.9,5.8,6.0,9.0,2.0
18542,0.0,0.0,0.0,0.0,0.0,7.9,7.6,8.3,10.0,8.0,5.0
18543,0.0,0.0,1.0,1.0,1.0,6.3,5.1,0.0,2.0,3.0,1.0


In [49]:
x= selection.iloc[:, 0:10]
y= selection["TARGET"]

In [50]:
features_train, features_test, target_train, target_test= train_test_split(x, y, 
                                                                           test_size= 0.25, 
                                                                           random_state= 100) 

In [60]:
features_train, target_train= smote.fit_resample(features_train, target_train)
features_train.shape

(30890, 10)

In [63]:
model8= rf.fit(features_train, target_train)

In [64]:
predict8= model8.predict(features_train)
accuracy_score(target_train, predict8)

0.9272256393654904

In [65]:
pred8= model8.predict(features_test)
accuracy_score(target_test, pred8)

0.7815397886564589

In [66]:
confusion_matrix(target_test, pred8)

array([[ 997,  356,   42,    2,    0],
       [ 207, 1802,   12,   27,    3],
       [   6,    0,  671,   88,   56],
       [   0,   37,   67,   52,   60],
       [   0,    2,   17,   31,  102]])

In [80]:
model9= grid.fit(features_train, target_train)

In [81]:
model9.best_score_

0.8254127549368728

In [82]:
model9.best_params_

{'max_depth': 15, 'n_estimators': 100}

In [83]:
pred9= model9.predict(features_test)
accuracy_score(target_test, pred9)

0.7009263773768893

In [84]:
confusion_matrix(target_test, pred9)

array([[1537,  450,   58,    6,    0],
       [ 207, 1802,   12,   27,    3],
       [  10,    0, 1586,  315,  140],
       [   1,  162,  498,  781,  609],
       [   0,    8,  101,  460, 1482]])

In [85]:
predict10= model9.predict(features_train)
accuracy_score(target_train, predict10)

0.9272256393654904

In [86]:
confusion_matrix(target_train, predict10)

array([[5311,  721,  139,    6,    1],
       [ 301, 5847,   23,    7,    0],
       [   0,    0, 5944,  111,  123],
       [   0,   64,  399, 5472,  243],
       [   0,    3,   38,   69, 6068]])

In [87]:
features_test, target_test= smote.fit_resample(features_test, target_test)

In [88]:
pred10= model9.predict(features_test)
accuracy_score(target_test, pred10)

0.7009263773768893

In [89]:
confusion_matrix(target_test, pred10)

array([[1537,  450,   58,    6,    0],
       [ 207, 1802,   12,   27,    3],
       [  10,    0, 1586,  315,  140],
       [   1,  162,  498,  781,  609],
       [   0,    8,  101,  460, 1482]])

In [90]:
result = pd.DataFrame(model9.cv_results_)
result = result[['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
                'mean_test_score', 'std_test_score', 'rank_test_score']]

In [105]:
result.sort_values(['rank_test_score'])

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,mean_test_score,std_test_score,rank_test_score
10,1.877491,0.041201,0.096385,0.001479,0.825413,0.013469,1
11,3.689196,0.165283,0.184894,0.008004,0.825024,0.013697,2
9,0.94119,0.018515,0.050104,0.001039,0.822402,0.012796,3
8,0.194316,0.003659,0.012461,0.000177,0.80887,0.012,4
7,2.864758,0.057672,0.145062,0.001853,0.760117,0.0042,5
6,1.432567,0.032504,0.074129,0.000997,0.759599,0.004369,6
5,0.721505,0.015209,0.038395,0.000584,0.759437,0.003944,7
4,0.150899,0.004861,0.010013,0.000163,0.751117,0.005641,8
3,2.463825,0.046303,0.127467,0.001514,0.72988,0.00673,9
2,1.238926,0.026677,0.06534,0.001195,0.729168,0.006554,10
