In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [31]:
train = pd.read_csv("../../data/train_data.csv")
test = pd.read_csv("../../data/test_data.csv")

In [32]:
train

Unnamed: 0,ID,parents,has_nurs,form,children,housing,finance,social,health,app_status
0,1,usual,less_proper,complete,3,critical,convenient,problematic,not_recom,0
1,2,pretentious,very_crit,completed,1,convenient,inconv,nonprob,not_recom,0
2,3,pretentious,proper,incomplete,1,less_conv,convenient,slightly_prob,priority,1
3,4,great_pret,improper,complete,1,convenient,convenient,nonprob,recommended,1
4,5,great_pret,less_proper,completed,1,convenient,convenient,slightly_prob,priority,1
...,...,...,...,...,...,...,...,...,...,...
10363,10364,usual,improper,incomplete,3,less_conv,convenient,problematic,recommended,1
10364,10365,great_pret,less_proper,incomplete,3,convenient,inconv,nonprob,not_recom,0
10365,10366,pretentious,proper,complete,2,less_conv,inconv,problematic,priority,1
10366,10367,pretentious,very_crit,completed,1,less_conv,convenient,slightly_prob,not_recom,0


In [38]:
train.columns

Index(['ID', 'parents', 'has_nurs', 'form', 'children', 'housing', 'finance',
       'social', 'health', 'app_status'],
      dtype='object')

In [51]:
def preprocess(df,train=False):
    df.loc[df.form == "completed","form"] = "complete"
    str_cols = ['parents', 'has_nurs', 'form', 'children', 'housing', 'finance','social', 'health']
    result = pd.concat([pd.DataFrame({"ID":df.ID.values.tolist()}),
           pd.get_dummies(df.loc[:,str_cols])],axis=1)
    result = result if not train else pd.concat([result,
                                         pd.DataFrame({"app_status":df.app_status.values.tolist()})],axis=1)
    return result

In [52]:
ptrain = preprocess(train,train=True)
ptest = preprocess(test,train=False)

In [53]:
train.loc[train.form == "completed","form"] = "complete"

In [54]:
pd.concat([pd.DataFrame({"ID":train.ID.values.tolist()}),
           pd.get_dummies(train.loc[:,str_cols]),
           pd.DataFrame({"app_status":train.app_status.values.tolist()})],axis=1)

Unnamed: 0,ID,parents_great_pret,parents_pretentious,parents_usual,has_nurs_critical,has_nurs_improper,has_nurs_less_proper,has_nurs_proper,has_nurs_very_crit,form_complete,...,housing_less_conv,finance_convenient,finance_inconv,social_nonprob,social_problematic,social_slightly_prob,health_not_recom,health_priority,health_recommended,app_status
0,1,0,0,1,0,0,1,0,0,1,...,0,1,0,0,1,0,1,0,0,0
1,2,0,1,0,0,0,0,0,1,1,...,0,0,1,1,0,0,1,0,0,0
2,3,0,1,0,0,0,0,1,0,0,...,1,1,0,0,0,1,0,1,0,1
3,4,1,0,0,0,1,0,0,0,1,...,0,1,0,1,0,0,0,0,1,1
4,5,1,0,0,0,0,1,0,0,1,...,0,1,0,0,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10363,10364,0,0,1,0,1,0,0,0,0,...,1,1,0,0,1,0,0,0,1,1
10364,10365,1,0,0,0,0,1,0,0,0,...,0,0,1,1,0,0,1,0,0,0
10365,10366,0,1,0,0,0,0,1,0,1,...,1,0,1,0,1,0,0,1,0,1
10366,10367,0,1,0,0,0,0,0,1,1,...,1,1,0,0,0,1,1,0,0,0


In [80]:
X = ptrain.drop(["app_status","ID"],axis=1)
y = ptrain.app_status
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,train_size=0.65,test_size=0.35)

In [81]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((6739, 26), (3629, 26), (6739,), (3629,))

In [82]:
rf = RandomForestClassifier().fit(X_train,y_train)

In [83]:
y_pred = rf.predict(X_test)

In [84]:
from sklearn.metrics import classification_report

In [85]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2311
           1       0.99      0.98      0.98      1318

    accuracy                           0.99      3629
   macro avg       0.99      0.98      0.99      3629
weighted avg       0.99      0.99      0.99      3629



In [86]:
from sklearn.metrics import roc_curve, auc
fpr, tpr, thresholds = roc_curve(y_test,y_pred)
auc(fpr,tpr)

0.9847780523182326

In [98]:
for i,v in enumerate(rf.feature_importances_):
    print(f"Feature {X.columns[i]} importance {v}")

Feature parents_great_pret importance 0.06191131524795117
Feature parents_pretentious importance 0.02320848661529852
Feature parents_usual importance 0.04365425215775274
Feature has_nurs_critical importance 0.05176963458324169
Feature has_nurs_improper importance 0.02997243854164249
Feature has_nurs_less_proper importance 0.05211561664591287
Feature has_nurs_proper importance 0.05733959383060021
Feature has_nurs_very_crit importance 0.09140989523664853
Feature form_complete importance 0.014010414513133798
Feature form_foster importance 0.01192704513181296
Feature form_incomplete importance 0.007581913506217928
Feature children_1 importance 0.020578615449988832
Feature children_2 importance 0.011613553006665405
Feature children_3 importance 0.011624089820462718
Feature children_more importance 0.011675141520424055
Feature housing_convenient importance 0.022180242875515427
Feature housing_critical importance 0.017942042120596868
Feature housing_less_conv importance 0.01037050261784327
Fe

In [88]:
tX = ptest.drop(["ID"],axis=1)
tpred = rf.predict(tX)

In [89]:
tpred

array([1, 1, 1, ..., 0, 0, 0])

In [93]:
sample_sub = pd.read_csv("../../data/sampleSubmissionFile.csv")
sample_sub["app_status"] = tpred
sample_sub

Unnamed: 0,ID,app_status
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1
...,...,...
2587,2588,0
2588,2589,0
2589,2590,0
2590,2591,0


In [95]:
from datetime import datetime
sample_sub.to_csv(f"rf-attempt-{datetime.now()}.csv",index=False)