In [1]:
import os
import re
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

In [2]:
#Setting your path

key_word_path='./key_word.txt'
train_path='./Data/Train_Textual/'   
valid_path='./Data/Validation/'   

In [3]:
key_word=[]
with open(key_word_path,'r') as f:
    for j in f:
        key_word.append(j.replace('\n',''))

In [4]:
#Define columns name 
columns_name=[]
for i in key_word:
    columns_name.append(i)

<h3>Ceate new data</h3>

In [5]:
#Train set
new_data=np.zeros(shape=(400,len(key_word)),dtype=int)
train_df=pd.DataFrame(data=new_data,columns=columns_name)
label=[]
cnt=0

for i in os.listdir(train_path):
    with open(train_path+str(i),'r') as f:
        for j in f:
            h=j.lower().replace('\n','').split(' ')
            for s in h:
                s=re.sub(r'[^\w\s]','',s)
                if s in key_word:
                    train_df.iloc[cnt,key_word.index(s)]=1
    if cnt<200:
        label.append(0)
    else:
        label.append(1)
    f.close()
    cnt+=1
train_df['label']=label

In [6]:
#Validation set
new_data=np.zeros(shape=(50,len(key_word)),dtype=int)
valid_df=pd.DataFrame(data=new_data,columns=columns_name)
label=[]
cnt=0

for i in os.listdir(valid_path):
    with open(valid_path+str(i),'r') as f:
        for j in f:
            h=j.lower().replace('\n','').split(' ')
            for s in h:
                s=re.sub(r'[^\w\s]','',s)
                if s in key_word:
                    valid_df.iloc[cnt,key_word.index(s)]=1
    cnt+=1

print("Train set shape : ",train_df.shape,", Validation set shape : ",valid_df.shape)

Train set shape :  (400, 84) , Validation set shape :  (50, 83)


<h3>Split data for training and testing</h3>

In [7]:
X = train_df.drop(columns='label')
y = train_df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

<h3>Start training </h3>
<h4>Use GridSearch method to tune model <h4>

In [8]:
param_grid = { 
    'n_estimators': [10, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [20, 25, 30, 40, 50],
    'criterion' :['gini', 'entropy']
}

rfc=RandomForestClassifier(random_state=31)
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5, scoring='f1')
CV_rfc.fit(X_train, y_train)

print("Best parameters set found on development set:")
print()
print(CV_rfc.best_params_)
print()
print("Grid scores on development set:")
print()
means = CV_rfc.cv_results_['mean_test_score']
for mean, params in zip(means, CV_rfc.cv_results_['params']):
    print("Accuracy = %0.3f for %r" % (mean, params))

test_pred = CV_rfc.predict(X_test)
print(classification_report(y_test, test_pred))
print(roc_auc_score(y_test, CV_rfc.predict_proba(X_test)[:, 1]))
print(f1_score(y_test, test_pred))

Best parameters set found on development set:

{'criterion': 'entropy', 'max_depth': 25, 'max_features': 'log2', 'n_estimators': 10}

Grid scores on development set:

Accuracy = 0.353 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 10}
Accuracy = 0.356 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 100}
Accuracy = 0.352 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 200}
Accuracy = 0.350 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 500}
Accuracy = 0.353 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 10}
Accuracy = 0.356 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 100}
Accuracy = 0.352 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 200}
Accuracy = 0.350 for {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'n_estimators': 

<h3>Predict validation set</h3>

In [9]:
pred = CV_rfc.predict(valid_df)
pred_df = pd.read_csv('./sample_submission.csv') 
pred_df['Obesity'] = pred
# pred_df.to_csv('./sub.csv',index=False)
pred_df.head()

Unnamed: 0,Filename,Obesity
0,ID_1159.txt,0
1,ID_1160.txt,0
2,ID_1162.txt,0
3,ID_1167.txt,0
4,ID_1168.txt,1
