In [1]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import StratifiedShuffleSplit

In [3]:
#reading data
dataR2 = pd.read_csv('dataR2.csv')

#seperating into predictors and classes
X = dataR2.iloc[:,0:-1]
y = dataR2.iloc[:,-1]
#splitting into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=123)



In [4]:
#Random forest model def
def rf_model(xtrain,ytrain):
    clf = RandomForestClassifier(n_estimators=200, max_depth=1,random_state=0)
    clf.fit(xtrain,ytrain)
    return clf

In [5]:
#Accuracy score with all the features
clf = RandomForestClassifier(n_estimators=200, max_depth=1,random_state=0)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_train)
print("Confusion Matrix")
print(confusion_matrix(y_train, y_pred))
print("Accuracy score: %f" %(accuracy_score(y_train, y_pred)))
print('-------------------------------------------------------')
target_names = ['Controls', 'Patients']
print(classification_report(y_train, y_pred, target_names=target_names))

y_pred = clf.predict(X_test)
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))
print("Accuracy score: %f" %(accuracy_score(y_test, y_pred)))
print('-------------------------------------------------------')
target_names = ['Controls', 'Patients']
print(classification_report(y_test, y_pred, target_names=target_names))




Confusion Matrix
[[20 11]
 [ 1 36]]
Accuracy score: 0.823529
-------------------------------------------------------
              precision    recall  f1-score   support

    Controls       0.95      0.65      0.77        31
    Patients       0.77      0.97      0.86        37

   micro avg       0.82      0.82      0.82        68
   macro avg       0.86      0.81      0.81        68
weighted avg       0.85      0.82      0.82        68

Confusion Matrix
[[10 10]
 [ 2 24]]
Accuracy score: 0.739130
-------------------------------------------------------
              precision    recall  f1-score   support

    Controls       0.83      0.50      0.62        20
    Patients       0.71      0.92      0.80        26

   micro avg       0.74      0.74      0.74        46
   macro avg       0.77      0.71      0.71        46
weighted avg       0.76      0.74      0.72        46



In [6]:
# StratifiedShuffleSplit
# to find important features in different subsets
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
for train, test in sss.split(X_train, y_train):
    model=rf_model(X_train.iloc[train],y_train.iloc[train])
    #model.fit(X_train.iloc[train],y_train.iloc[train])
    score=pd.DataFrame(model.feature_importances_,index=X_train.columns,columns=['importance'])
    score=score.sort_values('importance', ascending=False) 
    top4=pd.DataFrame(score.index[:5])
    print(top4)
    model2 = rf_model(X_train[top4[0]],y_train)
    y_pred = model2.predict(X_train[top4[0]])
    
    print('-------------------Training Set -----------------------')
    
    print("Confusion Matrix")
    print(confusion_matrix(y_train, y_pred))
    print("Accuracy score: %f" %(accuracy_score(y_train, y_pred)))
    #print('-------------------------------------------------------')
    model2 = rf_model(X_test[top4[0]],y_test)
    y_pred = model2.predict(X_test[top4[0]])
    
    print('-------------------Test Set -----------------------')
    #print('-------------------------------------------------------')
    print("Confusion Matrix")
    print(confusion_matrix(y_test, y_pred))
    print("Accuracy score: %f" %(accuracy_score(y_test, y_pred)))
    print('-------------------------------------------------------')
    print('-------------------------------------------------------')
    
    

             0
0     Resistin
1         HOMA
2  Adiponectin
3          BMI
4      Insulin
-------------------Training Set -----------------------
Confusion Matrix
[[18 13]
 [ 7 30]]
Accuracy score: 0.705882
-------------------Test Set -----------------------
Confusion Matrix
[[18  2]
 [ 7 19]]
Accuracy score: 0.804348
-------------------------------------------------------
-------------------------------------------------------
          0
0  Resistin
1   Glucose
2       Age
3     MCP.1
4      HOMA
-------------------Training Set -----------------------
Confusion Matrix
[[19 12]
 [ 4 33]]
Accuracy score: 0.764706
-------------------Test Set -----------------------
Confusion Matrix
[[18  2]
 [ 3 23]]
Accuracy score: 0.891304
-------------------------------------------------------
-------------------------------------------------------
          0
0  Resistin
1       BMI
2   Insulin
3      HOMA
4       Age
-------------------Training Set -----------------------
Confusion Matrix
[[18 13]


In [7]:
#Selected Features
clf=rf_model(X_train[['Age', 'Resistin', 'Glucose', 'HOMA']],y_train)
y_pred = clf.predict(X_train[['Age', 'Resistin', 'Glucose', 'HOMA']])
print("Confusion Matrix")
print(confusion_matrix(y_train, y_pred))
print("Accuracy score: %f" %(accuracy_score(y_train, y_pred)))
print('-------------------------------------------------------')
target_names = ['Controls', 'Patients']
print(classification_report(y_train, y_pred, target_names=target_names))

y_pred = clf.predict(X_test[['Age', 'Resistin', 'Glucose', 'HOMA']])
print("Confusion Matrix")
print(confusion_matrix(y_test, y_pred))
print("Accuracy score: %f" %(accuracy_score(y_test, y_pred)))
print('-------------------------------------------------------')
target_names = ['Controls', 'Patients']
print(classification_report(y_test, y_pred, target_names=target_names))



Confusion Matrix
[[20 11]
 [ 4 33]]
Accuracy score: 0.779412
-------------------------------------------------------
              precision    recall  f1-score   support

    Controls       0.83      0.65      0.73        31
    Patients       0.75      0.89      0.81        37

   micro avg       0.78      0.78      0.78        68
   macro avg       0.79      0.77      0.77        68
weighted avg       0.79      0.78      0.77        68

Confusion Matrix
[[12  8]
 [ 2 24]]
Accuracy score: 0.782609
-------------------------------------------------------
              precision    recall  f1-score   support

    Controls       0.86      0.60      0.71        20
    Patients       0.75      0.92      0.83        26

   micro avg       0.78      0.78      0.78        46
   macro avg       0.80      0.76      0.77        46
weighted avg       0.80      0.78      0.77        46



In [9]:
#final test
finaltest = pd.read_csv('final_test.csv')
X_final=finaltest.iloc[:,0:-1]
y_final=finaltest.iloc[:,-1]

y_pred = clf.predict(X_final[['Age', 'Resistin', 'Glucose', 'HOMA']])
print("predicting from a seperate test set")
print("Confusion Matrix")
print("True Value ")
print(y_final.values)
print("Predicted Value")
print( y_pred)

predicting from a seperate test set
Confusion Matrix
True Value 
[2 1]
Predicted Value
[2 2]
