## Setting up

In [1]:
#Import packages

import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split


data1 = pd.read_csv("rao_nhek_10kb.tads.boundary.4mer.features.csv")
data2 = pd.read_csv("rao_imr90.tads.boundary.4mer.features.csv")
data3 = pd.read_csv("dixon_h1esc.tads.boundary.4mer.features.csv")

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
#data.info()

In [2]:
#Take out the unneeded row
i = data1[(data1.AAAA == 'AAAA')].index
data1 = data1.drop(i)

i = data2[(data2.AAAA == 'AAAA')].index
data2 = data2.drop(i)

i = data3[(data3.AAAA == 'AAAA')].index
data3 = data3.drop(i)

In [3]:
#Indicate features and class label
x1 = data1.iloc[:,1:-1].apply(pd.to_numeric,errors='coerce')
y1 = data1["class"].astype('int')

x2 = data2.iloc[:,1:-1].apply(pd.to_numeric,errors='coerce')
y2 = data2["class"].astype('int')

x3 = data3.iloc[:,1:-1].apply(pd.to_numeric,errors='coerce')
y3 = data3["class"].astype('int')

In [4]:
#Split data into training (80%) and testing (testing)
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.20, random_state=12)

x2_train, x2_test, y2_train, y2_test = train_test_split(x2, y2, test_size=0.20, random_state=12)

x3_train, x3_test, y3_train, y3_test = train_test_split(x3, y3, test_size=0.20, random_state=12)

In [5]:
#Scale the features to center zero mean and unit standard deviation
ss = StandardScaler()
x1_train = ss.fit_transform(x1_train)
x1_test = ss.transform(x1_test)

x2_train = ss.fit_transform(x2_train)
x2_test = ss.transform(x2_test)

x3_train = ss.fit_transform(x3_train)
x3_test = ss.transform(x3_test)

## Classic random forest model

In [6]:
#Build the random forest model
model1 = RandomForestClassifier()
model1.fit(x1_train, y1_train)

model2 = RandomForestClassifier()
model2.fit(x2_train, y2_train)

model3 = RandomForestClassifier()
model3.fit(x3_train, y3_train)

RandomForestClassifier()

## Random forest model with 10-fold cross validation

In [7]:
from sklearn.model_selection import cross_val_score
train_score1 = cross_val_score(model1, x1_train, y1_train, scoring = 'accuracy', cv=10)
test_score1 = cross_val_score(model1, x1_test, y1_test, scoring = 'accuracy', cv=10)

print("Model 1:")
print(train_score1)
print(test_score1)

Model 1:
[0.59108527 0.60658915 0.59987072 0.58758888 0.57983193 0.60245637
 0.58629606 0.60180995 0.59405301 0.59922431]
[0.59689922 0.64341085 0.60981912 0.61498708 0.61757106 0.5994832
 0.64341085 0.625323   0.59067358 0.61917098]


In [9]:
from sklearn.model_selection import cross_val_score
train_score2 = cross_val_score(model2, x2_train, y2_train, scoring = 'accuracy', cv=10)
test_score2 = cross_val_score(model2, x2_test, y2_test, scoring = 'accuracy', cv=10)

print("Model 2:")
print(train_score2)
print(test_score2)

Model 2:
[0.61229508 0.60245902 0.63412633 0.62920427 0.62100082 0.62346185
 0.61033634 0.61936013 0.62264151 0.609516  ]
[0.60327869 0.59344262 0.66557377 0.60983607 0.62295082 0.56393443
 0.60983607 0.56393443 0.62171053 0.53618421]


In [12]:
from sklearn.model_selection import cross_val_score
train_score3 = cross_val_score(model3, x3_train, y3_train, scoring = 'accuracy', cv=10)
test_score3 = cross_val_score(model3, x3_test, y3_test, scoring = 'accuracy', cv=10)

print("Model 3:")
print(train_score3)
print(test_score3)

Model 3:
[0.60431655 0.65467626 0.62829736 0.56971154 0.63942308 0.57692308
 0.62259615 0.625      0.60817308 0.58413462]
[0.65714286 0.58653846 0.61538462 0.55769231 0.58653846 0.64423077
 0.61538462 0.59615385 0.56730769 0.66346154]


In [14]:
y1_pred = model1.predict(x1_test)
y2_pred = model2.predict(x2_test)
y3_pred = model3.predict(x3_test)

## Further tuning the model’s hyperparameters with GridSearchCV

In [15]:
model_param = {
    'n_estimators': [50, 100, 150],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

In [18]:
clf1 = GridSearchCV(model1, model_param, scoring='accuracy', cv=10)
mod1 = clf1.fit(x1, y1)

#Print out the the best parameter and score
params = clf1.best_params_
print("Best Parameters: ", params)

score = clf1.best_score_
print("Best Score: ", score)

Best Parameters:  {'bootstrap': False, 'criterion': 'gini', 'n_estimators': 150}
Best Score:  0.6084798345398139


In [21]:
clf2 = GridSearchCV(model2, model_param, scoring='accuracy', cv=10)
mod2 = clf2.fit(x2, y2)

#Print out the the best parameter and score
params = clf2.best_params_
print("Best Parameters: ", params)

score = clf2.best_score_
print("Best Score: ", score)

Best Parameters:  {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 150}
Best Score:  0.6210629921259843


In [23]:
clf3 = GridSearchCV(model3, model_param, scoring='accuracy', cv=10)
mod3 = clf3.fit(x3, y3)

#Print out the the best parameter and score
params = clf3.best_params_
print("Best Parameters: ", params)

score = clf3.best_score_
print("Best Score: ", score)

Best Parameters:  {'bootstrap': True, 'criterion': 'entropy', 'n_estimators': 150}
Best Score:  0.6216270485752251


## Accuracy, AUC (area under the curve) score, Precision, Recall, F1 score

In [24]:
#Accuracy, Precision, Recall, F1 score
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score

print("Model 1:")
print("Accuracy: ", accuracy_score(y1_test, y1_pred))
print("AUC: ", roc_auc_score(y1_test, y1_pred))
print("Precision: ", precision_score(y1_test, y1_pred))
print("Recall: ", recall_score(y1_test, y1_pred))
print("F1 score: ", f1_score(y1_test, y1_pred))

print("Model 2:")
print("Accuracy: ", accuracy_score(y2_test, y2_pred))
print("AUC: ", roc_auc_score(y2_test, y2_pred))
print("Precision: ", precision_score(y2_test, y2_pred))
print("Recall: ", recall_score(y2_test, y2_pred))
print("F1 score: ", f1_score(y2_test, y2_pred))

print("Model 3:")
print("Accuracy: ", accuracy_score(y3_test, y3_pred))
print("AUC: ", roc_auc_score(y3_test, y3_pred))
print("Precision: ", precision_score(y3_test, y3_pred))
print("Recall: ", recall_score(y3_test, y3_pred))
print("F1 score: ", f1_score(y3_test, y3_pred))

Model 1:
Accuracy:  0.6080661840744571
AUC:  0.6084059953085404
Precision:  0.6235480464625132
Recall:  0.5952620967741935
F1 score:  0.6090768437338834
Model 2:
Accuracy:  0.6128608923884514
AUC:  0.6128205846059089
Precision:  0.6042356055592323
Recall:  0.6107023411371237
F1 score:  0.607451763140386
Model 3:
Accuracy:  0.5936599423631124
AUC:  0.5936925564192855
Precision:  0.5972495088408645
Recall:  0.5823754789272031
F1 score:  0.5897187196896218


## Save the trained model to a file

In [26]:
import pickle 
filename1 = 'rao_nhek.pickle'
pickle.dump(model1, open(filename1, 'wb'))

filename2 = 'rao_imr90.pickle'
pickle.dump(model2, open(filename2, 'wb'))

filename3 = 'dixon_h1esc.pickle'
pickle.dump(model3, open(filename3, 'wb'))