In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the table from "gene_data.csv" (first row = header, first column = index)
# and transpose it, so that what were the CSV's column headers become the row
# index (the sample IDs parsed for labels below) and the CSV's row labels
# become the columns.
data = pd.read_csv("gene_data.csv", header=0, index_col=0).T

# log(1 + x) transform. log1p is the numerically accurate form of log(x + 1)
# for small x and maps 0 to exactly 0.
data = np.log1p(data)

# Binary target parsed from the grade token that follows the LAST underscore
# of each sample ID (e.g. "SC15-615_HG" -> "HG"). Splitting from the right
# keeps this correct even if the ID part itself contains an underscore.
GRADE_TO_LABEL = {"LG": 0, "HG": 1}
grades = [sample_id.rsplit("_", 1)[-1] for sample_id in data.index]

# Fail loudly on an unexpected suffix instead of silently labelling it as
# class 1, which would corrupt the target without any visible symptom.
unknown_grades = sorted(set(grades) - set(GRADE_TO_LABEL))
if unknown_grades:
    raise ValueError(
        "Unrecognised grade suffix(es) in sample IDs: {}".format(unknown_grades)
    )

# Kept as a plain list, in the same order as the rows of `data`.
y = [GRADE_TO_LABEL[grade] for grade in grades]

In [3]:
# Display the column labels of `data` as a quick check of what the columns are after the transpose.
data.columns

Index(['CALML3', 'PIK3R2', 'IL11RA', 'BAMBI', 'PLA2G10', 'ETV1', 'POLD4',
       'NFKBIZ', 'SMAD2', 'IFNG',
       ...
       'VPS33B', 'MRPS5', 'NUBP1', 'CNOT4', 'COG7', 'CC2D1B', 'GPATCH3',
       'ZNF143', 'EDC3', 'TTC31'],
      dtype='object', name='Name', length=784)

In [4]:
# Repeat a hold-out evaluation over several train/test splits to see how much
# the test accuracy depends on which samples end up in the test set.
random_states = range(11)

# Test-set accuracy of the forest for each random state, in iteration order.
acc_rfc = []

for rstate in random_states:
    # 80/20 split; `rstate` fixes which samples are held out.
    X_train, X_test, y_train, y_test = train_test_split(
        data, y, test_size=0.2, random_state=rstate
    )

    # Random forest with 100 trees. The forest is seeded as well as the split:
    # without `random_state` the fitted trees differ on every run, so neither
    # the accuracies nor the `rfc` left over after the loop are reproducible.
    rfc = RandomForestClassifier(n_estimators=100, random_state=rstate)
    rfc.fit(X_train, y_train)

    # Predict once and derive the accuracy from those predictions, rather than
    # letting `rfc.score` re-predict the test set two more times.
    y_pred = rfc.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    acc_rfc.append(acc)

    print('Random State: ' + str(rstate))
    print('Accuracy of RFC classifier on test set: {:.2f}'.format(acc))
    # Per-sample view: sample ID, true label, predicted label.
    print(pd.DataFrame(zip(X_test.index, y_test, y_pred), columns=["Sample", "Target", "Predicted"]))
    print()
    print()

    

Random State: 0
Accuracy of RFC classifier on test set: 1.00
         Sample  Target  Predicted
0   SC15-615_HG       1          1
1  SO13-7742_LG       0          0
2  SC14-3857_LG       0          0
3   SC14-469_HG       1          1
4  SO13-7790_HG       1          1
5  SC14-3805_LG       0          0
6  SC14-2414_LG       0          0
7  SO11-5496_LG       0          0


Random State: 1
Accuracy of RFC classifier on test set: 0.88
         Sample  Target  Predicted
0   SC14-469_HG       1          1
1   SC15-584_HG       1          1
2   SC14-299_HG       1          1
3  SO12-5870_LG       0          0
4  SO13-7554_LG       0          0
5  SC14-4015_LG       0          0
6  CO14-3067_HG       1          0
7  SO14-1358_HG       1          1


Random State: 2
Accuracy of RFC classifier on test set: 1.00
         Sample  Target  Predicted
0   SC15-349_LG       0          0
1  SC08-3655_LG       0          0
2   SC14-299_HG       1          1
3   SC14-469_HG       1          1
4  SO15-

In [5]:
# Pair every column of `data` with its importance in `rfc`.
# NOTE: `rfc` is whatever model the random-state loop assigned last, so these
# importances describe that single fit only, not an average over all splits.
importance_rows = list(zip(data.columns, rfc.feature_importances_))
pd.DataFrame(importance_rows, columns=["Gene Name", "Importance"])


Unnamed: 0,Gene Name,Importance
0,CALML3,0.008444
1,PIK3R2,0.000000
2,IL11RA,0.000000
3,BAMBI,0.000000
4,PLA2G10,0.000000
...,...,...
779,CC2D1B,0.008940
780,GPATCH3,0.000000
781,ZNF143,0.000000
782,EDC3,0.000000


In [6]:
# One row per evaluated split: the random state and the test accuracy it produced.
accuracy_rows = list(zip(random_states, acc_rfc))
pd.DataFrame(accuracy_rows, columns=["Random State", "RFC Accuracy"])

Unnamed: 0,Random State,RFC Accuracy
0,0,1.0
1,1,0.875
2,2,1.0
3,3,1.0
4,4,1.0
5,5,0.875
6,6,1.0
7,7,1.0
8,8,0.875
9,9,1.0


In [7]:
# Mean test accuracy over all evaluated splits stored in `acc_rfc`.
mean_accuracy = sum(acc_rfc) / len(acc_rfc)
print('avg acc of rfc of rnd state 0-10: {:.2f}'.format(mean_accuracy))
print()

avg acc of rfc of rnd state 0-10: 0.97



<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=26a52daa-3781-4303-b6d1-419771ff77cb' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>