In [319]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('bmh')

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score

In [320]:
#Load in the gene expression matrix
df = pd.read_csv('gene.tsv', sep='\t', index_col=0)
df

Unnamed: 0_level_0,SRR8477377,SRR8477378,SRR8477379,SRR8477380,SRR8477381,SRR8477383,SRR8477384,SRR8477385,SRR8477386,SRR8477387,...,SRR8477701,SRR8477702,SRR8477703,SRR8477704,SRR8477705,SRR8477706,SRR8477707,SRR8477708,SRR8477709,SRR8477710
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gnai3,6.003730,6.717884,5.879887,6.126242,6.911103,6.647521,6.415873,6.061517,6.073213,6.198332,...,9.713097,12.408170,11.925100,11.796491,11.335134,12.397634,13.405191,9.369862,12.053689,13.110332
Pbsn,0.157460,0.153849,0.154648,0.150461,0.150191,0.149977,0.147436,0.146877,0.147456,0.149704,...,0.324236,0.330234,0.328160,0.311731,0.315161,0.320094,0.319713,0.320536,0.318293,0.316961
Cdc45,1.660414,2.025149,1.638161,2.320365,1.705641,2.326299,2.125218,2.190152,2.139593,1.537633,...,2.460072,3.078061,3.840317,2.857121,3.299295,2.962622,4.512856,3.729228,2.972924,4.147524
H19,1.151994,1.616525,1.570316,0.150461,0.749839,0.149977,1.063427,1.511052,1.439036,1.091345,...,3.725690,3.331363,4.060742,2.495034,2.502888,2.530673,3.038688,0.320536,2.522001,0.316961
Scml2,1.369270,0.153849,1.320171,1.523063,1.658413,1.358042,1.617542,1.463084,2.519570,2.724351,...,2.023284,2.076867,0.328160,1.909401,1.937866,0.320094,4.735587,4.830414,4.890850,5.219754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Gm50415,0.157460,0.153849,0.154648,0.150461,0.150191,0.149977,0.147436,0.146877,0.147456,0.149704,...,0.324236,0.330234,0.328160,0.311731,0.315161,0.320094,0.319713,0.320536,0.318293,0.316961
Vmn1r64,0.157460,0.153849,0.154648,0.150461,0.150191,0.149977,0.147436,0.146877,0.147456,0.149704,...,0.324236,0.330234,0.328160,0.311731,0.315161,0.320094,0.319713,0.320536,0.318293,0.316961
Gm50102,1.112498,1.498023,0.833246,1.126152,0.150191,1.223026,0.147436,1.442259,1.599911,1.464710,...,3.006951,3.073555,0.328160,0.311731,2.944464,2.963304,0.319713,0.320536,2.461793,0.316961
Gm19519,2.272763,1.828033,1.734675,0.150461,0.150191,2.045549,0.147436,1.356502,1.608898,0.972838,...,0.324236,0.330234,0.328160,0.311731,0.315161,0.320094,0.319713,0.320536,0.318293,0.316961


# 2b-2d

In [415]:
df_t = df.T

# get gene variance
gene_variance = df.var(axis=1)

# put gene variance in a dataframe
df_with_var = df
df_with_var['var'] = gene_variance

# get top 5000 varying genes
top5000 = df_with_var.sort_values(by='var', ascending=False).head(5000).drop(columns=['var'])

top5000

Unnamed: 0_level_0,SRR8477377,SRR8477378,SRR8477379,SRR8477380,SRR8477381,SRR8477383,SRR8477384,SRR8477385,SRR8477386,SRR8477387,...,SRR8477701,SRR8477702,SRR8477703,SRR8477704,SRR8477705,SRR8477706,SRR8477707,SRR8477708,SRR8477709,SRR8477710
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Hbb-bs,182.460673,182.460673,182.460673,10.300817,9.965343,10.491651,9.460778,87.196588,86.104768,85.497160,...,182.460673,182.460673,182.460673,182.460673,182.460673,182.460673,182.460673,182.460673,182.460673,182.460673
Hba-a2,173.839732,173.839732,173.839732,8.949210,8.456053,8.672090,7.646339,57.016861,56.495669,55.672795,...,173.839732,173.839732,173.839732,173.839732,173.839732,173.839732,173.839732,173.839732,173.839732,173.839732
Hba-a1,153.765477,153.765477,153.765477,5.218350,5.759153,6.254244,6.211060,32.979796,34.066093,35.038973,...,165.218790,165.218790,165.218790,165.218790,165.218790,165.218790,165.218790,165.218790,165.218790,165.218790
Hbb-bt,140.532415,140.532415,140.532415,2.688350,1.684045,2.031768,2.211419,8.871895,9.204295,8.533221,...,153.765477,153.765477,153.765477,153.765477,153.765477,153.765477,153.765477,153.765477,153.765477,153.765477
Alas2,118.555250,117.359420,118.555250,1.347125,1.843833,1.855703,2.104219,3.923974,4.112073,3.894269,...,143.299492,143.299492,143.299492,143.299492,140.532415,140.532415,140.532415,140.532415,140.532415,140.532415
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Supt4a,4.562386,4.478880,4.652473,4.306918,4.809838,4.488440,4.477741,4.646772,4.722957,4.553462,...,9.572159,8.861361,10.041021,11.006406,10.323273,11.098050,10.398720,9.102967,10.670386,10.654893
Crip2,9.602470,9.367707,9.576681,9.613998,9.283625,8.900150,9.083058,11.467333,11.224610,11.073645,...,3.923522,5.464804,4.639016,1.999098,2.096317,3.766677,3.047850,4.197482,0.318293,3.607195
Pde3b,5.570871,4.067098,5.107865,4.084625,4.370074,5.483263,4.015849,5.279091,4.392500,4.530679,...,7.712258,7.481603,6.178080,7.481603,7.878966,8.696802,10.869067,8.451116,6.791199,7.272540
Cert1,13.922076,13.345410,14.151844,13.903650,14.722905,14.336360,14.161640,12.023313,11.631940,12.816398,...,13.186691,19.313780,12.019641,8.724413,17.704872,16.146562,14.927352,9.247555,12.046822,18.199820


In [416]:
# load in metadata
metadata = pd.read_csv('data/metadata_SRP181622.tsv', sep='\t', index_col=0)

# add target column
top5000 = top5000.T
top5000

stressed = []

for sample in metadata['refinebio_title']:
    if 'UCMS' in sample:
        stressed.append("stressed")
    else:
        stressed.append("unstressed")

top5000['stressed'] = stressed

top5000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Dtx4,Filip1l,Col4a2,Zc3h4,Supt4a,Crip2,Pde3b,Cert1,Tmem63c,stressed
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,9.921208,3.141237,6.806236,11.086213,4.562386,9.602470,5.570871,13.922076,8.104546,unstressed
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,9.715433,2.600299,6.458633,9.763113,4.478880,9.367707,4.067098,13.345410,5.803068,unstressed
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,10.081583,2.922880,8.174101,9.621248,4.652473,9.576681,5.107865,14.151844,5.838867,unstressed
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,7.589135,2.742418,5.841284,10.558317,4.306918,9.613998,4.084625,13.903650,5.727155,unstressed
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,7.766005,2.873571,5.439825,12.828605,4.809838,9.283625,4.370074,14.722905,5.938446,unstressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,3.623752,8.251453,0.320094,7.644928,11.098050,3.766677,8.696802,16.146562,0.320094,stressed
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,3.929960,9.155986,0.319713,16.628795,10.398720,3.047850,10.869067,14.927352,0.319713,stressed
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,6.295562,9.925422,0.320536,8.487642,9.102967,4.197482,8.451116,9.247555,0.320536,stressed
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.327915,9.991246,0.318293,6.689468,10.670386,0.318293,6.791199,12.046822,0.318293,stressed


In [417]:
# split data into training and test sets (80/20)
X = top5000.drop(['stressed'], axis=1)
y = top5000['stressed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [418]:
# Create logistic regression pipeline
logistic_reg = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(penalty='l2',
                               solver='saga',
                               max_iter=5000)
    )
])

logistic_reg

In [419]:
# train the model
logistic_reg.fit(X_train, y_train)

In [420]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.967741935483871
              precision    recall  f1-score   support

    stressed       0.94      1.00      0.97        31
  unstressed       1.00      0.94      0.97        31

    accuracy                           0.97        62
   macro avg       0.97      0.97      0.97        62
weighted avg       0.97      0.97      0.97        62

[[31  0]
 [ 2 29]]


In [428]:
Prediction = {
    'Prediction' : y_pred,
    'Actual': y_test
}

In [434]:
predictions = pd.DataFrame(Prediction)
predictions.to_csv('gabe_prediction_with_actual.csv')
predictions.drop(['Actual'], axis=1).to_csv('gabe_prediction_without_actual.csv')

# 2e

In [409]:
classification = pd.read_csv('Gabe_classification.tsv', sep='\t', index_col=0)
classification

Unnamed: 0,x
SRR8477377,2
SRR8477378,2
SRR8477379,2
SRR8477380,2
SRR8477381,2
...,...
SRR8477706,6
SRR8477707,6
SRR8477708,6
SRR8477709,6


In [410]:
top5000 = top5000.drop(['stressed'], axis=1)
top5000['classification'] = classification
top5000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Dtx4,Filip1l,Col4a2,Zc3h4,Supt4a,Crip2,Pde3b,Tmem63c,Cert1,classification
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,9.921208,3.141237,6.806236,11.086213,4.562386,9.602470,5.570871,8.104546,13.922076,2
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,9.715433,2.600299,6.458633,9.763113,4.478880,9.367707,4.067098,5.803068,13.345410,2
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,10.081583,2.922880,8.174101,9.621248,4.652473,9.576681,5.107865,5.838867,14.151844,2
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,7.589135,2.742418,5.841284,10.558317,4.306918,9.613998,4.084625,5.727155,13.903650,2
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,7.766005,2.873571,5.439825,12.828605,4.809838,9.283625,4.370074,5.938446,14.722905,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,3.623752,8.251453,0.320094,7.644928,11.098050,3.766677,8.696802,0.320094,16.146562,6
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,3.929960,9.155986,0.319713,16.628795,10.398720,3.047850,10.869067,0.319713,14.927352,6
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,6.295562,9.925422,0.320536,8.487642,9.102967,4.197482,8.451116,0.320536,9.247555,6
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.327915,9.991246,0.318293,6.689468,10.670386,0.318293,6.791199,0.318293,12.046822,6


In [411]:
# split data into new training and test sets (80/20)
X = top5000.drop(['classification'], axis=1)
y = top5000['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [412]:
# train the model
logistic_reg.fit(X_train, y_train)

In [413]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00        10
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         1
           9       1.00      1.00      1.00         3
          10       1.00      1.00      1.00         2
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1

    accuracy                           1.00        62
   macro avg       1.00      1.00      1.00        62
weighted avg       1.00      1.00      1.00        62

[[14  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0 

In [414]:
# get area under ROC curve
roc_auc_score(y_test, logistic_reg.predict_proba(X_test), multi_class='ovo')

np.float64(1.0)

In [398]:
samples = y_test.index
samples

Index(['SRR8477384', 'SRR8477656', 'SRR8477546', 'SRR8477390', 'SRR8477541',
       'SRR8477611', 'SRR8477655', 'SRR8477409', 'SRR8477614', 'SRR8477596',
       'SRR8477411', 'SRR8477597', 'SRR8477463', 'SRR8477673', 'SRR8477675',
       'SRR8477680', 'SRR8477634', 'SRR8477551', 'SRR8477396', 'SRR8477670',
       'SRR8477698', 'SRR8477391', 'SRR8477593', 'SRR8477437', 'SRR8477481',
       'SRR8477465', 'SRR8477386', 'SRR8477529', 'SRR8477475', 'SRR8477574',
       'SRR8477414', 'SRR8477573', 'SRR8477590', 'SRR8477549', 'SRR8477495',
       'SRR8477501', 'SRR8477431', 'SRR8477581', 'SRR8477420', 'SRR8477451',
       'SRR8477492', 'SRR8477522', 'SRR8477490', 'SRR8477629', 'SRR8477508',
       'SRR8477615', 'SRR8477570', 'SRR8477533', 'SRR8477429', 'SRR8477641',
       'SRR8477531', 'SRR8477441', 'SRR8477607', 'SRR8477667', 'SRR8477622',
       'SRR8477704', 'SRR8477378', 'SRR8477471', 'SRR8477489', 'SRR8477665',
       'SRR8477705', 'SRR8477605', 'predictions'],
      dtype='object')

# 4 

## 10 genes

In [333]:
top10 = top5000.T.head(10).T
top10

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420
...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622


### stressed vs unstressed

In [334]:
# add target column
top10['stressed'] = stressed
top10

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,stressed
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,unstressed
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,unstressed
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,unstressed
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,unstressed
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,unstressed
...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,stressed
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,stressed
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,stressed
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,stressed


In [335]:
# Split data
X = top10.drop(['stressed'], axis=1)
y = top10['stressed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [336]:
# train the model
logistic_reg.fit(X_train, y_train)

In [337]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.532258064516129
              precision    recall  f1-score   support

    stressed       0.53      0.61      0.57        31
  unstressed       0.54      0.45      0.49        31

    accuracy                           0.53        62
   macro avg       0.53      0.53      0.53        62
weighted avg       0.53      0.53      0.53        62

[[19 12]
 [17 14]]


### **clusters**

In [338]:
classification = pd.read_csv('Gabe_classification_10.tsv', sep='\t', index_col=0)
classification

Unnamed: 0,x
SRR8477377,1
SRR8477378,1
SRR8477379,1
SRR8477380,2
SRR8477381,2
...,...
SRR8477706,8
SRR8477707,8
SRR8477708,8
SRR8477709,8


In [339]:
top10 = top10.drop(['stressed'], axis=1)
top10['classification'] = classification
top10

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,classification
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,1
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,1
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,1
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,2
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,2
...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,8
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,8
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,8
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,8


In [340]:
# split data into new training and test sets (80/20)
X = top10.drop(['classification'], axis=1)
y = top10['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [341]:
# train the model
logistic_reg.fit(X_train, y_train)

In [342]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8548387096774194
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         4
           2       0.83      1.00      0.90        19
           3       0.00      0.00      0.00         4
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00         4
           6       1.00      1.00      1.00         8
           7       1.00      1.00      1.00         3
           8       0.72      1.00      0.84        13
           9       0.00      0.00      0.00         5

    accuracy                           0.85        62
   macro avg       0.73      0.78      0.75        62
weighted avg       0.74      0.85      0.79        62

[[ 4  0  0  0  0  0  0  0  0]
 [ 0 19  0  0  0  0  0  0  0]
 [ 0  4  0  0  0  0  0  0  0]
 [ 0  0  0  2  0  0  0  0  0]
 [ 0  0  0  0  4  0  0  0  0]
 [ 0  0  0  0  0  8  0  0  0]
 [ 0  0  0  0  0  0  3  0  0]
 [ 0  0  0  0  0  0  0 13  0]
 [ 0  0  0  0  0  0  0  5 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [343]:
# get area under ROC curve
roc_auc_score(y_test, logistic_reg.predict_proba(X_test), multi_class='ovo')

np.float64(0.9887286324786325)

## 100 genes

In [344]:
top100 = top5000.T.head(100).T
top100

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Rnf10,Dnm1,Tsc22d1,Tpt1,Sec61g,Gda,Gng11,Hpcal4,Peg3,Fam220a
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,42.969454,81.288350,62.301919,51.874324,11.000776,39.724473,10.218912,71.182024,45.834816,8.627432
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,42.845829,78.605492,62.301919,49.354564,11.984659,39.106948,9.351819,69.466445,46.746253,6.109945
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,51.322541,83.171185,59.845291,50.997257,11.176028,41.085445,9.871250,69.466445,47.276017,21.911494
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,25.010610,71.439601,38.609632,34.311561,4.579505,27.577196,2.976132,75.050092,59.845291,1.105832
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,23.913177,69.466445,39.162645,32.478096,4.118055,28.820099,2.847319,75.657566,59.295312,3.177405
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,99.933584,2.852649,116.163590,108.374647,76.617575,97.362222,76.954759,0.320094,3.112539,75.965965
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,96.624008,6.843878,116.163590,109.363737,74.235967,97.362222,77.677329,2.119599,2.073199,76.285694
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,96.624008,0.320536,117.359420,108.374647,75.350819,99.066884,76.617575,0.320536,3.147481,73.596234
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,99.066884,3.062474,117.359420,109.363737,76.954759,97.362222,77.316044,2.754517,5.536569,75.657566


### stressed vs unstressed

In [345]:
# add target column
top100['stressed'] = stressed
top100

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Dnm1,Tsc22d1,Tpt1,Sec61g,Gda,Gng11,Hpcal4,Peg3,Fam220a,stressed
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,81.288350,62.301919,51.874324,11.000776,39.724473,10.218912,71.182024,45.834816,8.627432,unstressed
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,78.605492,62.301919,49.354564,11.984659,39.106948,9.351819,69.466445,46.746253,6.109945,unstressed
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,83.171185,59.845291,50.997257,11.176028,41.085445,9.871250,69.466445,47.276017,21.911494,unstressed
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,71.439601,38.609632,34.311561,4.579505,27.577196,2.976132,75.050092,59.845291,1.105832,unstressed
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,69.466445,39.162645,32.478096,4.118055,28.820099,2.847319,75.657566,59.295312,3.177405,unstressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,2.852649,116.163590,108.374647,76.617575,97.362222,76.954759,0.320094,3.112539,75.965965,stressed
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,6.843878,116.163590,109.363737,74.235967,97.362222,77.677329,2.119599,2.073199,76.285694,stressed
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,0.320536,117.359420,108.374647,75.350819,99.066884,76.617575,0.320536,3.147481,73.596234,stressed
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.062474,117.359420,109.363737,76.954759,97.362222,77.316044,2.754517,5.536569,75.657566,stressed


In [346]:
# Split data
X = top100.drop(['stressed'], axis=1)
y = top100['stressed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [347]:
# train the model
logistic_reg.fit(X_train, y_train)

In [348]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.6774193548387096
              precision    recall  f1-score   support

    stressed       0.70      0.61      0.66        31
  unstressed       0.66      0.74      0.70        31

    accuracy                           0.68        62
   macro avg       0.68      0.68      0.68        62
weighted avg       0.68      0.68      0.68        62

[[19 12]
 [ 8 23]]


### clusters

In [349]:
classification = pd.read_csv('Gabe_classification_100.tsv', sep='\t', index_col=0)
classification

Unnamed: 0,x
SRR8477377,1
SRR8477378,1
SRR8477379,1
SRR8477380,2
SRR8477381,2
...,...
SRR8477706,14
SRR8477707,14
SRR8477708,14
SRR8477709,14


In [350]:
top100 = top100.drop(['stressed'], axis=1)
top100['classification'] = classification
top100

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Dnm1,Tsc22d1,Tpt1,Sec61g,Gda,Gng11,Hpcal4,Peg3,Fam220a,classification
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,81.288350,62.301919,51.874324,11.000776,39.724473,10.218912,71.182024,45.834816,8.627432,1
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,78.605492,62.301919,49.354564,11.984659,39.106948,9.351819,69.466445,46.746253,6.109945,1
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,83.171185,59.845291,50.997257,11.176028,41.085445,9.871250,69.466445,47.276017,21.911494,1
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,71.439601,38.609632,34.311561,4.579505,27.577196,2.976132,75.050092,59.845291,1.105832,2
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,69.466445,39.162645,32.478096,4.118055,28.820099,2.847319,75.657566,59.295312,3.177405,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,2.852649,116.163590,108.374647,76.617575,97.362222,76.954759,0.320094,3.112539,75.965965,14
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,6.843878,116.163590,109.363737,74.235967,97.362222,77.677329,2.119599,2.073199,76.285694,14
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,0.320536,117.359420,108.374647,75.350819,99.066884,76.617575,0.320536,3.147481,73.596234,14
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.062474,117.359420,109.363737,76.954759,97.362222,77.316044,2.754517,5.536569,75.657566,14


In [351]:
# split data into new training and test sets (80/20)
X = top100.drop(['classification'], axis=1)
y = top100['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [352]:
# train the model
logistic_reg.fit(X_train, y_train)

In [353]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         7
           4       1.00      1.00      1.00         3
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         7
           8       1.00      1.00      1.00         4
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         4
          12       1.00      1.00      1.00         6
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00        15
          15       1.00      1.00      1.00         2

    accuracy                           1.00        62
   macro avg       1.00      1.00      1.00        62
weighted avg

In [354]:
# get area under ROC curve
roc_auc_score(y_test, logistic_reg.predict_proba(X_test), multi_class='ovo')

np.float64(1.0)

## 1000 genes

In [355]:
top1000 = top5000.T.head(1000).T
top1000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Sptb,Gprc5b,Slc7a14,Ahnak,Reep5,Ptms,Tmem178b,Esyt2,Abtb1,Nlgn3
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,29.308866,16.535868,19.349914,20.476299,30.695725,28.952865,25.759114,11.293250,6.199362,25.887568
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,28.745315,17.038598,20.097787,19.639757,31.363386,30.233404,25.476459,10.501973,5.358637,25.211411
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,28.243608,20.436949,21.224430,19.679280,30.435025,30.033552,24.684943,11.467333,5.053297,25.059937
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,22.976559,25.947611,20.986692,18.545562,33.637555,33.936474,27.244163,9.967703,3.784547,28.794783
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,23.660169,29.179265,21.435164,18.419868,31.111793,33.176238,26.206336,10.225526,4.426755,25.314660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,49.156344,6.337338,0.320094,31.654359,15.863967,9.074909,3.402543,31.718557,25.692807,4.042412
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,48.338848,6.247050,3.440847,30.465731,8.660472,8.583338,0.319713,29.468991,33.602075,4.041411
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,51.874324,6.333605,2.342948,29.931982,9.112248,9.583548,2.747722,34.532102,22.357277,2.561566
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,49.878407,2.655833,2.341839,27.888471,14.650378,7.403803,1.950331,37.900482,24.303736,3.623752


### stressed vs unstressed

In [356]:
# add target column
top1000['stressed'] = stressed
top1000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Gprc5b,Slc7a14,Ahnak,Reep5,Ptms,Tmem178b,Esyt2,Abtb1,Nlgn3,stressed
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,16.535868,19.349914,20.476299,30.695725,28.952865,25.759114,11.293250,6.199362,25.887568,unstressed
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,17.038598,20.097787,19.639757,31.363386,30.233404,25.476459,10.501973,5.358637,25.211411,unstressed
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,20.436949,21.224430,19.679280,30.435025,30.033552,24.684943,11.467333,5.053297,25.059937,unstressed
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,25.947611,20.986692,18.545562,33.637555,33.936474,27.244163,9.967703,3.784547,28.794783,unstressed
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,29.179265,21.435164,18.419868,31.111793,33.176238,26.206336,10.225526,4.426755,25.314660,unstressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,6.337338,0.320094,31.654359,15.863967,9.074909,3.402543,31.718557,25.692807,4.042412,stressed
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,6.247050,3.440847,30.465731,8.660472,8.583338,0.319713,29.468991,33.602075,4.041411,stressed
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,6.333605,2.342948,29.931982,9.112248,9.583548,2.747722,34.532102,22.357277,2.561566,stressed
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,2.655833,2.341839,27.888471,14.650378,7.403803,1.950331,37.900482,24.303736,3.623752,stressed


In [357]:
# Split data
X = top1000.drop(['stressed'], axis=1)
y = top1000['stressed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [358]:
# train the model
logistic_reg.fit(X_train, y_train)

In [359]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9354838709677419
              precision    recall  f1-score   support

    stressed       0.94      0.94      0.94        31
  unstressed       0.94      0.94      0.94        31

    accuracy                           0.94        62
   macro avg       0.94      0.94      0.94        62
weighted avg       0.94      0.94      0.94        62

[[29  2]
 [ 2 29]]


### clusters

In [360]:
classification = pd.read_csv('Gabe_classification_1000.tsv', sep='\t', index_col=0)
classification

Unnamed: 0,x
SRR8477377,1
SRR8477378,1
SRR8477379,1
SRR8477380,2
SRR8477381,2
...,...
SRR8477706,9
SRR8477707,9
SRR8477708,9
SRR8477709,9


In [361]:
top1000 = top1000.drop(['stressed'], axis=1)
top1000['classification'] = classification
top1000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Gprc5b,Slc7a14,Ahnak,Reep5,Ptms,Tmem178b,Esyt2,Abtb1,Nlgn3,classification
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,16.535868,19.349914,20.476299,30.695725,28.952865,25.759114,11.293250,6.199362,25.887568,1
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,17.038598,20.097787,19.639757,31.363386,30.233404,25.476459,10.501973,5.358637,25.211411,1
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,20.436949,21.224430,19.679280,30.435025,30.033552,24.684943,11.467333,5.053297,25.059937,1
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,25.947611,20.986692,18.545562,33.637555,33.936474,27.244163,9.967703,3.784547,28.794783,2
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,29.179265,21.435164,18.419868,31.111793,33.176238,26.206336,10.225526,4.426755,25.314660,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,6.337338,0.320094,31.654359,15.863967,9.074909,3.402543,31.718557,25.692807,4.042412,9
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,6.247050,3.440847,30.465731,8.660472,8.583338,0.319713,29.468991,33.602075,4.041411,9
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,6.333605,2.342948,29.931982,9.112248,9.583548,2.747722,34.532102,22.357277,2.561566,9
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,2.655833,2.341839,27.888471,14.650378,7.403803,1.950331,37.900482,24.303736,3.623752,9


In [362]:
# split data into new training and test sets (80/20)
X = top1000.drop(['classification'], axis=1)
y = top1000['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [363]:
# train the model
logistic_reg.fit(X_train, y_train)

In [364]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00         5
           2       1.00      1.00      1.00         2
           3       1.00      1.00      1.00        10
           4       1.00      1.00      1.00         2
           5       1.00      1.00      1.00        11
           6       1.00      1.00      1.00         4
           7       1.00      1.00      1.00         7
           8       1.00      1.00      1.00         5
           9       1.00      1.00      1.00         5
          10       1.00      1.00      1.00         1
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         5
          13       1.00      1.00      1.00         2
          14       1.00      1.00      1.00         1
          15       1.00      1.00      1.00         1

    accuracy                           1.00        62
   macro avg       1.00      1.00      1.00        62
weighted avg

In [365]:
# get area under ROC curve
roc_auc_score(y_test, logistic_reg.predict_proba(X_test), multi_class='ovo')

np.float64(1.0)

## 10000 genes

In [366]:
# get top 10000 varying genes
top10000 = df_with_var.sort_values(by='var', ascending=False).head(10000).drop(columns=['var']).T
top10000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Zswim6,Il1rl2,Gm5535,Ccz1,Zfp777,Gm5148,Mx1,M1ap,Mfap1b,Capn10
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,8.092776,1.400053,1.881272,4.801759,5.697882,2.412197,1.630315,0.157460,5.046474,4.800482
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,6.714866,1.548647,1.668301,5.308594,4.964285,2.079634,1.619073,1.470456,5.706976,4.505957
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,6.512507,1.965138,0.759079,5.067185,4.966250,2.382964,1.153351,0.154648,5.678375,4.843598
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,8.131757,1.827371,3.382115,5.124350,6.413428,2.394464,1.494152,0.150461,5.558671,4.813633
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,8.261104,0.751769,3.216210,5.078073,5.229417,2.188396,1.785717,0.738470,5.678375,4.864634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,7.268586,3.698221,3.327915,7.096265,4.442708,5.008643,0.320094,1.999832,3.171141,0.320094
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,8.837201,3.089249,0.319713,6.052401,5.226524,4.916887,3.837118,3.254350,5.489293,10.601691
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,8.246039,2.715858,0.320536,6.263115,5.592340,4.462372,3.234535,2.625899,5.413907,5.713759
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,6.518537,3.791637,0.318293,7.587733,8.545427,5.854221,4.327893,3.023135,4.578414,5.787677


## stressed vs unstressed

In [367]:
# add target column
top10000['stressed'] = stressed
top10000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Il1rl2,Gm5535,Ccz1,Zfp777,Gm5148,Mx1,M1ap,Mfap1b,Capn10,stressed
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,1.400053,1.881272,4.801759,5.697882,2.412197,1.630315,0.157460,5.046474,4.800482,unstressed
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,1.548647,1.668301,5.308594,4.964285,2.079634,1.619073,1.470456,5.706976,4.505957,unstressed
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,1.965138,0.759079,5.067185,4.966250,2.382964,1.153351,0.154648,5.678375,4.843598,unstressed
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,1.827371,3.382115,5.124350,6.413428,2.394464,1.494152,0.150461,5.558671,4.813633,unstressed
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,0.751769,3.216210,5.078073,5.229417,2.188396,1.785717,0.738470,5.678375,4.864634,unstressed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,3.698221,3.327915,7.096265,4.442708,5.008643,0.320094,1.999832,3.171141,0.320094,stressed
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,3.089249,0.319713,6.052401,5.226524,4.916887,3.837118,3.254350,5.489293,10.601691,stressed
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,2.715858,0.320536,6.263115,5.592340,4.462372,3.234535,2.625899,5.413907,5.713759,stressed
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.791637,0.318293,7.587733,8.545427,5.854221,4.327893,3.023135,4.578414,5.787677,stressed


In [368]:
# Split data
X = top10000.drop(['stressed'], axis=1)
y = top10000['stressed']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [369]:
# train the model
logistic_reg.fit(X_train, y_train)

In [370]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.967741935483871
              precision    recall  f1-score   support

    stressed       0.97      0.97      0.97        31
  unstressed       0.97      0.97      0.97        31

    accuracy                           0.97        62
   macro avg       0.97      0.97      0.97        62
weighted avg       0.97      0.97      0.97        62

[[30  1]
 [ 1 30]]


### clusters

In [371]:
classification = pd.read_csv('Gabe_classification_10000.tsv', sep='\t', index_col=0)
classification

Unnamed: 0,x
SRR8477377,2
SRR8477378,2
SRR8477379,2
SRR8477380,2
SRR8477381,2
...,...
SRR8477706,10
SRR8477707,10
SRR8477708,10
SRR8477709,10


In [372]:
top10000 = top10000.drop(['stressed'], axis=1)
top10000['classification'] = classification
top10000

gene,Hbb-bs,Hba-a2,Hba-a1,Hbb-bt,Alas2,Ube2l6,Slc1a2,Tent5c,Bnip3l,Camk2a,...,Il1rl2,Gm5535,Ccz1,Zfp777,Gm5148,Mx1,M1ap,Mfap1b,Capn10,classification
SRR8477377,182.460673,173.839732,153.765477,140.532415,118.555250,67.828143,130.200581,60.224164,89.831858,123.934603,...,1.400053,1.881272,4.801759,5.697882,2.412197,1.630315,0.157460,5.046474,4.800482,2
SRR8477378,182.460673,173.839732,153.765477,140.532415,117.359420,68.073526,131.873654,59.295312,87.196588,123.934603,...,1.548647,1.668301,5.308594,4.964285,2.079634,1.619073,1.470456,5.706976,4.505957,2
SRR8477379,182.460673,173.839732,153.765477,140.532415,118.555250,68.309429,131.873654,59.079150,88.635569,123.934603,...,1.965138,0.759079,5.067185,4.966250,2.382964,1.153351,0.154648,5.678375,4.843598,2
SRR8477380,10.300817,8.949210,5.218350,2.688350,1.347125,1.361161,140.532415,2.957085,14.215903,117.359420,...,1.827371,3.382115,5.124350,6.413428,2.394464,1.494152,0.150461,5.558671,4.813633,2
SRR8477381,9.965343,8.456053,5.759153,1.684045,1.843833,2.259845,140.532415,2.864434,12.808283,117.359420,...,0.751769,3.216210,5.078073,5.229417,2.188396,1.785717,0.738470,5.678375,4.864634,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SRR8477706,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.822611,128.561118,137.765338,3.094361,...,3.698221,3.327915,7.096265,4.442708,5.008643,0.320094,1.999832,3.171141,0.320094,10
SRR8477707,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,3.048982,128.561118,137.765338,6.032050,...,3.089249,0.319713,6.052401,5.226524,4.916887,3.837118,3.254350,5.489293,10.601691,10
SRR8477708,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,0.320536,127.067799,137.765338,3.514561,...,2.715858,0.320536,6.263115,5.592340,4.462372,3.234535,2.625899,5.413907,5.713759,10
SRR8477709,182.460673,173.839732,165.218790,153.765477,140.532415,135.642548,10.809911,127.067799,137.765338,11.197622,...,3.791637,0.318293,7.587733,8.545427,5.854221,4.327893,3.023135,4.578414,5.787677,10


In [373]:
# split data into new training and test sets (80/20)
X = top10000.drop(['classification'], axis=1)
y = top10000['classification']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [374]:
# train the model
logistic_reg.fit(X_train, y_train)

In [375]:
# Evaluate the model
y_pred = logistic_reg.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.0
              precision    recall  f1-score   support

           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        14
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00        15
           5       1.00      1.00      1.00         1
           6       1.00      1.00      1.00         1
           7       1.00      1.00      1.00         1
           8       1.00      1.00      1.00         5
           9       1.00      1.00      1.00         2
          10       1.00      1.00      1.00         4
          11       1.00      1.00      1.00         1
          12       1.00      1.00      1.00         1
          13       1.00      1.00      1.00         1

    accuracy                           1.00        62
   macro avg       1.00      1.00      1.00        62
weighted avg       1.00      1.00      1.00        62

[[14  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0 

In [376]:
# get area under ROC curve
roc_auc_score(y_test, logistic_reg.predict_proba(X_test), multi_class='ovo')

np.float64(1.0)