# Regression #

In [145]:
import os
import numpy as np
import pandas as pd
data_gene = pd.read_csv('data_gene.csv', header=None)
data_meth = pd.read_csv('data_meth.csv', header=None)

In [146]:
avg_gene = pd.DataFrame.mean(data_gene, axis=1)

In [147]:
from sklearn.model_selection import train_test_split
meth_train, meth_test, gene_train, gene_test = train_test_split(data_meth, avg_gene, test_size=0.2, random_state=42)

In [148]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model = LinearRegression()
model.fit(meth_train, gene_train)
gene_pred = model.predict(meth_test)
mse = mean_squared_error(gene_pred, gene_test)
rmse = mse**0.5
rmse

1.2966712992717235

In [149]:
r2 = model.score(meth_train, gene_train)
r2

0.38938876579986925

In [150]:
print(np.mean(avg_gene))
print(np.std(avg_gene))

6.469304224898995
1.6750265618165283


In [151]:
from sklearn.decomposition import PCA

pca_model = PCA(n_components=6)
X2D = pca_model.fit_transform(data_meth)
print(pca_model.explained_variance_ratio_)
print(sum(pca_model.explained_variance_ratio_))

meth_train_pca, meth_test_pca, gene_train, gene_test = train_test_split(X2D, avg_gene, test_size=0.2, random_state=42)

[0.77093694 0.02824144 0.02111458 0.01884707 0.01124364 0.01053162]
0.860915281923988


In [152]:
model.fit(meth_train_pca, gene_train)
gene_pred_pca = model.predict(meth_test_pca)
mse = mean_squared_error(gene_pred_pca, gene_test)
rmse = mse**0.5
rmse

1.3225639300978704

In [153]:
r2 = model.score(meth_train_pca, gene_train)
r2

0.35843468715747134

# Classification #

# DNA Methylation #

In [154]:
data_gene_full = pd.DataFrame.transpose(pd.read_csv('data_gene_full.csv', header=None))
data_meth_full = pd.DataFrame.transpose(pd.read_csv('data_meth_full.csv', header=None))
data_protein = pd.DataFrame.transpose(pd.read_csv('data_protein.csv', header=None))
invasive_gene = np.ravel(pd.DataFrame.to_numpy(pd.DataFrame.transpose(pd.read_csv('invasive_gene.csv', header=None))))
invasive_meth = np.ravel(pd.DataFrame.to_numpy(pd.DataFrame.transpose(pd.read_csv('invasive_meth.csv', header=None))))
invasive_prot = np.ravel(pd.DataFrame.to_numpy(pd.DataFrame.transpose(pd.read_csv('invasive_prot.csv', header=None))))

In [155]:
pca_model = PCA(n_components=33)
X2D = pca_model.fit_transform(data_meth_full)
print(pca_model.explained_variance_ratio_)
print(sum(pca_model.explained_variance_ratio_))

meth_train, meth_test, inv_meth_train, inv_meth_test = train_test_split(data_meth_full, invasive_meth, test_size=0.2, random_state=42)

[0.23058041 0.08358834 0.07529258 0.04195038 0.03980991 0.03389717
 0.02861578 0.0266598  0.02412398 0.02272524 0.02160548 0.02031721
 0.01992677 0.01917752 0.01865001 0.01802961 0.01757204 0.01704353
 0.01671717 0.01615564 0.0151474  0.01510634 0.01413732 0.01375599
 0.0132123  0.01283581 0.01265166 0.01207266 0.01175983 0.01111457
 0.01058847 0.01050193 0.00981833]
0.9551411845558064


Stochastic Gradient Descent Classification

In [156]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import precision_score, recall_score, f1_score

sgd_clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
sgd_clf.fit(meth_train, inv_meth_train)

#cross_val = cross_val_score(sgd_clf, meth_train, inv_meth_train, cv=3, scoring="accuracy")
inv_train_pred = cross_val_predict(sgd_clf, meth_train, inv_meth_train, cv=3)
print(confusion_matrix(inv_meth_train, inv_train_pred))
print(precision_score(inv_meth_train, inv_train_pred))
print(recall_score(inv_meth_train, inv_train_pred))
print(f1_score(inv_meth_train, inv_train_pred))

[[23  0]
 [ 4  5]]
1.0
0.5555555555555556
0.7142857142857143


In [157]:
inv_pred = sgd_clf.predict(meth_test)
print(confusion_matrix(inv_meth_test, inv_pred))
print(precision_score(inv_meth_test, inv_pred))
print(recall_score(inv_meth_test, inv_pred))
print(f1_score(inv_meth_test, inv_pred))


[[8 0]
 [0 1]]
1.0
1.0
1.0


Logistic regression

In [158]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(solver="lbfgs", random_state=42)
log_reg.fit(meth_train, inv_meth_train)

inv_train_pred = cross_val_predict(log_reg, meth_train, inv_meth_train, cv=3)
print(confusion_matrix(inv_meth_train, inv_train_pred))
print(precision_score(inv_meth_train, inv_train_pred))
print(recall_score(inv_meth_train, inv_train_pred))
print(f1_score(inv_meth_train, inv_train_pred))

[[23  0]
 [ 0  9]]
1.0
1.0
1.0


In [159]:
inv_pred = log_reg.predict(meth_test)
print(confusion_matrix(inv_meth_test, inv_pred))
print(precision_score(inv_meth_test, inv_pred))
print(recall_score(inv_meth_test, inv_pred))
print(f1_score(inv_meth_test, inv_pred))

[[8 0]
 [0 1]]
1.0
1.0
1.0


Decision Tree Classification

In [160]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier(max_depth=10)
dec_tree.fit(meth_train, inv_meth_train)

inv_train_pred = cross_val_predict(dec_tree, meth_train, inv_meth_train, cv=3)
print(confusion_matrix(inv_meth_train, inv_train_pred))
print(precision_score(inv_meth_train, inv_train_pred))
print(recall_score(inv_meth_train, inv_train_pred))
print(f1_score(inv_meth_train, inv_train_pred))

[[19  4]
 [ 3  6]]
0.6
0.6666666666666666
0.631578947368421


In [161]:
inv_pred = dec_tree.predict(meth_test)
print(confusion_matrix(inv_meth_test, inv_pred))
print(precision_score(inv_meth_test, inv_pred))
print(recall_score(inv_meth_test, inv_pred))
print(f1_score(inv_meth_test, inv_pred))

[[6 2]
 [0 1]]
0.3333333333333333
1.0
0.5


Random Forest Classification

In [162]:
from sklearn.ensemble import RandomForestClassifier

rdm_for = RandomForestClassifier(max_depth=20)
rdm_for.fit(meth_train, inv_meth_train)

inv_train_pred = cross_val_predict(rdm_for, meth_train, inv_meth_train, cv=3)
print(confusion_matrix(inv_meth_train, inv_train_pred))
print(precision_score(inv_meth_train, inv_train_pred))
print(recall_score(inv_meth_train, inv_train_pred))
print(f1_score(inv_meth_train, inv_train_pred))

[[23  0]
 [ 4  5]]
1.0
0.5555555555555556
0.7142857142857143


In [163]:
inv_pred = rdm_for.predict(meth_test)
print(confusion_matrix(inv_meth_test, inv_pred))
print(precision_score(inv_meth_test, inv_pred))
print(recall_score(inv_meth_test, inv_pred))
print(f1_score(inv_meth_test, inv_pred))

[[8 0]
 [0 1]]
1.0
1.0
1.0


Non-linear SVM

In [164]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

poly_kernel_svm_clf = Pipeline([
        ("scaler", StandardScaler()),
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=10, C=5))
    ])
poly_kernel_svm_clf.fit(meth_train, inv_meth_train)

inv_train_pred = cross_val_predict(poly_kernel_svm_clf, meth_train, inv_meth_train, cv=3)
print(confusion_matrix(inv_meth_train, inv_train_pred))
print(precision_score(inv_meth_train, inv_train_pred))
print(recall_score(inv_meth_train, inv_train_pred))
print(f1_score(inv_meth_train, inv_train_pred))

[[23  0]
 [ 2  7]]
1.0
0.7777777777777778
0.8750000000000001


In [165]:
inv_pred = poly_kernel_svm_clf.predict(meth_test)
print(confusion_matrix(inv_meth_test, inv_pred))
print(precision_score(inv_meth_test, inv_pred))
print(recall_score(inv_meth_test, inv_pred))
print(f1_score(inv_meth_test, inv_pred))

[[8 0]
 [0 1]]
1.0
1.0
1.0


# Gene expression #

In [166]:
pca_model = PCA(n_components=35)
X2D = pca_model.fit_transform(data_gene_full)
print(pca_model.explained_variance_ratio_)
print(sum(pca_model.explained_variance_ratio_))

gene_train, gene_test, inv_gene_train, inv_gene_test = train_test_split(data_gene_full, invasive_gene, test_size=0.2, random_state=42)

[0.20187772 0.08033251 0.05290564 0.04444638 0.04004594 0.03769164
 0.02910541 0.02701745 0.02565244 0.02518004 0.02343103 0.02313629
 0.02070333 0.02053692 0.01987141 0.01894142 0.01848458 0.01795077
 0.01752556 0.01743424 0.01672067 0.01591204 0.01488009 0.01448517
 0.01405302 0.01340756 0.01323407 0.01270981 0.01227557 0.0122496
 0.01200381 0.01131424 0.01095561 0.01078171 0.01041441]
0.9576681196204324


Stochastic Gradient Descent Classification

In [167]:
sgd_clf.fit(gene_train, inv_gene_train)

inv_train_pred = cross_val_predict(sgd_clf, gene_train, inv_gene_train, cv=3)
print(confusion_matrix(inv_gene_train, inv_train_pred))
print(precision_score(inv_gene_train, inv_train_pred))
print(recall_score(inv_gene_train, inv_train_pred))
print(f1_score(inv_gene_train, inv_train_pred))

inv_pred = sgd_clf.predict(gene_test)
print(confusion_matrix(inv_gene_test, inv_pred))
print(precision_score(inv_gene_test, inv_pred))
print(recall_score(inv_gene_test, inv_pred))
print(f1_score(inv_gene_test, inv_pred))

[[24  0]
 [ 1  8]]
1.0
0.8888888888888888
0.9411764705882353
[[7 0]
 [0 2]]
1.0
1.0
1.0


Logistic Regression

In [168]:
log_reg.fit(gene_train, inv_gene_train)

inv_train_pred = cross_val_predict(log_reg, gene_train, inv_gene_train, cv=3)
print(confusion_matrix(inv_gene_train, inv_train_pred))
print(precision_score(inv_gene_train, inv_train_pred))
print(recall_score(inv_gene_train, inv_train_pred))
print(f1_score(inv_gene_train, inv_train_pred))

inv_pred = log_reg.predict(gene_test)
print(inv_pred)
print(confusion_matrix(inv_gene_test, inv_pred))
print(precision_score(inv_gene_test, inv_pred))
print(recall_score(inv_gene_test, inv_pred))
print(f1_score(inv_gene_test, inv_pred))

[[23  1]
 [ 0  9]]
0.9
1.0
0.9473684210526316
[0 0 0 0 0 0 1 0 0]
[[7 0]
 [1 1]]
1.0
0.5
0.6666666666666666


Decision Tree Classification

In [169]:
dec_tree.fit(gene_train, inv_gene_train)

inv_train_pred = cross_val_predict(dec_tree, gene_train, inv_gene_train, cv=3)
print(confusion_matrix(inv_gene_train, inv_train_pred))
print(precision_score(inv_gene_train, inv_train_pred))
print(recall_score(inv_gene_train, inv_train_pred))
print(f1_score(inv_gene_train, inv_train_pred))

inv_pred = dec_tree.predict(gene_test)
print(inv_pred)
print(confusion_matrix(inv_gene_test, inv_pred))
print(precision_score(inv_gene_test, inv_pred))
print(recall_score(inv_gene_test, inv_pred))
print(f1_score(inv_gene_test, inv_pred))

[[20  4]
 [ 2  7]]
0.6363636363636364
0.7777777777777778
0.7000000000000001
[0 0 0 0 0 0 1 0 1]
[[7 0]
 [0 2]]
1.0
1.0
1.0


Random Forest Classification

In [170]:
rdm_for.fit(gene_train, inv_gene_train)

inv_train_pred = cross_val_predict(rdm_for, gene_train, inv_gene_train, cv=3)
print(confusion_matrix(inv_gene_train, inv_train_pred))
print(precision_score(inv_gene_train, inv_train_pred))
print(recall_score(inv_gene_train, inv_train_pred))
print(f1_score(inv_gene_train, inv_train_pred))

inv_pred = rdm_for.predict(gene_test)
print(confusion_matrix(inv_gene_test, inv_pred))
print(precision_score(inv_gene_test, inv_pred))
print(recall_score(inv_gene_test, inv_pred))
print(f1_score(inv_gene_test, inv_pred))

[[24  0]
 [ 1  8]]
1.0
0.8888888888888888
0.9411764705882353
[[7 0]
 [2 0]]
0.0
0.0
0.0


  _warn_prf(average, modifier, msg_start, len(result))


Non-linear SVM

In [171]:
poly_kernel_svm_clf.fit(gene_train, inv_gene_train)

inv_train_pred = cross_val_predict(poly_kernel_svm_clf, gene_train, inv_gene_train, cv=3)
print(confusion_matrix(inv_gene_train, inv_train_pred))
print(precision_score(inv_gene_train, inv_train_pred))
print(recall_score(inv_gene_train, inv_train_pred))
print(f1_score(inv_gene_train, inv_train_pred))

inv_pred = poly_kernel_svm_clf.predict(gene_test)
print(inv_pred)
print(confusion_matrix(inv_gene_test, inv_pred))
print(precision_score(inv_gene_test, inv_pred))
print(recall_score(inv_gene_test, inv_pred))
print(f1_score(inv_gene_test, inv_pred))

[[23  1]
 [ 0  9]]
0.9
1.0
0.9473684210526316
[0 0 0 0 0 0 1 0 0]
[[7 0]
 [1 1]]
1.0
0.5
0.6666666666666666


# Protein Expression #

In [172]:
pca_model = PCA(n_components=23)
X2D = pca_model.fit_transform(data_protein)
print(pca_model.explained_variance_ratio_)
print(sum(pca_model.explained_variance_ratio_))

prot_train, prot_test, inv_prot_train, inv_prot_test = train_test_split(data_protein, invasive_prot, test_size=0.2, random_state=42)

[0.32483031 0.11520812 0.07751424 0.05986119 0.04680208 0.04219798
 0.03380475 0.03173915 0.02968572 0.0264308  0.02369654 0.01964152
 0.01811987 0.01651275 0.0139744  0.01273616 0.01182936 0.0103401
 0.00960399 0.00876052 0.00818147 0.00687069 0.00640003]
0.9547417579575342


Stochastic Gradient Descent Classification

In [173]:
sgd_clf.fit(prot_train, inv_prot_train)

inv_train_pred = cross_val_predict(sgd_clf, prot_train, inv_prot_train, cv=3)
print(confusion_matrix(inv_prot_train, inv_train_pred))
print(precision_score(inv_prot_train, inv_train_pred))
print(recall_score(inv_prot_train, inv_train_pred))
print(f1_score(inv_prot_train, inv_train_pred))

inv_pred = sgd_clf.predict(prot_test)
print(confusion_matrix(inv_prot_test, inv_pred))
print(precision_score(inv_prot_test, inv_pred))
print(recall_score(inv_prot_test, inv_pred))
print(f1_score(inv_prot_test, inv_pred))

[[15  5]
 [ 1  8]]
0.6153846153846154
0.8888888888888888
0.7272727272727274
[[6 0]
 [0 2]]
1.0
1.0
1.0


Logistic Regression

In [174]:
log_reg.fit(prot_train, inv_prot_train)

inv_train_pred = cross_val_predict(log_reg, prot_train, inv_prot_train, cv=3)
print(confusion_matrix(inv_prot_train, inv_train_pred))
print(precision_score(inv_prot_train, inv_train_pred))
print(recall_score(inv_prot_train, inv_train_pred))
print(f1_score(inv_prot_train, inv_train_pred))

inv_pred = log_reg.predict(prot_test)
print(confusion_matrix(inv_prot_test, inv_pred))
print(precision_score(inv_prot_test, inv_pred))
print(recall_score(inv_prot_test, inv_pred))
print(f1_score(inv_prot_test, inv_pred))

[[20  0]
 [ 1  8]]
1.0
0.8888888888888888
0.9411764705882353
[[6 0]
 [0 2]]
1.0
1.0
1.0


Decision Tree Classification

In [175]:
dec_tree.fit(prot_train, inv_prot_train)

inv_train_pred = cross_val_predict(dec_tree, prot_train, inv_prot_train, cv=3)
print(confusion_matrix(inv_prot_train, inv_train_pred))
print(precision_score(inv_prot_train, inv_train_pred))
print(recall_score(inv_prot_train, inv_train_pred))
print(f1_score(inv_prot_train, inv_train_pred))

inv_pred = dec_tree.predict(prot_test)
print(confusion_matrix(inv_prot_test, inv_pred))
print(precision_score(inv_prot_test, inv_pred))
print(recall_score(inv_prot_test, inv_pred))
print(f1_score(inv_prot_test, inv_pred))

[[18  2]
 [ 4  5]]
0.7142857142857143
0.5555555555555556
0.6250000000000001
[[6 0]
 [0 2]]
1.0
1.0
1.0


Random Forest Classification

In [176]:
rdm_for.fit(prot_train, inv_prot_train)

inv_train_pred = cross_val_predict(rdm_for, prot_train, inv_prot_train, cv=3)
print(confusion_matrix(inv_prot_train, inv_train_pred))
print(precision_score(inv_prot_train, inv_train_pred))
print(recall_score(inv_prot_train, inv_train_pred))
print(f1_score(inv_prot_train, inv_train_pred))

inv_pred = rdm_for.predict(prot_test)
print(confusion_matrix(inv_prot_test, inv_pred))
print(precision_score(inv_prot_test, inv_pred))
print(recall_score(inv_prot_test, inv_pred))
print(f1_score(inv_prot_test, inv_pred))

[[19  1]
 [ 3  6]]
0.8571428571428571
0.6666666666666666
0.75
[[6 0]
 [0 2]]
1.0
1.0
1.0


Non-linear SVM

In [177]:
poly_kernel_svm_clf.fit(prot_train, inv_prot_train)

inv_train_pred = cross_val_predict(poly_kernel_svm_clf, prot_train, inv_prot_train, cv=3)
print(confusion_matrix(inv_prot_train, inv_train_pred))
print(precision_score(inv_prot_train, inv_train_pred))
print(recall_score(inv_prot_train, inv_train_pred))
print(f1_score(inv_prot_train, inv_train_pred))

inv_pred = poly_kernel_svm_clf.predict(prot_test)
print(confusion_matrix(inv_prot_test, inv_pred))
print(precision_score(inv_prot_test, inv_pred))
print(recall_score(inv_prot_test, inv_pred))
print(f1_score(inv_prot_test, inv_pred))

[[19  1]
 [ 1  8]]
0.8888888888888888
0.8888888888888888
0.8888888888888888
[[6 0]
 [0 2]]
1.0
1.0
1.0
