In [1]:
import numpy as np
import pandas as pd
from utils import read_genes_from_file

all_genes = read_genes_from_file("./GeneLists/all_genes.txt")

In [7]:
# find how many genes can be mapped to symbol to create the PPI dataset using R
mapping_df = pd.read_excel("./GeneLists/mapping.xlsx", engine="openpyxl")
eids = mapping_df.Entrenz.to_numpy()
syms = []
for gene in all_genes:
    inds = np.where(eids == gene)[0]
    if(len(inds) > 0):
        syms.append(mapping_df.iloc[inds[0],0])
print(len(syms))
print(syms[0:10])

with open("mapped_symbols.txt","w") as f:
    for sym in syms:
        f.write(sym + "\n")
f.close()

3749
['VEGFA', 'TGFB1', 'TNF', 'AKT1', 'EGF', 'FGF2', 'FN1', 'MMP9', 'MMP2', 'IL6']


In [2]:
data = pd.read_csv("./Datasets/PPI.tsv", sep = "\t")
print(data.head())
mapping_df = pd.read_excel("./GeneLists/mapping.xlsx", engine="openpyxl")
print(mapping_df.head())
syms = mapping_df.Symbol.to_numpy()

for index, row in data.iterrows():
    gene1 = row['from']
    gene2 = row['to']
    data.iloc[index,0] = mapping_df.iloc[np.where(syms == gene1)[0][0],1]
    data.iloc[index,1] = mapping_df.iloc[np.where(syms == gene2)[0][0],1]

print(data.head())


      from       to  combined_score  label
0     CFTR     DVL2             951   True
1    XYLT2  B4GALT7             977   True
2    XYLT2      DCN             924   True
3  B4GALT7      DCN             940   True
4     CFTR    VAMP3             911   True
    Symbol      Entrenz
0     A1BG       Gene_1
1     A1CF   Gene_29974
2      A2M       Gene_2
3    A2ML1  Gene_144568
4  A3GALT2  Gene_127550
         from          to  combined_score  label
0   Gene_1080   Gene_1856             951   True
1  Gene_64132  Gene_11285             977   True
2  Gene_64132   Gene_1634             924   True
3  Gene_11285   Gene_1634             940   True
4   Gene_1080   Gene_9341             911   True


In [3]:
data.to_csv("PPI.tsv", index = False, sep ="\t")

In [62]:
from gensim.models  import KeyedVectors, Word2Vec
from sklearn.model_selection import train_test_split
from utils import load_embedding
from time import time

t = time()
w2v_cbow = load_embedding("./WordVectors/Computed/word2vec_cbow.bin", binary=True)
print("Time to load cbow embeddings in mins: ", round(((time() - t)/60.0),4))

t = time()
w2v_sg = load_embedding("./WordVectors/Computed/word2vec_skipgram.bin", binary=True)
print("Time to load skipgram embeddings in mins: ", round(((time() - t)/60.0),4))

data = pd.read_csv("./Datasets/PPI.tsv", sep = "\t")
X = data.iloc[:,0:2]
y = data.label.to_numpy().tolist()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1234, test_size=0.2)

print(data[data.label == True].shape)
print(data[data.label == False].shape)


embedding loaded from ./WordVectors/Computed/word2vec_cbow.bin
Time to load cbow embeddings in mins:  0.0036
embedding loaded from ./WordVectors/Computed/word2vec_skipgram.bin
Time to load skipgram embeddings in mins:  0.0032
(57126, 4)
(55200, 4)


In [14]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report
from utils import cosine

model = w2v_cbow

thr = 0.5

predictions = []
for index, row in X_test.iterrows():
    sim = cosine(model[row['from']], model[row['to']])
    if sim > thr:
        predictions.append(True)
    else:
        predictions.append(False)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))






(0.7433494848738598, 0.6750857096005907, 0.6470819960558369, None)
              precision    recall  f1-score   support

        True       0.61      0.94      0.74     11027
       False       0.88      0.41      0.56     11439

    accuracy                           0.67     22466
   macro avg       0.74      0.68      0.65     22466
weighted avg       0.75      0.67      0.65     22466



In [17]:
X_train_vecs = []
for index, row in X_train.iterrows():
    X_train_vecs.append(np.hstack([model[row['from']], model[row['to']]]))
X_train_vecs = np.vstack(X_train_vecs)

X_test_vecs = []
for index, row in X_test.iterrows():
    X_test_vecs.append(np.hstack([model[row['from']], model[row['to']]]))
X_test_vecs = np.vstack(X_test_vecs)

print("Train shape: ", X_train_vecs.shape)
print("Test shape: ", X_test_vecs.shape)

Train shape:  (89860, 200)
Test shape:  (22466, 200)


In [28]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, max_iter=1000, solver = "saga", penalty="l1", C = 5).fit(X_train_vecs, y_train)
predictions = clf.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))


(0.695112362059394, 0.69504430204627, 0.6947340325483564, None)
              precision    recall  f1-score   support

        True       0.68      0.71      0.70     11027
       False       0.71      0.68      0.69     11439

    accuracy                           0.69     22466
   macro avg       0.70      0.70      0.69     22466
weighted avg       0.70      0.69      0.69     22466



In [30]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train_vecs, y_train)
predictions = gnb.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))

(0.6423230413410772, 0.6385176145339972, 0.6350128003189133, None)
              precision    recall  f1-score   support

        True       0.61      0.72      0.66     11027
       False       0.67      0.55      0.61     11439

    accuracy                           0.64     22466
   macro avg       0.64      0.64      0.64     22466
weighted avg       0.64      0.64      0.63     22466



In [31]:
from sklearn.svm import SVC
clf = SVC(C = 5, kernel='rbf').fit(X_train_vecs, y_train)
predictions = clf.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))

(0.8979244712599455, 0.8973162837962685, 0.8974815777312427, None)
              precision    recall  f1-score   support

        True       0.91      0.88      0.89     11027
       False       0.89      0.91      0.90     11439

    accuracy                           0.90     22466
   macro avg       0.90      0.90      0.90     22466
weighted avg       0.90      0.90      0.90     22466



In [55]:
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, n_estimators=500, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=1)

xgb.fit(X_train_vecs, y_train)
predictions = xgb.predict(X_test_vecs)
predictions = [True if value > 0.5 else False for value in predictions]

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


(0.8958392282622161, 0.8957667370476015, 0.8957985403402515, None)
              precision    recall  f1-score   support

        True       0.90      0.89      0.89     11027
       False       0.90      0.90      0.90     11439

    accuracy                           0.90     22466
   macro avg       0.90      0.90      0.90     22466
weighted avg       0.90      0.90      0.90     22466



In [56]:
model = w2v_sg

thr = 0.5

predictions = []
for index, row in X_test.iterrows():
    sim = cosine(model[row['from']], model[row['to']])
    if sim > thr:
        predictions.append(True)
    else:
        predictions.append(False)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))



(0.7454506530087759, 0.6812419187125375, 0.6555267053725609, None)
              precision    recall  f1-score   support

        True       0.61      0.94      0.74     11027
       False       0.88      0.42      0.57     11439

    accuracy                           0.68     22466
   macro avg       0.75      0.68      0.66     22466
weighted avg       0.75      0.68      0.65     22466



In [57]:
X_train_vecs = []
for index, row in X_train.iterrows():
    X_train_vecs.append(np.hstack([model[row['from']], model[row['to']]]))
X_train_vecs = np.vstack(X_train_vecs)

X_test_vecs = []
for index, row in X_test.iterrows():
    X_test_vecs.append(np.hstack([model[row['from']], model[row['to']]]))
X_test_vecs = np.vstack(X_test_vecs)

print("Train shape: ", X_train_vecs.shape)
print("Test shape: ", X_test_vecs.shape)

Train shape:  (89860, 200)
Test shape:  (22466, 200)


In [58]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(random_state=0, max_iter=1000, solver = "saga", penalty="l1", C = 5).fit(X_train_vecs, y_train)
predictions = clf.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))


(0.6975858608201462, 0.6975504767787668, 0.6972744434670354, None)
              precision    recall  f1-score   support

        True       0.68      0.71      0.70     11027
       False       0.71      0.68      0.70     11439

    accuracy                           0.70     22466
   macro avg       0.70      0.70      0.70     22466
weighted avg       0.70      0.70      0.70     22466



In [59]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB().fit(X_train_vecs, y_train)
predictions = gnb.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))

(0.6453693901691318, 0.6422387378037899, 0.6393221964982218, None)
              precision    recall  f1-score   support

        True       0.61      0.72      0.66     11027
       False       0.68      0.57      0.62     11439

    accuracy                           0.64     22466
   macro avg       0.65      0.64      0.64     22466
weighted avg       0.65      0.64      0.64     22466



In [60]:
from xgboost import XGBClassifier
xgb = XGBClassifier(learning_rate=0.1, max_delta_step=0, max_depth=10,
       min_child_weight=1, n_estimators=500, nthread=-1,
       objective='binary:logistic', seed=0, silent=True, subsample=1)

xgb.fit(X_train_vecs, y_train)
predictions = xgb.predict(X_test_vecs)
predictions = [True if value > 0.5 else False for value in predictions]

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))


Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


(0.8975643408611877, 0.8974293545332502, 0.8974836781799639, None)
              precision    recall  f1-score   support

        True       0.90      0.89      0.90     11027
       False       0.90      0.90      0.90     11439

    accuracy                           0.90     22466
   macro avg       0.90      0.90      0.90     22466
weighted avg       0.90      0.90      0.90     22466



In [61]:
from sklearn.svm import SVC
clf = SVC(C = 5, kernel='rbf').fit(X_train_vecs, y_train)
predictions = clf.predict(X_test_vecs)

print(precision_recall_fscore_support(y_test, predictions, average='macro'))
target_names = ["True", "False"]
print(classification_report(y_test, predictions, target_names=target_names))

(0.8991822038683968, 0.8985597646092802, 0.8987279722405602, None)
              precision    recall  f1-score   support

        True       0.91      0.88      0.90     11027
       False       0.89      0.91      0.90     11439

    accuracy                           0.90     22466
   macro avg       0.90      0.90      0.90     22466
weighted avg       0.90      0.90      0.90     22466

