In [79]:
import warnings
warnings.filterwarnings("ignore")

#Importing the Data
import pandas as pd
import re
import csv
from sklearn.model_selection import train_test_split
data = pd.read_csv("proteomes.csv",header=0,index_col=0)
clinical = pd.read_csv("clinical.csv",header=0,index_col=0)
#Manipulation
clinical.columns = [c.replace(' ', '_') for c in clinical.columns]
data.columns = [c.replace(' ', '_') for c in data.columns]

clinical.rename(columns = {'Tumor--T1_Coded':'T1_Coded'}, inplace = True)
clinical.rename(columns = {'Node-Coded':'Node_Coded'}, inplace = True)
clinical.rename(columns = {'Metastasis-Coded':'Metastasis_Coded'}, inplace = True)
clinical.rename(columns = {'Integrated_Clusters_(with_PAM50)':'Integrated_Clusters_with_PAM50'}, inplace = True)
clinical.rename(columns = {'Integrated_Clusters_(no_exp)':'Integrated_Clusters_no_exp'}, inplace = True)
clinical.rename(columns = {'Integrated_Clusters_(unsup_exp)':'Integrated_Clusters_unsup_exp'}, inplace = True)

clinical.Gender[clinical.Gender == 'MALE'] = 0
clinical.Gender[clinical.Gender == 'FEMALE'] = 1

clinical.ER_Status[clinical.ER_Status == 'Positive'] = 1
clinical.ER_Status[clinical.ER_Status == 'Negative'] = 0

clinical.PR_Status[clinical.PR_Status == 'Positive'] = 1
clinical.PR_Status[clinical.PR_Status == 'Negative'] = 0

clinical.HER2_Final_Status[clinical.HER2_Final_Status == 'Positive'] = 1
clinical.HER2_Final_Status[clinical.HER2_Final_Status == 'Negative'] = 0

clinical.Tumor[clinical.Tumor == 'T1'] = 1
clinical.Tumor[clinical.Tumor == 'T2'] = 2
clinical.Tumor[clinical.Tumor == 'T3'] = 3
clinical.Tumor[clinical.Tumor == 'T4'] = 4

clinical.T1_Coded[clinical.T1_Coded == 'T1'] = 1
clinical.T1_Coded[clinical.T1_Coded == 'T_Other'] = 0

clinical.Node[clinical.Node == 'N0'] = 0
clinical.Node[clinical.Node == 'N1'] = 1
clinical.Node[clinical.Node == 'N2'] = 2
clinical.Node[clinical.Node == 'N3'] = 3

clinical.Node_Coded[clinical.Node_Coded == 'Positive'] = 1
clinical.Node_Coded[clinical.Node_Coded == 'Negative'] = 0

clinical.Metastasis[clinical.Metastasis == 'M0'] = 0
clinical.Metastasis[clinical.Metastasis == 'M1'] = 1
clinical.Metastasis_Coded[clinical.Metastasis_Coded == 'Positive'] = 1
clinical.Metastasis_Coded[clinical.Metastasis_Coded == 'Negative'] = 0

clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage I'] = 1
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IA'] = 11
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IB'] = 12
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage II'] = 2
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IIA'] = 21
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IIB'] = 22
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage III'] = 3
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IIIA'] = 31
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IIIB'] = 32
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IIIC'] = 33
clinical.AJCC_Stage[clinical.AJCC_Stage == 'Stage IV'] = 4

clinical.Converted_Stage[clinical.Converted_Stage == 'No_Conversion'] = 0
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage I'] = 1
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage IIA'] = 21
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage IIB'] = 22
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage IIIA'] = 31
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage IIIB'] = 32
clinical.Converted_Stage[clinical.Converted_Stage == 'Stage IIIC'] = 33

clinical.Survival_Data_Form[clinical.Survival_Data_Form == 'enrollment'] = 1
clinical.Survival_Data_Form[clinical.Survival_Data_Form == 'followup'] = 1

clinical.Vital_Status[clinical.Vital_Status == 'LIVING'] = 0
clinical.Vital_Status[clinical.Vital_Status == 'DECEASED'] = 1

clinical.PAM50_mRNA[clinical.PAM50_mRNA == 'Luminal A'] = 1
clinical.PAM50_mRNA[clinical.PAM50_mRNA == 'Luminal B'] = 2
clinical.PAM50_mRNA[clinical.PAM50_mRNA == 'Basal-like'] = 3
clinical.PAM50_mRNA[clinical.PAM50_mRNA == 'HER2-enriched'] = 4

clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'Basal'] = 1
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'LumA/B'] = 2
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'Her2'] = 3
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'ReacI'] = 4
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'ReacII'] = 5
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'LumA'] = 6
clinical.RPPA_Clusters[clinical.RPPA_Clusters == 'X'] = 7

#for col in clinical.columns: 
    #print(col)

In [2]:
#import pip
#pip.main(['install', 'scikit-multilearn'])

In [80]:
## Change the protein data sample names to a format matching the clinical data set
data.rename(columns=lambda x: "TCGA-%s" % (re.split('[_|-|.]',x)[0]) if bool(re.search("TCGA",x)) is True else x,inplace=True)
 
## Transpose data for the clustering algorithm since we want to divide patient samples, not proteins
data = data.transpose()
 
## Drop clinical entries for samples not in our protein data set
clinical = clinical.loc[[x for x in clinical.index.tolist() if x in data.index],:]
 
## Add clinical meta data to our protein data set, note: all numerical features for analysis start with NP_ or XP_
merged = data.merge(clinical,left_index=True,right_index=True)

print(merged.shape)

(80, 12582)


In [82]:
import numpy as np


def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

merged = clean_dataset(merged)
merged

#np.all(np.isfinite(merged))
#merged.round(3)
#merged = merged.replace([np.inf, -np.inf], np.nan)
#np.where(merged.values >= np.finfo(np.float64).max)
#np.isnan(merged.values.any())
#np.nan_to_num(merged)

Unnamed: 0,NP_958782,NP_958785,NP_958786,NP_000436,NP_958781,NP_958780,NP_958783,NP_958784,NP_112598,NP_001611,...,PAM50_mRNA,SigClust_Unsupervised_mRNA,SigClust_Intrinsic_mRNA,miRNA_Clusters,methylation_Clusters,RPPA_Clusters,CN_Clusters,Integrated_Clusters_with_PAM50,Integrated_Clusters_no_exp,Integrated_Clusters_unsup_exp


In [70]:
#Subsetting the equivocal data and looking at variable counts once again
merged = merged[merged["HER2_Final_Status"] != "Equivocal"]
merged.groupby(["ER_Status", "PR_Status", "HER2_Final_Status"]).size()

ER_Status  PR_Status  HER2_Final_Status
0          0          0                    19
                      1                     7
1          0          0                     6
                      1                     4
           1          0                    35
                      1                     8
dtype: int64

In [71]:
x_train, x_test, y_train, y_test = train_test_split(merged.drop("ER_Status", axis=1), merged["ER_Status"], test_size=0.2, random_state=0)

In [72]:
#k nearest neighbors
from sklearn import neighbors
import matplotlib.pyplot as plt

#Find optimal k
k_values = list(range(1,50))
k_accuracy = []

for i in k_values:
    n_neighbors = i
    knn = neighbors.KNeighborsClassifier(n_neighbors)
    knn.fit(x_train, y_train)
    y_pred = knn.predict(x_test)
    y_delta = abs(y_test - y_pred)
    pred_error = np.sum(y_delta)/y_delta.size
    pred_accuracy = 1 - pred_error
    k_accuracy.append(pred_accuracy)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [55]:
from skmultilearn.adapt import MLkNN

classifier = MLkNN(k=3)

# train
classifier.fit(x_train, y_train)

# predict
predictions = classifier.predict(x_test)

AttributeError: 'Series' object has no attribute 'getformat'