In [1]:
# header files
%matplotlib inline
import os
import glob
import csv
import numpy as np
import pandas as pd
from sksurv.nonparametric import kaplan_meier_estimator
from sksurv.linear_model import CoxnetSurvivalAnalysis
from sksurv.linear_model import CoxPHSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest
from sksurv.svm import HingeLossSurvivalSVM
from sksurv.metrics import (
    concordance_index_censored,
    concordance_index_ipcw,
    cumulative_dynamic_auc,
    integrated_brier_score,
)
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_regression, f_classif
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [4, 4]
print("Header files loaded!")

Header files loaded!


In [2]:
# load ovarian cancer files
bc_files = (glob.glob("data/files/*"))
print(len(bc_files))

714


In [3]:
# collect features
is_breast_cancer = 1
if is_breast_cancer:
    collagen_features = []
    for file in bc_files:
        filename = file.split("/")[-1]
        flag = -1
        file_features = []
        
        if os.path.isfile("results/collagen_dilated/200/" + filename):
            with open("results/collagen_dilated/200/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
            with open("results/collagen/200/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
                            
            with open("results/collagen_dilated/400/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
            with open("results/collagen/400/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
            
            with open("results/collagen_dilated/600/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
            with open("results/collagen/600/" + filename, newline='') as csvfile:
                spamreader = csv.reader(csvfile)
                for row in spamreader:
                    if flag == -1:
                        array = row
                        for index in range(0, len(array)-1):
                            if index == 1:
                                continue
                            file_features.append(float(array[index]))
            collagen_features.append(file_features)
print(len(collagen_features))

714


In [4]:
# train
is_breast_cancer = 1
if is_breast_cancer:
    censor = []
    days = []
    filenames = []
    treatment = []
    flag = -1
    with open("data/dcis_train_updated.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                filenames.append(array[1])
                days.append(float(array[6]))
                
                if array[11] == "A1":
                    censor.append(True)
                elif array[11] == "A2":
                    censor.append(True)
                elif array[11] == "B01":
                    censor.append(False)
                elif array[11] == "B02":
                    censor.append(False)
                elif array[11] == "B12":
                    censor.append(False)
                else:
                    censor.append(False)
                    
                if array[9] == "No" and array[10] == "No":
                    treatment.append(0)
                elif array[9] == "Yes" and array[10] == "No":
                    treatment.append(1)
                elif array[9] == "No" and array[10] == "Yes":
                    treatment.append(2)
                else:
                    treatment.append(3)
                
    train_y = []
    train_event = []
    train_survival_time = []
    train_features = []
    train_filenames = []
    main_count = 0
    for file in bc_files:
        count = 0
        filename1 = file.split("/")[-1][:-4]
        for filename in filenames:
            filename2 = filename
            if filename1 == filename2 and (treatment[count] == 2):
                train_features.append(collagen_features[main_count])
                train_y.append([censor[count], days[count]])
                train_event.append(censor[count])
                train_survival_time.append(days[count])
                train_filenames.append(filename1)
                break
            count += 1
        main_count += 1
    print(len(train_y))
    print(len(train_event))
    print(len(train_survival_time))
    print(len(train_features))

['SRNO', 'trialno', 'studynumber', 'tripletNO', 'Age', 'nbdtype', 'Time-to-IBE', 'TotalFU', 'tcode', 'RT', 'Tam', 'subgroup']
102
102
102
102


# test
is_breast_cancer = 1
if is_breast_cancer:
    censor = []
    days = []
    filenames = []
    treatment = []
    flag = -1
    with open("data/dcis_train_updated.csv", newline='', encoding = "ISO-8859-1") as csvfile:
        spamreader = csv.reader(csvfile)
        for row in spamreader:
            if flag == -1:
                flag = 1
                print(row)
            else:
                array = row
                filenames.append(array[1])
                days.append(float(array[6]))
                
                if array[11] == "A1":
                    censor.append(True)
                elif array[11] == "A2":
                    censor.append(True)
                elif array[11] == "B01":
                    censor.append(False)
                elif array[11] == "B02":
                    censor.append(False)
                elif array[11] == "B12":
                    censor.append(False)
                else:
                    censor.append(False)
                    
                if array[9] == "No" and array[10] == "No":
                    treatment.append(0)
                elif array[9] == "Yes" and array[10] == "No":
                    treatment.append(1)
                elif array[9] == "No" and array[10] == "Yes":
                    treatment.append(2)
                else:
                    treatment.append(3)
                
    test_y = []
    test_event = []
    test_survival_time = []
    test_features = []
    test_filenames = []
    main_count = 0
    for file in bc_files:
        count = 0
        filename1 = file.split("/")[-1][:-4]
        for filename in filenames:
            filename2 = filename
            if filename1 == filename2 and (treatment[count] == 0):
                test_features.append(collagen_features[main_count])
                test_y.append([censor[count], days[count]])
                test_event.append(censor[count])
                test_survival_time.append(days[count])
                test_filenames.append(filename1)
                break
            count += 1
        main_count += 1
    print(len(test_y))
    print(len(test_event))
    print(len(test_survival_time))
    print(len(test_features))

In [5]:
# final training information to be used for training model
train_features = np.array(train_features)
train_y = np.array(train_y)
train_event = np.array(train_event)
train_survival_time = np.array(train_survival_time)

In [8]:
# final training information to be used for training model
test_features = np.array(collagen_features)
#test_y = np.array(test_y)
#test_event = np.array(test_event)
#test_survival_time = np.array(test_survival_time)

In [9]:
# run on test set
test_group = []
train_group = []
features_train = train_features
features_test = test_features
y_train = train_y
dt = dtype=[('Status', '?'), ('Survival_in_days', '<f8')]
y_train = np.array([tuple(row) for row in y_train], dtype=dt)
scaler = MinMaxScaler()
features_train = scaler.fit_transform(features_train)
features_test = scaler.transform(features_test)
features_train_df = pd.DataFrame(features_train)
features_test_df = pd.DataFrame(features_test)

# fit model
estimator = CoxnetSurvivalAnalysis(l1_ratio=0.9)
estimator.fit(features_train_df, y_train)
#score, _, _, _, _ = concordance_index_censored(test_event, test_survival_time, estimator.predict(features_test_df))
#print("Test: " + str(score))
score, _, _, _, _ = concordance_index_censored(train_event, train_survival_time, estimator.predict(features_train_df))
print("Train: " + str(score))

# get risk scores
train_risk_scores = estimator.predict(features_train_df)
test_risk_scores = estimator.predict(features_test_df)

risk_scores = []
for index in range(0, len(train_risk_scores)):
    risk_scores.append(train_risk_scores[index])
median = np.percentile(risk_scores, 66)
print(median)
count_low = 0
count_high = 0
for index in range(0, len(train_risk_scores)):
    if train_risk_scores[index] > median:
        count_high += 1
        train_group.append(1)
    else:
        count_low += 1
        train_group.append(0)
count_low = 0
count_high = 0
for index in range(0, len(test_risk_scores)):
    if test_risk_scores[index] > median:
        count_high += 1
        test_group.append(1)
    else:
        count_low += 1
        test_group.append(0)

Train: 0.7335155449265459
0.3994967558194817


In [10]:
# find prognostic features from model trained above
count = 0
for index1 in range(0, len(estimator.coef_)):
    flag = -1
    for index2 in range(0, len(estimator.coef_)):
        if estimator.coef_[index1][index2] > 0 or estimator.coef_[index1][index2] < 0:
            flag = 1
            print(str(index1) + " " + str(estimator.coef_[index1][index2]) + " " + str(np.exp(estimator.coef_[index1][index2])))
            break
    if flag == 1:
        count += 1
print()
print("Prognostic features count = " + str(count))

9 0.1586626926673168 1.1719425751795214
10 -0.13839624408073103 0.8707535924217591
11 -0.20507274956963426 0.81458805332545

Prognostic features count = 3


In [11]:
print(*test_survival_time, sep="; ")

NameError: name 'test_survival_time' is not defined

In [12]:
a = []
for index in range(0, len(test_event)):
    if test_event[index] == False:
        a.append(0)
    else:
        a.append(1)
print(*a, sep="; ")

NameError: name 'test_event' is not defined

In [13]:
print(*test_group, sep="; ")

1; 0; 0; 1; 1; 1; 0; 0; 1; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 0; 0; 0; 1; 0; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 1; 0; 0; 0; 1; 0; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 1; 1; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 0; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 1; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 1; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 1; 1; 0; 0; 0; 0; 1; 0; 1; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 1; 1; 0; 0; 1; 0; 0; 1; 0; 0; 1; 0; 0; 1; 0; 0; 1; 1; 0; 0; 0; 1; 1; 1; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 1; 0; 1; 1; 1; 0; 1; 0; 0; 0; 0; 0; 0; 1; 1; 0; 1; 0; 1; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0; 0; 1; 0; 1; 0; 1; 1; 0; 0; 1; 1; 1; 1; 0; 0; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 0; 1; 0; 0

In [19]:
print(*test_risk_scores, sep=", ")

2.316019003523844, -1.074271667465354, 0.13570381586685865, 1.0518009573356537, 1.6461410827221463, 1.1119103995318005, -0.24875551915931737, 0.30278038084840775, 1.1671165302057376, -0.4745649684626745, 1.319782812000534, 0.05403730935585349, -0.6076054828084341, 0.2467396241825306, -0.39169743755088504, -0.6743861472282822, -1.4989573796816327, 0.7614692627883706, 0.041711406957417374, 1.08486923630923, -0.7401739708071675, 0.2279620789890111, -1.5237308613559355, -1.4533298800728591, 2.1183528430574743, -0.8275482452428267, -1.837832353579512, 0.5261559208360522, -0.47333511662688554, 0.06864222765195183, 0.264133048820274, 1.2296800360112767, -0.6179270980775162, -0.7364384425552699, 0.32407370562710947, -3.4793768855520746, 0.2110861997417004, 0.9632254368904931, 4.050498998809417, -0.3623852232264353, -0.9972723202103602, 0.3331924815821772, 0.08122094326952456, 1.0481736766203542, -0.10135690416187426, -0.5945821207391404, -0.6024232086058934, 0.712413335626308, -0.0698285901122

In [15]:
print(*train_survival_time, sep="; ")

4400.0; 4448.0; 889.0; 952.0; 5310.0; 234.0; 1382.0; 3603.0; 5052.0; 3177.0; 4095.0; 4357.0; 4723.0; 2229.0; 4528.0; 196.0; 2438.0; 4648.0; 302.0; 1076.0; 4632.0; 1998.0; 5269.0; 5080.0; 2205.0; 1935.0; 5885.0; 5667.0; 1274.0; 295.0; 2154.0; 3981.0; 4862.0; 5879.0; 3159.0; 4007.0; 1473.0; 4463.0; 4862.0; 5437.0; 4830.0; 814.0; 3080.0; 1143.0; 5599.0; 5555.0; 3492.0; 946.0; 6338.0; 380.0; 6034.0; 5401.0; 5523.0; 1281.0; 3959.0; 5774.0; 548.0; 5339.0; 4157.0; 4989.0; 1380.0; 316.0; 3601.0; 5542.0; 4040.0; 2905.0; 6136.0; 5094.0; 6115.0; 1749.0; 5186.0; 4275.0; 4456.0; 4652.0; 4555.0; 4756.0; 5611.0; 847.0; 5931.0; 362.0; 5617.0; 4248.0; 4492.0; 258.0; 4351.0; 4045.0; 1143.0; 5780.0; 4633.0; 5731.0; 5710.0; 330.0; 402.0; 4591.0; 4890.0; 6039.0; 6286.0; 3899.0; 4455.0; 2702.0; 4687.0; 4429.0


In [16]:
a = []
for index in range(0, len(train_event)):
    if train_event[index] == False:
        a.append(0)
    else:
        a.append(1)
print(*a, sep="; ")

0; 0; 1; 1; 0; 1; 1; 0; 0; 1; 0; 0; 0; 1; 0; 1; 1; 0; 1; 1; 0; 1; 0; 0; 1; 1; 0; 0; 1; 1; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 1; 1; 1; 0; 0; 0; 1; 0; 1; 0; 0; 0; 1; 0; 0; 1; 0; 0; 0; 1; 1; 0; 0; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 0; 0; 0; 1; 0; 1; 0; 0; 0; 1; 0; 0; 1; 0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 1; 0; 0


In [21]:
print(*train_group, sep=", ")

0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0


In [20]:
print(*train_risk_scores, sep=", ")

-0.24875551915931737, 1.1671165302057376, 0.5261559208360522, -0.10135690416187426, -0.06982859011220555, 0.7864269229268359, 1.3246967397668368, 0.26673578351992, -0.9223662036942253, 0.017259725105119195, -0.6526837939447976, -0.12947217798681443, -0.6748597505413874, -0.1437630258782856, -0.4622610355720096, -0.24705811846101655, 0.5362284178422225, -1.4758778522605174, -0.42950059195803725, 1.016474956559045, -0.9932869047752866, 0.7151339644694121, -0.2777618474995984, 0.2888131079568732, -0.9110238686427197, 1.257830408073687, -2.817667008343377, -0.5897150454599198, 1.630884369096584, -2.1029252023129956, -0.6148364552440886, -0.5529965131845809, -1.301441213544405, -0.3793852175170409, 1.243693185004965, -1.3836962237471386, 0.2058761196576322, 0.45447379009167577, -1.3409439794994884, 0.13363430326489611, 0.19416403675184823, 1.1810594621321053, 0.8320726678714896, 0.9644383274446047, 0.24764567197766318, 0.6906161834975943, -1.0109501388934592, 1.263886672870669, 1.9668059547

In [None]:
### FOR WRITING RISK SCORES TO FILE

In [None]:
actual_files = []
flag = -1
with open("../../dcis_qmul/data/dcis_qmul.csv", newline='') as csvfile:
    spamreader = csv.reader(csvfile)
    for row in spamreader:
        if flag == -1:
            array = row
            flag = 1
        else:
            array = row
            actual_files.append(array[0])
print(len(actual_files))

In [None]:
row_1 = []
row_2 = []
row_3 = []
for index in range(0, len(bc_files)):
    filename1 = bc_files[index].split("/")[-1][:-4]
    for index1 in range(0, len(actual_files)):
        filename2 = actual_files[index1]
        if filename1 in filename2:
            row_1.append(actual_files[index1])
            row_2.append(test_risk_scores[index])
            
            g = 0
            if test_group[index] == 1:
                g = 1
            else:
                g = 0
            row_3.append(g)
            break
print(len(row_1))
print(len(row_2))
print(len(row_3))

In [None]:
count = 0
for index in range(0, len(row_3)):
    if row_3[index] == 0:
        count += 1
print(count)

In [None]:
with open("../Collagen_Signature_TAM.csv", 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(["Patient ID", "Risk Score", "Risk (Category)"])
    for index in range(0, len(row_1)):    
        spamwriter.writerow([str(row_1[index]), str(row_2[index]), str(row_3[index])])