In [11]:
import json
import gzip
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.cluster import KMeans

In [8]:
def generate_features(array):
    mean_array = np.mean(array, axis=0)
    median_array = np.median(array, axis=0)
    max_array = np.max(array, axis=0)
    min_array = np.min(array, axis=0)
    sd_array = np.std(array, axis=0)
    concatenated_array = np.concatenate((mean_array, median_array, max_array, min_array, sd_array))
    return concatenated_array

In [19]:
def cluster_samples(array):
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(array)
    labels = kmeans.labels_
    cluster_1 = array[labels == 0]
    cluster_2 = array[labels == 1]
    return cluster_1, cluster_2

In [22]:
# Parse Json
def parse_row(row):
    ls = []
    for key,value in row.items():
        ls.append(key)
        for key1, value1 in value.items():
            ls.append(key1)
            for key2, value2 in value1.items():
                ls.append(key2)
                array = np.array(value2)
                cluster_1, cluster_2 = cluster_samples(array)
                
                whole_set = generate_features(array).tolist()
                cluster_1_set = generate_features(cluster_1).tolist()
                cluster_2_set = generate_features(cluster_2).tolist()
                ls += whole_set + cluster_1_set + cluster_2_set
    return ls

def process_line(index, line):
    row = json.loads(line)
    parsed_row = parse_row(row)
    print(f"Processed line {index + 1}")
    return parsed_row

def parse_json(json_path,csv_path):
    with gzip.open(json_path, 'rt') as f:
        lines = f.readlines()[:1]

    # Create new dimensions
    columns = ["transcript_id", "transcript_position", "seq"]
    values = ["dt_1", "sd_1", "curr_1", "dt_2", "sd_2", "curr_2", "dt_3", "sd_3", "curr_3"]

    for data_range in ["whole", "cluster_1", "cluster_2"]:
        for aggregate in ["mean","median","max","min","sd"]:
            for val in values:
                columns.append(f"{data_range}_{aggregate}_{val}")

    parsed_rows = []  

    with ThreadPoolExecutor() as executor:
        future_to_index = {executor.submit(process_line, i, line): i for i, line in enumerate(lines)}
        for future in as_completed(future_to_index):
            parsed_row = future.result()
            parsed_rows.append(parsed_row)

    df = pd.DataFrame(parsed_rows, columns = columns)
    
    df["transcript_position"] = df["transcript_position"].astype(int)
    df = df.sort_values(by = ["transcript_id","transcript_position"])

    labels = pd.read_csv(csv_path)
    df_with_labels = df.merge(labels, on = ["transcript_id","transcript_position"])

    return df_with_labels

df = parse_json('data/dataset0.json.gz','data/data.info.labelled')
df.to_parquet("data/data.parquet")

Processed line 1


In [26]:
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")

In [31]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

log_reg = LogisticRegression(random_state=42, max_iter=100000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23273     0]
 [ 1095     0]]
Accuracy: 0.96
ROC AUC: 0.71
PR AUC: 0.09


In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23135   138]
 [  851   244]]
Accuracy: 0.96
ROC AUC: 0.88
PR AUC: 0.41


In [24]:
pd.read_parquet("data/dataset_all_features.parquet")

Unnamed: 0,transcript_id,transcript_position,seq,whole_mean_dt_1,whole_mean_sd_1,whole_mean_curr_1,whole_mean_dt_2,whole_mean_sd_2,whole_mean_curr_2,whole_mean_dt_3,...,cluster_2_sd_sd_1,cluster_2_sd_curr_1,cluster_2_sd_dt_2,cluster_2_sd_sd_2,cluster_2_sd_curr_2,cluster_2_sd_dt_3,cluster_2_sd_sd_3,cluster_2_sd_curr_3,gene_id,label
0,ENST00000000233,244,AAGACCA,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,...,1.331372,2.297925,0.004924,2.697470,2.029415,0.004075,3.021890,2.461195,ENSG00000004059,0
1,ENST00000000233,261,CAAACTG,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,...,2.160400,3.475172,0.003448,0.788118,2.110211,0.003597,1.184897,2.557029,ENSG00000004059,0
2,ENST00000000233,316,GAAACAG,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,...,1.147157,1.376761,0.005301,0.721445,1.853102,0.004336,0.824743,1.446572,ENSG00000004059,0
3,ENST00000000233,332,AGAACAT,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006102,...,2.335826,2.644269,0.004252,1.018904,1.932062,0.003584,0.661910,1.822120,ENSG00000004059,0
4,ENST00000000233,368,AGGACAA,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,...,1.383761,1.883420,0.005636,1.859353,2.004888,0.004506,1.466036,2.251555,ENSG00000004059,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,1.174303,2.235561,0.004556,1.678792,2.706338,0.005078,2.342449,3.736387,ENSG00000167747,1
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,1.979405,3.978471,0.004743,1.382768,2.568104,0.003849,0.796583,3.087344,ENSG00000167747,0
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,...,1.174073,2.582586,0.004565,1.179861,1.791664,0.003236,1.046608,1.858941,ENSG00000167747,1
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,0.796188,4.831773,0.002296,2.261881,2.397088,0.001914,0.543752,1.941561,ENSG00000167747,0
