In [2]:
import json
import gzip
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

In [10]:
def generate_features(value):
    """Summarise a collection of readings with five column-wise statistics.

    Parameters
    ----------
    value : array-like
        Converted with ``np.array``; every statistic is taken along axis 0.

    Returns
    -------
    numpy.ndarray
        The mean, median, max, min and standard deviation (in that order),
        joined end to end into a single 1-D array.
    """
    readings = np.array(value)
    # Order matters: downstream column names assume mean, median, max, min, sd.
    reducers = (np.mean, np.median, np.max, np.min, np.std)
    return np.concatenate([reducer(readings, axis=0) for reducer in reducers])

def process_line(index, line):
    """Decode one JSON line and flatten it into a feature row.

    Parameters
    ----------
    index : int
        Zero-based position of the line; used only for the progress message.
    line : str
        JSON text of a single record.

    Returns
    -------
    list
        The flat list produced by ``parse_row`` for the decoded record.
    """
    flattened = parse_row(json.loads(line))
    # Progress is reported 1-based, after the row has been parsed successfully.
    print(f"Processed line {index + 1}")
    return flattened

def parse_row(row):
    """Flatten a three-level nested mapping into a single list.

    Walks ``row`` -> inner mapping -> innermost mapping. Each key met on the
    way is appended to the output, and each innermost value is replaced by the
    statistics returned from ``generate_features``.

    Parameters
    ----------
    row : dict
        Mapping of mappings of mappings; the innermost values are passed to
        ``generate_features`` unchanged.

    Returns
    -------
    list
        Keys and feature values in traversal order.
    """
    flat = []
    for outer_key, middle in row.items():
        flat.append(outer_key)
        for middle_key, inner in middle.items():
            flat.append(middle_key)
            for inner_key, readings in inner.items():
                flat.append(inner_key)
                # Convert to plain Python numbers before extending the row.
                flat.extend(generate_features(readings).tolist())
    return flat

In [13]:
# Parse JSON
def parse_row(row):
    """Flatten a three-level nested mapping into a single list.

    NOTE(review): this function is also defined, with the same body, in an
    earlier cell of this notebook; whichever cell runs last wins.

    Parameters
    ----------
    row : dict
        Mapping of mappings of mappings; the innermost values are passed to
        ``generate_features`` unchanged.

    Returns
    -------
    list
        Every key met during the traversal, each innermost key followed by
        the feature values computed for its value.
    """
    flat = []
    for level1_key, level1_val in row.items():
        flat.append(level1_key)
        for level2_key, level2_val in level1_val.items():
            flat.append(level2_key)
            for level3_key, readings in level2_val.items():
                flat.append(level3_key)
                # Convert to plain Python numbers before extending the row.
                flat.extend(generate_features(readings).tolist())
    return flat

def process_line(index, line):
    """Decode one JSON line, flatten it, and report progress.

    NOTE(review): this function is also defined, with the same body, in an
    earlier cell of this notebook; whichever cell runs last wins.

    Parameters
    ----------
    index : int
        Zero-based position of the line; used only for the progress message.
    line : str
        JSON text of a single record.

    Returns
    -------
    list
        The flat list produced by ``parse_row`` for the decoded record.
    """
    record = json.loads(line)
    flattened = parse_row(record)
    # Printed once per line; parse_json calls this from worker threads.
    print(f"Processed line {index + 1}")
    return flattened

def parse_json(json_path,csv_path):
    """Build the labelled feature table from a gzipped JSON-lines file.

    Each non-blank line of ``json_path`` is handed to ``process_line``, which
    returns one flat row. The rows are assembled into a DataFrame, sorted by
    transcript and position, and joined to the labels read from ``csv_path``.

    Parameters
    ----------
    json_path : str or path-like
        Gzip-compressed text file with one JSON record per line.
    csv_path : str or path-like
        CSV file readable by ``pandas.read_csv``; it must contain the
        ``transcript_id`` and ``transcript_position`` columns used as join keys.

    Returns
    -------
    pandas.DataFrame
        Feature columns plus the columns of the label file. ``merge`` uses its
        default inner join, so rows without a matching label are dropped.
    """
    with gzip.open(json_path, 'rt') as f:
        # Blank lines (e.g. a trailing newline) are skipped because json.loads
        # would raise on them. The raw lines are deliberately not printed:
        # dumping the whole dataset floods the notebook output.
        lines = [line for line in f if line.strip()]

    # Column layout mirrors the flat row built by parse_row: three identifiers,
    # then one group of per-feature columns for every aggregate, in the order
    # generate_features concatenates them.
    # Assumes 9 values per aggregate (3 + 5 * 9 = 48 entries per row) — confirm
    # against the data if the input format changes.
    values = ["dt_1", "sd_1", "curr_1", "dt_2", "sd_2", "curr_2", "dt_3", "sd_3", "curr_3"]
    columns = ["transcript_id", "transcript_position", "seq"] + [
        f"{aggregate}_{val}"
        for aggregate in ["mean", "median", "max", "min", "sd"]
        for val in values
    ]

    # Executor.map yields results in input order, so the collected rows are
    # deterministic regardless of which worker finishes first.
    with ThreadPoolExecutor() as executor:
        parsed_rows = list(executor.map(process_line, range(len(lines)), lines))

    df = pd.DataFrame(parsed_rows, columns=columns)

    # Positions are JSON object keys (strings); cast so they sort numerically
    # and can be matched against the label file's position column.
    df["transcript_position"] = df["transcript_position"].astype(int)
    df = df.sort_values(by=["transcript_id", "transcript_position"])

    labels = pd.read_csv(csv_path)
    df_with_labels = df.merge(labels, on=["transcript_id", "transcript_position"])

    return df_with_labels

# Build the labelled feature table from the raw reads and the label file,
# then persist it as parquet so later steps can reload it without re-parsing.
df = parse_json('data/dataset0.json.gz','data/data.info.labelled')
df.to_parquet("data/data.parquet")

In [3]:
# Read the whole gzipped file into memory as a list of text lines
# ('rt' opens the archive in text mode, so each element is a str).
with gzip.open('data/dataset0.json.gz', 'rt') as f:
    lines = f.readlines()

In [12]:
# Sanity check: number of entries in the flat row built from the first line.
len(process_line(0, lines[0]))

Processed line 1


48

In [26]:
# Load the train and test tables used by the modelling cells below.
# NOTE(review): no cell visible here writes these two files (the parsing cell
# writes data/data.parquet) — presumably the split is produced elsewhere; confirm.
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")

In [31]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

# --- Logistic-regression baseline -------------------------------------------
# Target is the 'label' column; the label and the listed metadata columns are
# dropped so that only the remaining columns are used as model inputs.
y_train, X_train = (
    train_df['label'],
    train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']),
)
y_test, X_test = (
    test_df['label'],
    test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']),
)

# fit() returns the estimator itself, so construction and fitting are chained.
log_reg = LogisticRegression(random_state=42, max_iter=100000).fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
# Second predict_proba column — presumably the positive class (label 1); confirm
# against log_reg.classes_.
y_proba = log_reg.predict_proba(X_test)[:, 1]

# Thresholded metrics use y_pred; ranking metrics use y_proba.
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

# PR AUC here is the trapezoidal area under the precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23273     0]
 [ 1095     0]]
Accuracy: 0.96
ROC AUC: 0.71
PR AUC: 0.09


In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

# --- Random-forest model -----------------------------------------------------
# Same split as the logistic-regression cell: 'label' is the target, and the
# label plus the listed metadata columns are removed from the model inputs.
y_train, X_train = (
    train_df['label'],
    train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']),
)
y_test, X_test = (
    test_df['label'],
    test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']),
)

# fit() returns the estimator itself, so construction and fitting are chained.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# Second predict_proba column — presumably the positive class (label 1); confirm
# against rf_model.classes_.
y_proba = rf_model.predict_proba(X_test)[:, 1]

# Thresholded metrics use y_pred; ranking metrics use y_proba.
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

# PR AUC here is the trapezoidal area under the precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23135   138]
 [  851   244]]
Accuracy: 0.96
ROC AUC: 0.88
PR AUC: 0.41
