In [2]:
import json
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

In [20]:
# Parse Json
def parse_row(row):
    ls = []

    transcript_id = None
    position = None
    seq = None

    for key,value in row.items():
        ls.append(key)
        for key1, value1 in value.items():
            ls.append(key1)
            for key2, value2 in value1.items():
                ls.append(key2)
                array = np.array(value2)
                mean_array = np.mean(array, axis=0)
                mean_list = mean_array.tolist()
                ls += mean_list

    return ls

def process_line(index, line):
    row = json.loads(line)
    parsed_row = parse_row(row)
    print(f"Processed line {index + 1}")
    return parsed_row

def parse_json(json_path,csv_path):
    with open(json_path) as f:
        lines = f.readlines()

    parsed_rows = []  

    with ThreadPoolExecutor() as executor:
        future_to_index = {executor.submit(process_line, i, line): i for i, line in enumerate(lines)}
        for future in as_completed(future_to_index):
            parsed_row = future.result()
            parsed_rows.append(parsed_row)

    df = pd.DataFrame(parsed_rows, columns=["transcript_id", "transcript_position", "seq", 
                                            "dt_1", "sd_1", "curr_1", 
                                            "dt_2", "sd_2", "curr_2", 
                                            "dt_3", "sd_3", "curr_3"])
    
    df["transcript_position"] = df["transcript_position"].astype(int)
    df = df.sort_values(by = ["transcript_id","transcript_position"])

    labels = pd.read_csv(csv_path)
    df_with_labels = df.merge(labels, on = ["transcript_id","transcript_position"])

    return df_with_labels

df = parse_json('dataset0.json','data_info.csv')
df.to_parquet("data.parquet")

Processed line 1
Processed line 2
Processed line 3
Processed line 4
Processed line 5
Processed line 6
Processed line 7
Processed line 8
Processed line 9
Processed line 10
Processed line 11
Processed line 12
Processed line 13
Processed line 14
Processed line 15
Processed line 16
Processed line 17
Processed line 18
Processed line 19
Processed line 20
Processed line 21
Processed line 22
Processed line 23
Processed line 24
Processed line 25
Processed line 26
Processed line 27
Processed line 28
Processed line 29
Processed line 30
Processed line 31
Processed line 32
Processed line 33
Processed line 34
Processed line 35
Processed line 36
Processed line 37
Processed line 38
Processed line 39
Processed line 40
Processed line 41
Processed line 42
Processed line 43
Processed line 44
Processed line 45
Processed line 46
Processed line 47
Processed line 48
Processed line 49
Processed line 50
Processed line 51
Processed line 52
Processed line 53
Processed line 54
Processed line 55
Processed line 56
P

In [26]:
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")

In [31]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

log_reg = LogisticRegression(random_state=42, max_iter=100000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23273     0]
 [ 1095     0]]
Accuracy: 0.96
ROC AUC: 0.71
PR AUC: 0.09


In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23135   138]
 [  851   244]]
Accuracy: 0.96
ROC AUC: 0.88
PR AUC: 0.41
