In [1]:
import json
import gzip
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.cluster import KMeans

In [8]:
def generate_features(array):
    mean_array = np.mean(array, axis=0)
    median_array = np.median(array, axis=0)
    max_array = np.max(array, axis=0)
    min_array = np.min(array, axis=0)
    sd_array = np.std(array, axis=0)
    concatenated_array = np.concatenate((mean_array, median_array, max_array, min_array, sd_array))
    return concatenated_array

In [19]:
def cluster_samples(array):
    kmeans = KMeans(n_clusters=2, random_state=0)
    kmeans.fit(array)
    labels = kmeans.labels_
    cluster_1 = array[labels == 0]
    cluster_2 = array[labels == 1]
    return cluster_1, cluster_2

In [22]:
# Parse Json
def parse_row(row):
    ls = []
    for key,value in row.items():
        ls.append(key)
        for key1, value1 in value.items():
            ls.append(key1)
            for key2, value2 in value1.items():
                ls.append(key2)
                array = np.array(value2)
                cluster_1, cluster_2 = cluster_samples(array)
                
                whole_set = generate_features(array).tolist()
                cluster_1_set = generate_features(cluster_1).tolist()
                cluster_2_set = generate_features(cluster_2).tolist()
                ls += whole_set + cluster_1_set + cluster_2_set
    return ls

def process_line(index, line):
    row = json.loads(line)
    parsed_row = parse_row(row)
    print(f"Processed line {index + 1}")
    return parsed_row

def parse_json(json_path,csv_path):
    with gzip.open(json_path, 'rt') as f:
        lines = f.readlines()[:1]

    # Create new dimensions
    columns = ["transcript_id", "transcript_position", "seq"]
    values = ["dt_1", "sd_1", "curr_1", "dt_2", "sd_2", "curr_2", "dt_3", "sd_3", "curr_3"]

    for data_range in ["whole", "cluster_1", "cluster_2"]:
        for aggregate in ["mean","median","max","min","sd"]:
            for val in values:
                columns.append(f"{data_range}_{aggregate}_{val}")

    parsed_rows = []  

    with ThreadPoolExecutor() as executor:
        future_to_index = {executor.submit(process_line, i, line): i for i, line in enumerate(lines)}
        for future in as_completed(future_to_index):
            parsed_row = future.result()
            parsed_rows.append(parsed_row)

    df = pd.DataFrame(parsed_rows, columns = columns)
    
    df["transcript_position"] = df["transcript_position"].astype(int)
    df = df.sort_values(by = ["transcript_id","transcript_position"])

    labels = pd.read_csv(csv_path)
    df_with_labels = df.merge(labels, on = ["transcript_id","transcript_position"])

    return df_with_labels

df = parse_json('data/dataset0.json.gz','data/data.info.labelled')
df.to_parquet("data/data.parquet")

Processed line 1


In [26]:
train_df = pd.read_parquet("train.parquet")
test_df = pd.read_parquet("test.parquet")

In [31]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

log_reg = LogisticRegression(random_state=42, max_iter=100000)
log_reg.fit(X_train, y_train)
y_pred = log_reg.predict(X_test)
y_proba = log_reg.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23273     0]
 [ 1095     0]]
Accuracy: 0.96
ROC AUC: 0.71
PR AUC: 0.09


In [33]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

y_train = train_df['label'] 
X_train = train_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id'])

y_test = test_df['label'] 
X_test = test_df.drop(columns=['label','transcript_id', 'transcript_position', 'seq','gene_id']) 

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
y_proba = rf_model.predict_proba(X_test)[:, 1]

conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

roc_auc = roc_auc_score(y_test, y_proba)
print(f'ROC AUC: {roc_auc:.2f}')

precision, recall, _ = precision_recall_curve(y_test, y_proba)
pr_auc = auc(recall, precision)
print(f'PR AUC: {pr_auc:.2f}')


Confusion Matrix:
[[23135   138]
 [  851   244]]
Accuracy: 0.96
ROC AUC: 0.88
PR AUC: 0.41


In [3]:
df = pd.read_parquet("data/dataset_all_features.parquet")

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming your dataset is in a DataFrame named df and 'column_name' is the column you want to split on

# Step 1: Get the unique values from the specified column
unique_values = df['gene_id'].unique()

# Step 2: Perform an 80-20 split on the unique values
train_values, test_values = train_test_split(unique_values, test_size=0.2, random_state=42)

# Step 3: Filter the dataset based on the train and test values
train_df = df[df['gene_id'].isin(train_values)]
test_df = df[df['gene_id'].isin(test_values)]

# Now train_df and test_df are your train and test datasets with no overlap on 'column_name'
print(f"Train set size: {len(train_df)}")
print(f"Test set size: {len(test_df)}")


Train set size: 96821
Test set size: 25017


In [21]:
train_df.to_parquet("data/train_all_features.parquet")
test_df.to_parquet("data/test_all_features.parquet")

In [22]:
train_df

Unnamed: 0,transcript_id,transcript_position,seq,whole_mean_dt_1,whole_mean_sd_1,whole_mean_curr_1,whole_mean_dt_2,whole_mean_sd_2,whole_mean_curr_2,whole_mean_dt_3,...,cluster_2_sd_sd_1,cluster_2_sd_curr_1,cluster_2_sd_dt_2,cluster_2_sd_sd_2,cluster_2_sd_curr_2,cluster_2_sd_dt_3,cluster_2_sd_sd_3,cluster_2_sd_curr_3,gene_id,label
18,ENST00000000412,355,GAAACTA,0.007340,2.977180,108.360000,0.007782,2.608600,106.584000,0.007045,...,2.295391,2.368845,0.003721,0.605664,1.576356,0.003138,0.442328,2.147127,ENSG00000003056,0
19,ENST00000000412,367,GGGACCG,0.008988,3.961489,118.638298,0.007403,6.045319,122.489362,0.006636,...,1.113159,2.396142,0.002825,1.860764,2.381905,0.004161,1.457417,2.129269,ENSG00000003056,0
20,ENST00000000412,496,AGGACTG,0.011065,7.299608,115.549020,0.009377,5.986667,125.666667,0.006474,...,1.764011,2.193497,0.004701,1.673851,1.558126,0.003241,0.891852,1.524671,ENSG00000003056,0
21,ENST00000000412,501,TGGACTG,0.006904,2.803571,119.142857,0.010334,5.950893,123.821429,0.010214,...,1.846205,1.564059,0.003473,3.109129,2.829401,0.005521,1.089678,1.202462,ENSG00000003056,0
22,ENST00000000412,547,CAGACAG,0.006961,4.949231,108.373077,0.009155,5.005962,123.750000,0.011251,...,1.011763,3.794733,0.003542,1.812695,2.289105,0.008535,0.998551,1.012373,ENSG00000003056,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121833,ENST00000641834,1348,GGGACAT,0.009594,3.294164,118.232877,0.007300,4.929726,116.342466,0.006555,...,1.174303,2.235561,0.004556,1.678792,2.706338,0.005078,2.342449,3.736387,ENSG00000167747,1
121834,ENST00000641834,1429,CTGACAC,0.008393,4.511014,110.969565,0.010305,9.105797,114.927536,0.005568,...,1.979405,3.978471,0.004743,1.382768,2.568104,0.003849,0.796583,3.087344,ENSG00000167747,0
121835,ENST00000641834,1531,TGGACAC,0.008161,3.918438,113.968750,0.006877,4.759687,113.562500,0.006410,...,1.174073,2.582586,0.004565,1.179861,1.791664,0.003236,1.046608,1.858941,ENSG00000167747,1
121836,ENST00000641834,1537,CTGACCA,0.008044,3.191228,109.354386,0.007419,6.552982,123.263158,0.006472,...,0.796188,4.831773,0.002296,2.261881,2.397088,0.001914,0.543752,1.941561,ENSG00000167747,0


In [26]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Example dataset (replace with your own)
# X: Features, y: Binary target
X = train_df.drop(columns = ["transcript_id", "transcript_position", "seq", "gene_id", "label"])
y = train_df["label"]

# Step 1: Set up Logistic Regression with L1 (Lasso) penalty
logistic = LogisticRegression(penalty='l1', solver='liblinear')

# Step 2: Set up the cross-validation grid
# 'C' is the inverse of regularization strength; smaller values imply stronger regularization
param_grid = {
    'logistic__C': np.logspace(-4, 4, 10)  # Search over a range of values for 'C'
}

# Step 3: Use a pipeline for scaling and modeling
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Standardize features (important for regularization)
    ('logistic', logistic)
])

# Step 4: Set up GridSearchCV for cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='average_precision', verbose = 3)  # Adjust scoring if needed

# Step 5: Fit the model with cross-validation
grid_search.fit(X, y)

# Step 6: Get the best model and the selected features
best_model = grid_search.best_estimator_
logistic_model = best_model.named_steps['logistic']

# Step 7: Identify selected features (non-zero coefficients)
selected_features = np.where(logistic_model.coef_[0] != 0)[0]

print(f"Selected features: {selected_features}")
print(f"Best regularization strength (C): {grid_search.best_params_['logistic__C']}")


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .logistic__C=9.999999999999999e-05;, score=0.046 total time=   0.8s
[CV 2/5] END .logistic__C=9.999999999999999e-05;, score=0.046 total time=   0.8s
[CV 3/5] END .logistic__C=9.999999999999999e-05;, score=0.046 total time=   0.8s
[CV 4/5] END .logistic__C=9.999999999999999e-05;, score=0.046 total time=   0.8s
[CV 5/5] END .logistic__C=9.999999999999999e-05;, score=0.046 total time=   0.8s
[CV 1/5] END ..logistic__C=0.000774263682681127;, score=0.080 total time=   0.8s
[CV 2/5] END ..logistic__C=0.000774263682681127;, score=0.083 total time=   0.9s
[CV 3/5] END ..logistic__C=0.000774263682681127;, score=0.084 total time=   0.8s
[CV 4/5] END ..logistic__C=0.000774263682681127;, score=0.085 total time=   0.8s
[CV 5/5] END ..logistic__C=0.000774263682681127;, score=0.075 total time=   0.8s
[CV 1/5] END ..logistic__C=0.005994842503189409;, score=0.205 total time=   2.5s
[CV 2/5] END ..logistic__C=0.005994842503189409;

In [28]:
features_pr_auc = X.columns[selected_features].tolist()

In [29]:
with open("data/features_pr_auc.json","w") as file:
    json.dump(features_pr_auc, file)