# Workflow
---
This notebook is a summarised version of the essential steps needed to train our model and generate predictions.
We assume method 2 (the column version) is used in this notebook.

What you need for this notebook:
- Original dataset

The notebook contains 3 sections:
1. Data parsing & preprocessing (+ feature engineering & selection)
2. Model training
3. Prediction

We assume XGBoost is used as the model here.

## Section 1: Data Parsing & Preprocessing

In [1]:
import json
import pandas as pd
import numpy as np

In [None]:
# !gzip -d dataset2.json.gz


In [2]:
## Mount Google Drive and make the project folder the working directory, so
## that the relative file names used by the later cells resolve inside it.
## NOTE(review): this cell only runs on Google Colab and the Drive path is
## hard-coded; adjust it when running elsewhere.

from google.colab import drive
drive.mount("/content/drive")

import os
# Every relative path opened after this point is resolved against this folder.
os.chdir("/content/drive/MyDrive/DSA4266_Tundra")

Mounted at /content/drive


In [3]:
# Parse the dataset file, which holds one JSON document per line, into a list
# of Python objects (one entry per line).
data_list = []

# JSON text is UTF-8, so pin the encoding instead of relying on the platform default.
with open('dataset1.json', 'r', encoding='utf-8') as json_file:
    for line in json_file:
        # A blank line (for example a trailing newline at the end of the file)
        # is not valid JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        data_list.append(json.loads(line))

In [4]:
def explore_dataset(data_list):
    """Summarise every parsed record into one flat feature row.

    Each record is read as a nested mapping of the form
    ``{transcript_id: {position: {sequence: measurements}}}``; only the first
    key at each nesting level is used. ``measurements`` is reduced column-wise
    (``axis=0``) with the mean, the median and the standard deviation.
    (Presumably ``measurements`` has one row per read -- confirm against the
    dataset description.)

    Parameters
    ----------
    data_list : iterable of dict
        Parsed records, one per dataset line.

    Returns
    -------
    list of list
        One row per record: ``[transcript_id, position, sequence]`` followed by
        the column-wise means, then the medians, then the standard deviations.
    """
    rows = []
    for record in data_list:
        # Peel off the three nesting levels, taking the first entry of each.
        trans_id, by_position = next(iter(record.items()))
        position, by_sequence = next(iter(by_position.items()))
        sequence, measurements = next(iter(by_sequence.items()))

        # Column-wise summaries, computed in the order mean, median, std.
        summaries = [
            reducer(measurements, axis=0)
            for reducer in (np.mean, np.median, np.std)
        ]

        row = [trans_id, position, sequence]
        for summary in summaries:
            row.extend(summary)
        rows.append(row)

    return rows

In [5]:
# Summarise every parsed record into one row, then tabulate the rows.
# No column names are passed, so the frame gets default integer column labels.
results = explore_dataset(data_list)
result_df = pd.DataFrame(results)

In [6]:
# Preview the first rows of the summary table (columns are still integer-labelled).
result_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,ENST00000000233,244,AAGACCA,0.008722,3.990545,123.090909,0.010218,6.716545,125.50303,0.006773,...,79.9,0.006961,2.043149,5.111862,0.00774,3.162649,2.971836,0.004178,2.649395,2.414444
1,ENST00000000233,261,CAAACTG,0.006292,2.849892,109.027711,0.007084,3.197289,107.190964,0.007168,...,94.15,0.003249,1.066933,3.038837,0.004254,1.428168,3.689105,0.005046,1.082745,2.752559
2,ENST00000000233,316,GAAACAG,0.007237,2.78271,105.440237,0.00705,3.463964,98.539053,0.007141,...,88.9,0.004005,1.437534,1.661621,0.004286,0.972003,3.335626,0.003814,0.777871,1.486479
3,ENST00000000233,332,AGAACAT,0.010299,6.042386,129.304348,0.008195,2.733641,97.320652,0.005806,...,89.9,0.005658,2.288102,2.673294,0.004195,0.943466,1.945091,0.003578,1.054379,3.112562
4,ENST00000000233,368,AGGACAA,0.011535,6.031751,118.0,0.012108,5.748927,121.485876,0.009484,...,85.2,0.007941,1.345474,2.928527,0.005699,1.745973,2.321862,0.004775,1.206345,2.331742


In [7]:
# Name the columns: three identifier columns followed by one column per
# (statistic, position, signal feature) combination, e.g. 'avg_central_mean'.
# The statistic is the outermost level because explore_dataset appends the
# mean vector, then the median vector, then the std vector.
# NOTE(review): assumes each per-record vector is ordered position-major
# (1-flank, central, 1+flank) with length/std/mean inside each -- confirm
# against the dataset description.
id_columns = ['transcript_id', 'transcript_position', 'sequence']
feature_columns = [
    '_'.join((statistic, position, feature))
    for statistic in ['avg', 'med', 'std']
    for position in ['1-flank', 'central', '1+flank']
    for feature in ["length", "std", "mean"]
]
colnames = id_columns + feature_columns

result_df.columns = colnames

In [8]:
# Positions are taken from JSON object keys, which json.loads returns as
# strings, so cast the column to int.
result_df['transcript_position'] = result_df['transcript_position'].astype(int)

In [9]:
# Preview the table again, now with named columns and integer positions.
result_df.head()

Unnamed: 0,transcript_id,transcript_position,sequence,avg_1-flank_length,avg_1-flank_std,avg_1-flank_mean,avg_central_length,avg_central_std,avg_central_mean,avg_1+flank_length,...,med_1+flank_mean,std_1-flank_length,std_1-flank_std,std_1-flank_mean,std_central_length,std_central_std,std_central_mean,std_1+flank_length,std_1+flank_std,std_1+flank_mean
0,ENST00000000233,244,AAGACCA,0.008722,3.990545,123.090909,0.010218,6.716545,125.50303,0.006773,...,79.9,0.006961,2.043149,5.111862,0.00774,3.162649,2.971836,0.004178,2.649395,2.414444
1,ENST00000000233,261,CAAACTG,0.006292,2.849892,109.027711,0.007084,3.197289,107.190964,0.007168,...,94.15,0.003249,1.066933,3.038837,0.004254,1.428168,3.689105,0.005046,1.082745,2.752559
2,ENST00000000233,316,GAAACAG,0.007237,2.78271,105.440237,0.00705,3.463964,98.539053,0.007141,...,88.9,0.004005,1.437534,1.661621,0.004286,0.972003,3.335626,0.003814,0.777871,1.486479
3,ENST00000000233,332,AGAACAT,0.010299,6.042386,129.304348,0.008195,2.733641,97.320652,0.005806,...,89.9,0.005658,2.288102,2.673294,0.004195,0.943466,1.945091,0.003578,1.054379,3.112562
4,ENST00000000233,368,AGGACAA,0.011535,6.031751,118.0,0.012108,5.748927,121.485876,0.009484,...,85.2,0.007941,1.345474,2.928527,0.005699,1.745973,2.321862,0.004775,1.206345,2.331742


In [10]:
# Identifier columns plus the subset of summary features kept for the model.
# NOTE(review): the name `rfe` suggests the list came from recursive feature
# elimination done elsewhere -- confirm, and keep it in sync with the features
# the saved model was trained on.
rfe = [
    'transcript_id',
    'transcript_position',
    'sequence',
    'avg_central_mean',
    'avg_1+flank_std',
    'med_central_std',
    'med_central_mean',
    'med_1+flank_std',
    'std_1-flank_std',
    'std_1-flank_mean',
    'std_central_std',
    'std_central_mean',
    'std_1+flank_std',
    'std_1+flank_mean',
]
# Select the columns and take an explicit copy, so that columns added to `df`
# by later cells never touch result_df.
df = result_df[rfe].copy()

In [11]:
# Split each sequence into three overlapping 5-character windows starting at
# offsets 0, 1 and 2 (the sequences in the previews above are 7 characters long).
for start, column in enumerate(['seq_left', 'seq_center', 'seq_right']):
    df[column] = df['sequence'].str.slice(start, start + 5)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the three 5-character window columns on a copy of the table.
# NOTE(review): the encoder is re-fitted here on the data being scored, so the
# integer assigned to a window depends on which windows occur in this file.
# Verify that the codes match the ones used when the model was trained
# (ideally by loading the encoders fitted at training time).
df_le = df.copy()
label_encoder = LabelEncoder()

seq_data = ['seq_left', 'seq_center', 'seq_right']
for seq in seq_data:
    # fit_transform re-fits the shared encoder, so each column gets its own mapping.
    df_le[seq] = label_encoder.fit_transform(df_le[seq])

df_le.head()

Unnamed: 0,transcript_id,transcript_position,sequence,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right
0,ENST00000000233,244,AAGACCA,125.50303,3.999745,6.16,126.0,3.12,2.043149,5.111862,3.162649,2.971836,2.649395,2.414444,1,4,16
1,ENST00000000233,261,CAAACTG,107.190964,2.686892,3.02,108.0,2.54,1.066933,3.038837,1.428168,3.689105,1.082745,2.752559,6,2,10
2,ENST00000000233,316,GAAACAG,98.539053,2.083775,3.5,98.9,1.95,1.437534,1.661621,0.972003,3.335626,0.777871,1.486479,12,0,2
3,ENST00000000233,332,AGAACAT,97.320652,2.252565,2.565,97.35,2.175,2.288102,2.673294,0.943466,1.945091,1.054379,3.112562,2,6,3
4,ENST00000000233,368,AGGACAA,121.485876,3.958192,5.61,122.0,3.81,1.345474,2.928527,1.745973,2.321862,1.206345,2.331742,3,9,12


## Section 2: Model

In [None]:
import xgboost as xgb

In [13]:
# Work on a separate copy of the encoded table; `test` keeps the identifier
# columns that are needed again when the output file is assembled.
test = df_le.copy()

In [14]:
# Model input: every column except the two string identifiers.
# transcript_position is kept and therefore used as a feature.
X_test = test.drop(columns=['transcript_id', 'sequence'])
X_test.head()

Unnamed: 0,transcript_position,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right
0,244,125.50303,3.999745,6.16,126.0,3.12,2.043149,5.111862,3.162649,2.971836,2.649395,2.414444,1,4,16
1,261,107.190964,2.686892,3.02,108.0,2.54,1.066933,3.038837,1.428168,3.689105,1.082745,2.752559,6,2,10
2,316,98.539053,2.083775,3.5,98.9,1.95,1.437534,1.661621,0.972003,3.335626,0.777871,1.486479,12,0,2
3,332,97.320652,2.252565,2.565,97.35,2.175,2.288102,2.673294,0.943466,1.945091,1.054379,3.112562,2,6,3
4,368,121.485876,3.958192,5.61,122.0,3.81,1.345474,2.928527,1.745973,2.321862,1.206345,2.331742,3,9,12


In [15]:
from sklearn import preprocessing

# Rescale every feature with min-max scaling and rebuild a DataFrame that
# carries the original column names.
# NOTE(review): the scaler is fitted on the data being scored, so the scaling
# depends on this file's own min/max values. Verify that this matches how the
# training features were scaled (ideally by loading the scaler fitted at
# training time).
scaler = preprocessing.MinMaxScaler()
normalised_X_test = pd.DataFrame(
    scaler.fit_transform(X_test),
    columns=X_test.columns,
)
normalised_X_test.head()

Unnamed: 0,transcript_position,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right
0,0.017035,0.784456,0.298511,0.439353,0.782178,0.208155,0.052962,0.301764,0.089098,0.266979,0.0892,0.077677,0.043478,0.235294,0.695652
1,0.018237,0.406697,0.156152,0.157233,0.425743,0.145923,0.024773,0.161799,0.037241,0.351434,0.032719,0.091367,0.26087,0.117647,0.434783
2,0.022125,0.228217,0.090752,0.200359,0.245545,0.082618,0.035474,0.068814,0.023603,0.309813,0.021727,0.040106,0.521739,0.0,0.086957
3,0.023256,0.203082,0.109055,0.116352,0.214851,0.10676,0.060035,0.137119,0.02275,0.146085,0.031696,0.105942,0.086957,0.352941,0.130435
4,0.025801,0.701586,0.294006,0.389937,0.70297,0.282189,0.032816,0.154352,0.046743,0.190448,0.037175,0.074329,0.130435,0.529412,0.521739


In [16]:
import joblib
# Load the trained model from the file
# NOTE(review): the file name says 'lstm' (and 'bext' looks like a typo of
# 'best'), while the notebook text says the model is XGBoost -- confirm which
# model this file contains.
# joblib.load unpickles the file, which can execute arbitrary code; only load
# model files from a trusted source.
best_model = joblib.load('bext_lstm.pkl')

In [17]:
# Preview the unscaled table that still carries the identifier columns.
test.head()

Unnamed: 0,transcript_id,transcript_position,sequence,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right
0,ENST00000000233,244,AAGACCA,125.50303,3.999745,6.16,126.0,3.12,2.043149,5.111862,3.162649,2.971836,2.649395,2.414444,1,4,16
1,ENST00000000233,261,CAAACTG,107.190964,2.686892,3.02,108.0,2.54,1.066933,3.038837,1.428168,3.689105,1.082745,2.752559,6,2,10
2,ENST00000000233,316,GAAACAG,98.539053,2.083775,3.5,98.9,1.95,1.437534,1.661621,0.972003,3.335626,0.777871,1.486479,12,0,2
3,ENST00000000233,332,AGAACAT,97.320652,2.252565,2.565,97.35,2.175,2.288102,2.673294,0.943466,1.945091,1.054379,3.112562,2,6,3
4,ENST00000000233,368,AGGACAA,121.485876,3.958192,5.61,122.0,3.81,1.345474,2.928527,1.745973,2.321862,1.206345,2.331742,3,9,12


In [22]:
# Re-display the scaled features that are passed to the model.
normalised_X_test.head()

Unnamed: 0,transcript_position,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right
0,0.017035,0.784456,0.298511,0.439353,0.782178,0.208155,0.052962,0.301764,0.089098,0.266979,0.0892,0.077677,0.043478,0.235294,0.695652
1,0.018237,0.406697,0.156152,0.157233,0.425743,0.145923,0.024773,0.161799,0.037241,0.351434,0.032719,0.091367,0.26087,0.117647,0.434783
2,0.022125,0.228217,0.090752,0.200359,0.245545,0.082618,0.035474,0.068814,0.023603,0.309813,0.021727,0.040106,0.521739,0.0,0.086957
3,0.023256,0.203082,0.109055,0.116352,0.214851,0.10676,0.060035,0.137119,0.02275,0.146085,0.031696,0.105942,0.086957,0.352941,0.130435
4,0.025801,0.701586,0.294006,0.389937,0.70297,0.282189,0.032816,0.154352,0.046743,0.190448,0.037175,0.074329,0.130435,0.529412,0.521739


In [40]:
# Score every row of the scaled feature table.
# The disabled `[:,1]` slice would pick the second column of a two-column
# output; the recorded output of `prob` is a single-column array, so it is
# flattened in a separate cell instead.
prob = best_model.predict(normalised_X_test)#[:,1]



In [43]:
# Flatten the nested prediction array into a flat list by concatenating its
# rows (each row holds a single score in the recorded output).
prob1 = [item for sublist in prob for item in sublist]

In [42]:
# Inspect the raw prediction array returned by the model.
prob

array([[2.6707672e-02],
       [1.7196848e-03],
       [5.8326996e-06],
       ...,
       [5.2604306e-01],
       [3.0646473e-04],
       [7.2507646e-06]], dtype=float32)

In [44]:
# Inspect the flattened scores.
# NOTE(review): this prints the whole list; consider showing only a slice such as prob1[:5].
prob1

[0.026707672,
 0.0017196848,
 5.8326996e-06,
 0.26109922,
 0.00019667225,
 0.04039232,
 0.0030699172,
 0.00028568896,
 0.0045231064,
 0.28622478,
 0.04631599,
 0.0055066384,
 0.055016566,
 0.00035798535,
 0.00036108764,
 0.022570606,
 0.71994305,
 0.6739172,
 0.0001014719,
 0.0012738288,
 0.0004047926,
 0.00012677305,
 0.001630343,
 4.384754e-05,
 0.0023918685,
 0.0002703853,
 2.6704918e-06,
 0.0064006126,
 2.628069e-05,
 6.112375e-05,
 0.0015594424,
 9.733767e-06,
 0.0026366513,
 0.00034432905,
 0.0021489605,
 6.994924e-05,
 1.3305857e-06,
 6.948897e-06,
 9.2900535e-08,
 0.00020395915,
 7.908706e-06,
 0.0032633182,
 4.7569607e-05,
 0.004465424,
 2.799591e-05,
 0.0041048992,
 0.00041393357,
 0.04118459,
 5.346437e-10,
 0.0006792882,
 0.009971182,
 8.196756e-08,
 2.4102543e-05,
 4.5293604e-07,
 0.00056287705,
 0.9965131,
 5.172915e-07,
 3.528149e-10,
 2.9443156e-05,
 0.0025216448,
 0.051487237,
 0.00083505886,
 1.846135e-06,
 0.07906816,
 0.00044181524,
 1.2816409e-05,
 0.11348337,
 0.8

In [45]:
# Collect the output columns: the two identifiers of each row and its score.
# prob1 is a plain list, so it is matched to the identifier Series by position.
transcript_id = test['transcript_id']
transcript_position = test['transcript_position']
data = dict(
    transcript_id=transcript_id,
    transcript_position=transcript_position,
    probability=prob1,
)




In [46]:
# Inspect the assembled output columns.
# NOTE(review): this prints both Series and the whole score list; previewing
# the DataFrame built from it is more compact.
data

{'transcript_id': 0        ENST00000000233
 1        ENST00000000233
 2        ENST00000000233
 3        ENST00000000233
 4        ENST00000000233
               ...       
 90805    ENST00000641784
 90806    ENST00000641784
 90807    ENST00000641784
 90808    ENST00000641784
 90809    ENST00000641784
 Name: transcript_id, Length: 90810, dtype: object,
 'transcript_position': 0         244
 1         261
 2         316
 3         332
 4         368
          ... 
 90805    3122
 90806    3142
 90807    3224
 90808    3243
 90809    3266
 Name: transcript_position, Length: 90810, dtype: int64,
 'probability': [0.026707672,
  0.0017196848,
  5.8326996e-06,
  0.26109922,
  0.00019667225,
  0.04039232,
  0.0030699172,
  0.00028568896,
  0.0045231064,
  0.28622478,
  0.04631599,
  0.0055066384,
  0.055016566,
  0.00035798535,
  0.00036108764,
  0.022570606,
  0.71994305,
  0.6739172,
  0.0001014719,
  0.0012738288,
  0.0004047926,
  0.00012677305,
  0.001630343,
  4.384754e-05,
  0.00239186

In [47]:
# Create a DataFrame from the data dictionary: one row per scored row, with
# the columns transcript_id, transcript_position and probability.
output = pd.DataFrame(data)

# Preview the first rows before exporting.
output.head()


Unnamed: 0,transcript_id,transcript_position,probability
0,ENST00000000233,244,0.026708
1,ENST00000000233,261,0.00172
2,ENST00000000233,316,6e-06
3,ENST00000000233,332,0.261099
4,ENST00000000233,368,0.000197


In [48]:
# Export the predictions as CSV into the current working directory.
# index=False leaves the DataFrame index out of the file.
file_name = 'tundra_dataset1_1.csv'
output.to_csv(file_name, index=False)