# Task 1: Develop a machine learning method to identify RNA modifications from direct RNA-Seq data

Write a computational method that predicts m6A RNA modification from direct RNA-Seq data. The method should be able to train a new model and make predictions on unseen test data. Specifically, your method should fulfil the following requirements:

Your method should contain two scripts, one for model training, and one for making predictions. The prediction script will be evaluated by other students.

In [1]:
import json
import os
import sys
import gzip

import pandas as pd
import numpy as np

#### 1. Reading the files

In [2]:
# List the contents of the data directory.
os.listdir("data/")

['Student_evaluation_guideline.html',
 'dataset0.json.gz',
 'data.info.labelled',
 'handout_TeamProject_RNAModifications.html']

In [3]:
# Input file locations, given relative to the current working directory.
# Adjust these paths if your copies of the files live elsewhere.
M6A_FILE_PATH = "data/data.info.labelled"
DIRECT_RNA_SEQ_DATA_FILE_PATH = "data/dataset0.json.gz"

In [4]:
# Read m6a labels
def read_m6A_labels(m6a_file_path):
    """Read the comma-separated label file and apply standard column names.

    Args:
        m6a_file_path: Path to the comma-separated label file. Its first
            line is consumed as the header and then overwritten.

    Returns:
        A DataFrame whose columns are renamed, in order, to ``gene_id``,
        ``transcript_id``, ``transcript_position`` and ``label``.

    Raises:
        ValueError: If the file does not contain exactly four columns.
    """
    column_names = ["gene_id", "transcript_id", "transcript_position", "label"]
    labels = pd.read_csv(m6a_file_path, sep=",")
    # Replace whatever header the file carried with the canonical names.
    labels.columns = column_names
    return labels

In [5]:
# df = read_m6A_labels(M6A_FILE_PATH)
# df.head()

In [6]:
def read_direct_rna_seq_data(data_path):
    """Load per-read direct RNA-Seq features from a gzipped JSON-lines file.

    Each non-blank line is parsed as a JSON object nested as
    ``{transcript_id: {position: {nucleotides: [read, ...]}}}``, where every
    read is a sequence holding at least nine values. Only the first nine
    values of a read are kept; the nucleotide key itself is not stored.

    Args:
        data_path: Path to the gzip-compressed, line-delimited JSON file.

    Returns:
        A DataFrame with one row per read and the columns ``transcript_id``,
        ``position`` (converted to int), ``read_id`` (index of the read
        within its list) and ``x_1`` ... ``x_9``. The columns are present
        even when the file contains no reads, so downstream column-based
        operations keep working on empty input.

    Raises:
        IndexError: If a read holds fewer than nine values.
    """
    n_features = 9
    columns = ['transcript_id', 'position', 'read_id'] + [
        f'x_{i}' for i in range(1, n_features + 1)
    ]

    # Tuples are considerably lighter than one dict per read, which matters
    # because the table has one row for every single read in the file.
    rows = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with gzip.open(data_path, 'rt', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            for transcript_id, position_data in json.loads(line).items():
                for transcript_position, nucleotide_data in position_data.items():
                    position = int(transcript_position)
                    for reads in nucleotide_data.values():
                        for read_idx, read in enumerate(reads):
                            rows.append((
                                transcript_id,
                                position,
                                read_idx,
                                # Index explicitly so short reads still raise IndexError.
                                *(read[i] for i in range(n_features)),
                            ))

    return pd.DataFrame(rows, columns=columns)


In [7]:
# Parse the gzipped direct RNA-Seq file into a DataFrame with one row per read.
rna_seq_data_df = read_direct_rna_seq_data(DIRECT_RNA_SEQ_DATA_FILE_PATH)

In [8]:
# Preview the first 100 rows of the per-read table.
rna_seq_data_df.head(100)

Unnamed: 0,transcript_id,position,read_id,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
0,ENST00000000233,244,0,0.00299,2.06,125.0,0.01770,10.40,122.0,0.00930,10.90,84.1
1,ENST00000000233,244,1,0.00631,2.53,125.0,0.00844,4.67,126.0,0.01030,6.30,80.9
2,ENST00000000233,244,2,0.00465,3.92,109.0,0.01360,12.00,124.0,0.00498,2.13,79.6
3,ENST00000000233,244,3,0.00398,2.06,125.0,0.00830,5.01,130.0,0.00498,3.78,80.4
4,ENST00000000233,244,4,0.00664,2.92,120.0,0.00266,3.94,129.0,0.01300,7.15,82.2
...,...,...,...,...,...,...,...,...,...,...,...,...
95,ENST00000000233,244,95,0.00590,5.57,126.0,0.01200,11.20,127.0,0.00564,9.24,87.3
96,ENST00000000233,244,96,0.01200,3.73,124.0,0.02520,14.40,123.0,0.00510,4.16,81.2
97,ENST00000000233,244,97,0.01350,4.09,126.0,0.00540,5.71,127.0,0.00396,3.48,81.4
98,ENST00000000233,244,98,0.00830,4.17,121.0,0.00973,5.68,124.0,0.00316,1.60,81.8


##### Strategy 1: Aggregate all the reads per site (mean of each feature)

In [9]:
# Collapse the per-read rows into one row per (transcript_id, position) by
# averaging every feature column; read_id is dropped first so it is not averaged.
aggregated_df = (
    rna_seq_data_df
    .drop(columns=['read_id'])
    .groupby(['transcript_id', 'position'])
    .mean()
    .reset_index()
)
aggregated_df.head(100)

Unnamed: 0,transcript_id,position,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
0,ENST00000000233,244,0.008264,4.223784,123.702703,0.009373,7.382162,125.913514,0.007345,4.386989,80.570270
1,ENST00000000233,261,0.006609,3.216424,109.681395,0.006813,3.226535,107.889535,0.007710,3.016599,94.290698
2,ENST00000000233,316,0.007570,2.940541,105.475676,0.007416,3.642703,98.947027,0.007555,2.087146,89.364324
3,ENST00000000233,332,0.010620,6.476350,129.355000,0.008632,2.899200,97.836500,0.006101,2.236520,89.154000
4,ENST00000000233,368,0.010701,6.415051,117.924242,0.011479,5.870303,121.954545,0.010019,4.260253,85.178788
...,...,...,...,...,...,...,...,...,...,...,...
95,ENST00000001008,2242,0.008410,3.393768,104.312077,0.009123,3.659807,99.084541,0.007017,2.583527,88.815942
96,ENST00000002165,35,0.013804,7.024259,116.333333,0.009167,7.397778,119.327778,0.006937,2.754444,83.496296
97,ENST00000002165,54,0.013525,3.403200,119.960000,0.008390,6.730200,121.180000,0.006244,3.840600,83.824000
98,ENST00000002165,207,0.006547,3.563962,101.907547,0.005667,7.683396,116.830189,0.006573,4.287358,77.590566


#### 2. Train Test Split