<a href="https://colab.research.google.com/github/kxk302/Mutation_As_Time_Series/blob/main/Mutation_As_Time_Series.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
# The data files used by this notebook are read from paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# List the contents of the Drive data folder (shell command run via the IPython `!` escape).
!ls '/content/gdrive/MyDrive/Colab Notebooks/data'

data_sorted.tsv  data.tsv


In [None]:
#
# Sort the dataset based on Collection_Date and Sample, in ascending order
#

import numpy as np
import pandas as pd

from dateutil import parser

# Use the fully qualified option name. The bare 'max_rows' pattern is ambiguous
# on pandas >= 1.4 (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', None)

# `names=` is passed without `header=`, so pandas treats the first line of the
# file as data and labels the columns with the names given here.
df_in = pd.read_csv(
    '/content/gdrive/MyDrive/Colab Notebooks/data/data.tsv',
    sep='\t',
    names=['Sample', 'Collection_Date', 'UNK1', 'UNK2', 'UNK3', 'POS', 'REF', 'ALT', 'EFFECT', 'CODON', 'TRID', 'AA', 'AF'],
)

# DEBUG
# df_in = df_in.iloc[:1000,:]

# Print the element type of Collection_Date before and after parsing, to
# confirm that the conversion actually happened.
print(type(df_in.Collection_Date.iloc[0]))
df_in.Collection_Date = df_in.Collection_Date.apply(parser.parse)
print(type(df_in.Collection_Date.iloc[0]))

# Ascending is the default sort order for every key.
df_in = df_in.sort_values(by=['Collection_Date', 'Sample'])
print('\n\n')
print(df_in.shape)
print(df_in.head(1000))

# Left disabled: uncomment to write the sorted frame to data_sorted.tsv on Drive.
# df_in.to_csv('/content/gdrive/MyDrive/Colab Notebooks/data/data_sorted.tsv', sep='\t', index=False)

In [None]:
#
# Filter the dataset, calculate normalized mutation counts, pivot the data, and save to file
#

import numpy as np
import pandas as pd

from dateutil import parser

# Use fully qualified option names. The bare 'max_rows' / 'max_columns' patterns
# are ambiguous on pandas >= 1.4 (they also match the 'styler.render.*' options)
# and raise OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

start_date = '2021-01-27'
end_date = '2021-08-09'
file_name = '/content/gdrive/MyDrive/Colab Notebooks/data/data_sorted.tsv'
separator = '\t'
# NON_SYNONYMOUS_CODING: missense mutation
effects = ['NON_SYNONYMOUS_CODING']
drop_columns = ['UNK1', 'UNK2', 'UNK3']

# Index of nucleotide changes
#
# 0: A->T
# 1: A->G
# 2: A->C
#
# 3: T->G
# 4: T->C
# 5: T->A
#
# 6: G->C
# 7: G->A
# 8: G->T
#
# 9:  C->A
# 10: C->T
# 11: C->G
#
atgc_dict = {'A': {'T': 0, 'G': 1, 'C': 2}, 'T': {'G': 3, 'C': 4, 'A': 5}, 'G': {'C': 6, 'A': 7, 'T': 8}, 'C': {'A': 9, 'T':10, 'G': 11}}

# Read the input file
df_in = pd.read_csv(file_name, sep=separator)

# Drop the unnecessary columns
df_in = df_in.drop(columns=drop_columns)

# Select only rows with mutation type specified in 'effects' list
df_eff = df_in[df_in.EFFECT.isin(effects)]

# Select only rows where the Collection_Date falls between start_date and end_date
# (both bounds inclusive). Collection_Date is read back as text here, so this is a
# string comparison; NOTE(review): it is only correct if the dates are stored in
# ISO 'YYYY-MM-DD' form - confirm against data_sorted.tsv.
# The explicit .copy() makes df_fil an independent frame, so adding a column below
# does not trigger SettingWithCopyWarning.
df_fil = df_eff[df_eff.Collection_Date.between(start_date, end_date)].copy()
print('\n\n')
print('Filtered df shape {}'.format(df_fil.shape))
print('Number of unique dates {}'.format(len(df_fil.Collection_Date.unique())))

# Map every (REF, ALT) pair to its index in atgc_dict. A REF or ALT value that is
# not a single A/T/G/C base, or is not a key of atgc_dict, raises KeyError.
df_fil['nucleotide_change'] = [
    atgc_dict[ref][alt] for ref, alt in zip(df_fil.REF, df_fil.ALT)
]

print('Calculating normalize_nucleotide_change')
# Per collection date: count of each nucleotide change divided by the total
# number of changes recorded on that date.
changes_by_date = df_fil.groupby('Collection_Date').nucleotide_change
normalize_nucleotide_change = changes_by_date.value_counts() / changes_by_date.count()
print('\n\n')
print('Type of normalize_nucleotide_change')
print(type(normalize_nucleotide_change))
print('Name of normalize_nucleotide_change BEFORE rename')
print(normalize_nucleotide_change.name)
# Give the values an explicit name; it becomes the column name after to_frame()
# and is the `values` column used by the pivot below.
normalize_nucleotide_change = normalize_nucleotide_change.rename('normalize_nucleotide_change')
print('Name of normalize_nucleotide_change AFTER rename')
print(normalize_nucleotide_change.name)
print('Index of normalize_nucleotide_change')
print(normalize_nucleotide_change.index)

# Convert normalize_nucleotide_change Series to Dataframe
df = normalize_nucleotide_change.to_frame()

print('\n\n')
print('Type of df')
print(type(df))
print('Columns of df')
print(df.columns)
print('df.head(5)')
print(df.head(5))
print('df.index BEFORE reset')
print(df.index)
# Move the index levels into ordinary columns so that pivot_table can refer to
# 'Collection_Date' and 'nucleotide_change' by name.
df = df.reset_index()
print('df.index AFTER reset')
print(df.index)
# The label now matches the number of rows actually printed.
print('df.head(25) AFTER reset')
print(df.head(25))

# One row per date, one column per nucleotide-change index; combinations that
# never occur on a given date are filled with 0.
df_piv = pd.pivot_table(
    df,
    index='Collection_Date',
    columns='nucleotide_change',
    values='normalize_nucleotide_change',
).fillna(0)
print('\n\n')
print('Pivoted df')
print(df_piv.head(5))
print('Index of df_piv')
print(df_piv.index)
print('Columns of df_piv')
print(df_piv.columns)
print('Shape of df_piv')
print(df_piv.shape)

df_piv.to_csv('/content/gdrive/MyDrive/Colab Notebooks/data/data_pivoted.tsv', sep=separator)

In [None]:
#
# Create time series prediction dataset from pivoted data
#

import numpy as np
import pandas as pd

from dateutil import parser

# Use fully qualified option names. The bare 'max_rows' / 'max_columns' patterns
# are ambiguous on pandas >= 1.4 (they also match the 'styler.render.*' options)
# and raise OptionError.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

file_name = '/content/gdrive/MyDrive/Colab Notebooks/data/data_pivoted.tsv'
separator = '\t'
# Fraction (0-1) of the windows that goes into the training set.
train_data_percentage = 0.80

# Read the input file
df_in = pd.read_csv(file_name, sep=separator)

# Drop the Collection_Date column
df_in = df_in.drop(columns='Collection_Date')

print(df_in.head(5))
print(df_in.shape[0])
print(df_in.shape[1])

# Each sample uses `num_past` consecutive rows as input and the row
# `num_future` steps after the end of that window as the target.
num_future = 1
num_past = 14
num_rows = df_in.shape[0]
num_cols = df_in.shape[1]

# Fail early with a clear message. Without this check, too few rows produce
# empty arrays and the reshape below dies with an opaque IndexError.
if num_rows < num_past + num_future:
    raise ValueError(
        'Need at least {} rows to build one window (num_past={}, num_future={}), got {}'.format(
            num_past + num_future, num_past, num_future, num_rows
        )
    )

# Slice windows from a single NumPy array rather than creating a DataFrame per window.
values = df_in.to_numpy()
window_ends = range(num_past, num_rows - num_future + 1)

trainX = np.array([values[end - num_past:end, 0:num_cols] for end in window_ends])
trainY = np.array([values[end + num_future - 1:end + num_future, 0:num_cols] for end in window_ends])

print('trainX shape == {}'.format(trainX.shape))
print('trainY shape == {}'.format(trainY.shape))

# Flatten each target from (1, num_cols) to (num_cols,). trainX is kept 3-D
# (samples, num_past, num_cols).
# trainX = trainX.reshape(trainX.shape[0], trainX.shape[1] * trainX.shape[2])
trainY = trainY.reshape(trainY.shape[0], trainY.shape[1] * trainY.shape[2])

# print('trainX shape == {}'.format(trainX.shape))
print('trainY shape == {}'.format(trainY.shape))

# Chronological split: the first windows are used for training and the
# remaining ones for testing (no shuffling).
train_data_idx = int(trainX.shape[0] * train_data_percentage)

testX = trainX[train_data_idx:,:,:]
testY = trainY[train_data_idx:,:]
trainX = trainX[0:train_data_idx,:,:]
trainY = trainY[0:train_data_idx,:]

print('trainX shape == {}'.format(trainX.shape))
print('trainY shape == {}'.format(trainY.shape))
print('testX shape == {}'.format(testX.shape))
print('testY shape == {}'.format(testY.shape))

# Disabled export of the split arrays. Kept as real comments rather than a bare
# triple-quoted string, which Jupyter would echo as the cell's output.
# NOTE: np.savetxt only accepts 1-D or 2-D arrays, so the 3-D trainX / testX
# would have to be reshaped to 2-D before these lines can be enabled.
# np.savetxt('/content/gdrive/MyDrive/Colab Notebooks/data/trainX.tsv', trainX, delimiter=separator)
# np.savetxt('/content/gdrive/MyDrive/Colab Notebooks/data/trainY.tsv', trainY, delimiter=separator)
# np.savetxt('/content/gdrive/MyDrive/Colab Notebooks/data/testX.tsv', testX, delimiter=separator)
# np.savetxt('/content/gdrive/MyDrive/Colab Notebooks/data/testY.tsv', testY, delimiter=separator)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM

# Input: one window of shape (time steps, features), taken from trainX.
# Output: one value per column of trainY.
window_shape = (trainX.shape[1], trainX.shape[2])
output_width = trainY.shape[1]

# Same layer stack as before, declared as a list instead of repeated .add() calls.
model = Sequential([
    # return_sequences=True keeps one LSTM output per time step; the Dense
    # layers below therefore act on every time step before Flatten merges them.
    LSTM(500, activation='relu', input_shape=window_shape, return_sequences=True),
    Dense(250, activation='relu'),
    Dropout(0.2),
    Dense(150, activation='relu'),
    Dropout(0.2),
    Flatten(),
    Dense(output_width),
])

model.compile(optimizer='adam', loss='mse')
model.summary()

# validation_split=0.2 asks Keras to hold out 20% of the training samples for validation.
history = model.fit(
    trainX,
    trainY,
    epochs=100,
    batch_size=15,
    validation_split=0.2,
    verbose=1,
)


