In [None]:
!pip install biopython

In [None]:
from Bio import SeqIO
import csv

In [None]:
# Mount Google Drive at /content/drive so that the files under
# '/content/drive/My Drive' opened by later cells are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')


# Section 1
For our dataset we need, for every exon, the chromosome number, the start and end positions of the exon, and the gene type (transcript, long non-coding RNA, etc.). We already have an annotated GTF file, so we only need to extract the required fields from it.

In [None]:
# Count the exon features in the annotation with a single pass over the file.
# Lines starting with '#' are header/comment lines and are ignored; for every
# other line the third tab-separated column holds the feature type.
with open('/content/drive/My Drive/gencode.v42.annotation.gtf', 'r') as gtf_file:
    exon_count = sum(
        1
        for line in gtf_file
        if not line.startswith('#') and line.strip().split('\t')[2] == 'exon'
    )

print('Number of exons:', exon_count)

In [None]:
# Step 1: Parse GTF file and extract relevant information
GTF_PATH = '/content/drive/My Drive/gencode.v42.annotation.gtf'

# Map the 'chr'-prefixed names used in the GTF to bare names
# ('chr1' -> '1', ..., 'chrM' -> 'MT'). Names without an entry are kept as-is.
chrom_mapping = {f'chr{number}': str(number) for number in range(1, 23)}
chrom_mapping.update({'chrX': 'X', 'chrY': 'Y', 'chrM': 'MT'})


def _gtf_attribute(attributes, key):
    """Return the value stored under `key` in a GTF attribute column.

    The attribute column is a ';'-separated list of `key "value"` pairs.
    Looking the pair up by name (rather than by position) keeps the result
    correct even if the attribute order differs between rows or files.

    Returns the unquoted value of the first matching pair, or None when the
    key is absent.
    """
    for item in attributes.split(';'):
        name, _, value = item.strip().partition(' ')
        if name == key:
            return value.strip().strip('"')
    return None


data = []
# The with-block guarantees the file handle is closed once parsing finishes.
with open(GTF_PATH, 'r') as gtf_file:
    reader = csv.reader(gtf_file, delimiter='\t')
    for row in reader:
        # Skip blank rows and every '#' header/comment line, however many
        # there are, instead of assuming the header is exactly five lines long.
        if not row or row[0].startswith('#'):
            continue
        if row[2] != 'exon':
            continue

        gene_type = _gtf_attribute(row[8], 'gene_type')
        if gene_type is None:
            raise ValueError(
                f'Exon record ending on line {reader.line_num} has no gene_type attribute'
            )

        data.append({
            'chrom': chrom_mapping.get(row[0], row[0]),
            'start': int(row[3]),
            'end': int(row[4]),
            'gene_type': gene_type,
        })


In [None]:
# Spot-check one parsed exon record (index 1) to confirm the extracted fields.
data[1]

In [None]:
# Step 2: Parse FASTA file and extract DNA sequence for relevant regions
FASTA_PATH = '/content/drive/My Drive/Homo_sapiens.GRCh38.dna.primary_assembly.fa'

# seq_dict maps each sequence name (the first whitespace-delimited token of a
# '>' header line) to the full sequence that follows it, as one string.
seq_dict = {}
current_chrom = None
# Sequence lines are collected in a list and joined once per record; repeatedly
# concatenating onto a chromosome-sized string can copy it on every line.
current_chunks = []

# The with-block guarantees the file handle is closed after loading.
with open(FASTA_PATH, 'r') as fasta_file:
    for line in fasta_file:
        if line.startswith('>'):
            # A new header closes the record collected so far.
            if current_chrom is not None:
                seq_dict[current_chrom] = ''.join(current_chunks)
            current_chrom = line[1:].split()[0]
            current_chunks = []
        else:
            current_chunks.append(line.strip())

# Store the final record; nothing is stored when the file had no header at all.
if current_chrom is not None:
    seq_dict[current_chrom] = ''.join(current_chunks)

In [None]:
# List the sequence names that were read from the FASTA headers.
print(seq_dict.keys())

In [None]:
# Number of sequences loaded from the FASTA file.
len(seq_dict)

In [None]:
# Show record 1 again as a reference point before the next step modifies the records.
data[1]

In [None]:
# Step 3: Combine information from GTF and FASTA files
# Each exon record gains a 'sequence' entry cut out of its chromosome string.
# The slice [start - 1:end] treats 'start' and 'end' as 1-based, inclusive positions.
# NOTE(review): the records carry no strand field, so the slice is always taken
# from the sequence as stored in seq_dict - confirm this is intended.
for exon in data:
    chromosome_seq = seq_dict[exon['chrom']]
    exon['sequence'] = chromosome_seq[exon['start'] - 1:exon['end']]

In [None]:
# Record 1 once more, to inspect the 'sequence' entry attached in the previous step.
data[1]

In [None]:
import random

#Randomly sample 200000 rows from data dictionary
#small_data = random.sample(data, 200000)

# Print the length of the subset dictionary
#print(len(small_data))


In [None]:
import pandas as pd

# Turn the list of exon records into a table: one row per exon, one column per key.
df = pd.DataFrame(data)

In [None]:
# Preview the first rows of the exon table.
df.head()

In [None]:
# Number of classes: collect the distinct gene types across all exon records.
gene_types = {entry['gene_type'] for entry in data}
num_classes = len(gene_types)
print("Number of classes: ", num_classes)


In [None]:
from collections import Counter

# Tally how many rows of df belong to each gene type.
class_counts = Counter(df['gene_type'])
total_rows = len(df)

# Express every tally as a percentage of all rows, rounded to four decimals.
class_reps = []
for class_name, count in class_counts.items():
    class_reps.append((class_name, round(count / total_rows * 100, 4)))

# Most common class first.
class_reps_sorted = sorted(class_reps, key=lambda pair: pair[1], reverse=True)

# Print one "<class>: <percentage>%" line per class.
for class_name, class_rep in class_reps_sorted:
    print(f"{class_name}: {class_rep}%")


In [None]:
# Merge and rename the classes
# All pseudogene variants collapse into "pseudogene"; miRNA and snoRNA are
# folded into the "snRNA" label.
merged_gene_types = {
    "processed_pseudogene": "pseudogene",
    "unprocessed_pseudogene": "pseudogene",
    "transcribed_unprocessed_pseudogene": "pseudogene",
    "transcribed_unitary_pseudogene": "pseudogene",
    "transcribed_processed_pseudogene": "pseudogene",
    "miRNA": "snRNA",
    "snoRNA": "snRNA",
}
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df["gene_type"] is chained in-place assignment: it acts on an intermediate
# Series, which pandas warns about and which leaves df untouched once
# Copy-on-Write is active.
df["gene_type"] = df["gene_type"].replace(merged_gene_types)

# Drop the underrepresented classes
kept_gene_types = ["protein_coding", "lncRNA", "pseudogene", "snRNA"]
df = df[df["gene_type"].isin(kept_gene_types)]

# Get the counts of each class
class_counts = df["gene_type"].value_counts()

# Print the counts
print(class_counts)


In [None]:
import pandas as pd
from collections import Counter

# Tally the rows of df per gene type.
class_counts = Counter(df['gene_type'])

# Build a table with one row per class: its name, its row count, and its share
# of all rows in percent (rounded to four decimals).
class_df = pd.DataFrame(list(class_counts.items()), columns=['gene_type', 'count'])
class_df['percentage'] = (class_df['count'] / len(df) * 100).round(4)

# Order the table so the largest share comes first.
class_df = class_df.sort_values('percentage', ascending=False)

# Display the DataFrame
#print(class_df)
#class_df.to_excel('class_distribution.xlsx', index=False)

#from google.colab import files
#files.download('class_distribution.xlsx')

In [None]:
# Representation of each class after merging: displays the class_df table
# (gene_type, count, percentage) built in the previous cell.
class_df

In [None]:
# Find the longest and the shortest exon sequence among all records in `data`.
# max()/min() compare only real sequences, so the result cannot be capped by an
# arbitrary starting value (a 1000-character placeholder would be reported as
# the minimum whenever every sequence is longer than that). With no records at
# all, both fall back to the empty string and the printed lengths are 0.
largest_seq = max((record['sequence'] for record in data), key=len, default='')
smallest_seq = min((record['sequence'] for record in data), key=len, default='')

# print results
print('Largest sequence:', len(largest_seq))
print('Smallest sequence:', len(smallest_seq))

In [None]:
# take smaller dataframe for testing purposes as the whole dataframe will crash the colab.
# The sample size is capped at the number of available rows: asking sample()
# for more rows than exist (without replacement) raises a ValueError.
df = df.sample(n=min(200000, len(df)), random_state=42)


In [None]:
#import random

#Randomly sample 200000 rows from data dictionary
#df = random.sample(data, 200000)

# Print the length of the subset dictionary
#print(len(df))


In [None]:
from sklearn.model_selection import train_test_split

# Select the feature columns and the gene_type target column.
feature_columns = ['chrom', 'start', 'end', 'sequence']
X, y = df[feature_columns], df['gene_type']

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# One-hot encode the chromosome feature
# - The dense-output switch is called `sparse_output` in current scikit-learn;
#   the old `sparse` keyword was deprecated in 1.2 and later removed, so the
#   old spelling is only used as a fallback on releases that reject the new one.
# - handle_unknown='ignore' encodes a chromosome that never occurs in the
#   training split as an all-zero row instead of raising inside transform().
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
# The encoder is fitted on the training split only and reused for the test split.
X_train_chrom = enc.fit_transform(X_train['chrom'].values.reshape(-1, 1))
X_test_chrom = enc.transform(X_test['chrom'].values.reshape(-1, 1))

# Tokenize the DNA sequence feature
# char_level=True makes every character of a sequence its own token.
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(X_train['sequence'])
X_train_seq = tokenizer.texts_to_sequences(X_train['sequence'])
X_test_seq = tokenizer.texts_to_sequences(X_test['sequence'])

# Pad the DNA sequence feature to a fixed length
# Every token list becomes exactly max_seq_length entries long.
max_seq_length = 100
X_train_seq = pad_sequences(X_train_seq, maxlen=max_seq_length)
X_test_seq = pad_sequences(X_test_seq, maxlen=max_seq_length)


In [None]:
import tensorflow as tf

# Convert gene_type labels to one-hot encoding
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Fit the label encoder on every label in df and pass the class count to
# to_categorical explicitly. The output layer, the training targets and the
# test targets then always share one width; fitting on y_train alone and
# letting to_categorical infer the width per split breaks as soon as a class
# is missing from one of the splits.
le = LabelEncoder()
le.fit(df['gene_type'])
n_output_classes = len(le.classes_)
y_train_enc = to_categorical(le.transform(y_train), num_classes=n_output_classes)
y_test_enc = to_categorical(le.transform(y_test), num_classes=n_output_classes)

# Model input: the one-hot chromosome columns followed by the padded sequence
# tokens, concatenated once and reused for training and validation.
train_inputs = tf.concat([X_train_chrom, X_train_seq], axis=1)
test_inputs = tf.concat([X_test_chrom, X_test_seq], axis=1)

# Fully connected classifier: two hidden ReLU layers with dropout and a softmax
# output with one unit per gene-type class.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(X_train_chrom.shape[1] + max_seq_length,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(n_output_classes, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not independent of the final test score.
history = model.fit(
    x=train_inputs,
    y=y_train_enc,
    validation_data=(test_inputs, y_test_enc),
    epochs=10,
    batch_size=32
)


In [None]:
# Evaluate the trained model on the test split. The input has the same layout as
# during training: chromosome one-hot columns followed by the padded tokens.
test_features = tf.concat([X_test_chrom, X_test_seq], axis=1)
score = model.evaluate(test_features, y_test_enc, verbose=0)

# evaluate() yields the loss first and the accuracy metric second.
test_loss, test_accuracy = score[0], score[1]
print(f'Test loss: {test_loss}')
print(f'Test accuracy: {test_accuracy}')
