In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('merged_data.csv')

In [3]:
rfe = ['transcript_id',
'transcript_position',
'sequence',
'gene_id',
'avg_central_mean',
'avg_1+flank_std',
'med_central_std',
'med_central_mean',
'med_1+flank_std',
'std_1-flank_std',
'std_1-flank_mean',
'std_central_std',
'std_central_mean',
'std_1+flank_std',
'std_1+flank_mean',
'label']
df = df[rfe]

In [4]:
df.shape

(121838, 16)

In [5]:
df['seq_left'] = df['sequence'].str[0:5]
df['seq_center'] = df['sequence'].str[1:6]
df['seq_right'] = df['sequence'].str[2:7]

In [6]:
df.head()

Unnamed: 0,transcript_id,transcript_position,sequence,gene_id,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,label,seq_left,seq_center,seq_right
0,ENST00000000233,244,AAGACCA,ENSG00000004059,125.913514,4.386989,6.65,126.0,3.44,1.843025,4.944365,3.302671,2.765244,2.906225,2.522168,0,AAGAC,AGACC,GACCA
1,ENST00000000233,261,CAAACTG,ENSG00000004059,107.889535,3.016599,3.0,108.0,2.66,1.689167,2.981182,1.237045,3.526528,1.466295,2.49253,0,CAAAC,AAACT,AACTG
2,ENST00000000233,316,GAAACAG,ENSG00000004059,98.947027,2.087146,3.78,99.5,1.91,1.311103,1.492577,0.893481,3.203816,0.800496,1.484341,0,GAAAC,AAACA,AACAG
3,ENST00000000233,332,AGAACAT,ENSG00000004059,97.8365,2.23652,2.635,97.5,2.135,2.459317,3.202963,1.393868,1.928009,0.741771,3.494937,0,AGAAC,GAACA,AACAT
4,ENST00000000233,368,AGGACAA,ENSG00000004059,121.954545,4.260253,5.66,122.0,4.16,2.743228,2.664643,1.753894,2.177236,1.341822,2.379045,0,AGGAC,GGACA,GACAA


In [7]:
df_le = df.copy()
label_encoder = LabelEncoder()

seq_data = ['seq_left','seq_center','seq_right']
for seq in seq_data: 
    encoded_labels = label_encoder.fit_transform(df_le[seq])
    df_le[seq] = encoded_labels

df_le.head()

Unnamed: 0,transcript_id,transcript_position,sequence,gene_id,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,label,seq_left,seq_center,seq_right
0,ENST00000000233,244,AAGACCA,ENSG00000004059,125.913514,4.386989,6.65,126.0,3.44,1.843025,4.944365,3.302671,2.765244,2.906225,2.522168,0,1,4,16
1,ENST00000000233,261,CAAACTG,ENSG00000004059,107.889535,3.016599,3.0,108.0,2.66,1.689167,2.981182,1.237045,3.526528,1.466295,2.49253,0,6,2,10
2,ENST00000000233,316,GAAACAG,ENSG00000004059,98.947027,2.087146,3.78,99.5,1.91,1.311103,1.492577,0.893481,3.203816,0.800496,1.484341,0,12,0,2
3,ENST00000000233,332,AGAACAT,ENSG00000004059,97.8365,2.23652,2.635,97.5,2.135,2.459317,3.202963,1.393868,1.928009,0.741771,3.494937,0,2,6,3
4,ENST00000000233,368,AGGACAA,ENSG00000004059,121.954545,4.260253,5.66,122.0,4.16,2.743228,2.664643,1.753894,2.177236,1.341822,2.379045,0,3,9,12


In [8]:
df_le = df_le.drop(["sequence"], axis=1)

In [9]:
df_le.columns

Index(['transcript_id', 'transcript_position', 'gene_id', 'avg_central_mean',
       'avg_1+flank_std', 'med_central_std', 'med_central_mean',
       'med_1+flank_std', 'std_1-flank_std', 'std_1-flank_mean',
       'std_central_std', 'std_central_mean', 'std_1+flank_std',
       'std_1+flank_mean', 'label', 'seq_left', 'seq_center', 'seq_right'],
      dtype='object')

In [10]:
from sklearn.model_selection import GroupShuffleSplit
group_split = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_split.split(df_le, groups=df['gene_id']))

train = df_le.iloc[train_idx]
test = df_le.iloc[test_idx]

In [11]:
train.shape

(98006, 18)

In [12]:
test.shape

(23832, 18)

In [13]:
len(train[train['label'] == 0])

93715

In [14]:
len(train[train['label'] == 1])

4291

In [15]:
1 - 4332/93138

0.9534883720930233

In [16]:
X_train = train.drop(['transcript_id', 'gene_id', 'label'], axis=1)
y_train = train['label']

In [17]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampledtrain, y_resampledtrain = smote.fit_resample(X_train, y_train)


In [18]:
original_class_counts = y_train.value_counts()
resampled_class_counts = pd.Series(y_resampledtrain).value_counts()

print("Original Class Distribution:")
print(original_class_counts)

print("\nResampled Class Distribution:")
print(resampled_class_counts)

Original Class Distribution:
label
0    93715
1     4291
Name: count, dtype: int64

Resampled Class Distribution:
label
0    93715
1    93715
Name: count, dtype: int64


In [19]:
transcript_id_train = train['transcript_id']
gene_id_train = train['gene_id']

In [21]:
resampled_train = pd.DataFrame(data=X_resampledtrain, columns=X_train.columns)
resampled_train['label'] = y_resampledtrain

transcript_id_train.reset_index(drop=True, inplace=True)
gene_id_train.reset_index(drop=True, inplace=True)

resampled_train['transcript_id'] = transcript_id_train
resampled_train['gene_id'] = gene_id_train
resampled_train.to_csv('traindata_rfe.csv', index=False)


In [22]:
test.to_csv('testdata_rfe.csv', index=False)

In [23]:
resampled_train.head()

Unnamed: 0,transcript_position,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,seq_left,seq_center,seq_right,label,transcript_id,gene_id
0,244,125.913514,4.386989,6.65,126.0,3.44,1.843025,4.944365,3.302671,2.765244,2.906225,2.522168,1,4,16,0,ENST00000000233,ENSG00000004059
1,261,107.889535,3.016599,3.0,108.0,2.66,1.689167,2.981182,1.237045,3.526528,1.466295,2.49253,6,2,10,0,ENST00000000233,ENSG00000004059
2,316,98.947027,2.087146,3.78,99.5,1.91,1.311103,1.492577,0.893481,3.203816,0.800496,1.484341,12,0,2,0,ENST00000000233,ENSG00000004059
3,332,97.8365,2.23652,2.635,97.5,2.135,2.459317,3.202963,1.393868,1.928009,0.741771,3.494937,2,6,3,0,ENST00000000233,ENSG00000004059
4,368,121.954545,4.260253,5.66,122.0,4.16,2.743228,2.664643,1.753894,2.177236,1.341822,2.379045,3,9,12,0,ENST00000000233,ENSG00000004059


In [24]:
test.head()

Unnamed: 0,transcript_id,transcript_position,gene_id,avg_central_mean,avg_1+flank_std,med_central_std,med_central_mean,med_1+flank_std,std_1-flank_std,std_1-flank_mean,std_central_std,std_central_mean,std_1+flank_std,std_1+flank_mean,label,seq_left,seq_center,seq_right
153,ENST00000005257,470,ENSG00000006451,121.032258,2.513839,9.26,121.0,2.39,3.070013,4.203054,1.73688,3.126204,0.869722,1.267762,0,11,17,21
154,ENST00000005257,504,ENSG00000006451,126.029412,2.674206,4.92,126.0,2.315,1.562898,2.673237,1.436917,2.345023,1.113461,1.57646,0,3,11,20
155,ENST00000005257,525,ENSG00000006451,125.387097,3.073226,6.15,126.0,2.88,1.130465,3.727447,2.105785,2.61111,1.273123,2.040985,0,7,3,14
156,ENST00000005257,600,ENSG00000006451,122.606061,3.255455,5.94,122.0,3.03,2.399447,2.9383,2.782057,3.311499,1.286042,1.975153,0,3,11,20
157,ENST00000005257,696,ENSG00000006451,121.965517,3.116897,7.81,122.0,2.82,1.898227,3.906871,2.458343,3.537803,1.348092,1.171399,0,11,17,23
