## Imports

In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.feature_selection import SelectKBest

from lightgbm.sklearn import LGBMClassifier

## Loading the data

Download the data directly from the Google Cloud Storage bucket using `gsutil`:

In [2]:
# Recursively copy the competition bucket into ./indaba2023hack
!gsutil -m cp -r "gs://indaba2023hack/" .
# Move the downloaded files up into the current working directory
!mv ./indaba2023hack/* ./
# Remove the leftover download folder
!rm -rf ./indaba2023hack/

Copying gs://indaba2023hack/Test.csv...
/ [0/4 files][    0.0 B/126.9 MiB]   0% Done                                    Copying gs://indaba2023hack/Train_embeddings.npy...
/ [0/4 files][    0.0 B/126.9 MiB]   0% Done                                    Copying gs://indaba2023hack/Train.csv...
Copying gs://indaba2023hack/Test_embeddings.npy...
- [4/4 files][126.9 MiB/126.9 MiB] 100% Done                                    
Operation completed over 4 objects/126.9 MiB.                                    


In [3]:
# Load the labelled training table and preview its first rows
data_df = pd.read_csv(filepath_or_buffer="Train.csv")
data_df.head(n=5)

Unnamed: 0,ID,Sequence,Chromosome,Region,Target
0,ID_3uVXob,AAATCTTATTAATATATCTATATATAAATTTTTTAATATATTATAT...,Chromosome06,23466002-23467001,1
1,ID_ViisQV,GCGTCCTCTCACCAGCAGACAAGAATATAAAGATTGAACAACCAAA...,Chromosome16,27234572-27235571,0
2,ID_4HQWG5,TTCACCGAAAGCTTAGAGTAAGGAAAAAAAAAGGAACATTAAAATT...,Chromosome11,24316102-24317101,1
3,ID_E64KC4,AACCCTTATCAGAAAAGTCTTTATGGTATTCAACGGAGGCAAACAA...,Chromosome05,25321688-25322687,0
4,ID_3BPJGs,TTGTGAGGAATGAGATACACATACCCCTTTTAGATTATGAAATGTC...,Chromosome01,9570952-9571951,1


In [4]:
# The data description states that the middle 200 bp are essential, so keep
# them as their own column, and split Region ("start-end") into its two
# boundary columns
data_df['mid'] = data_df['Sequence'].str.slice(400, 600)
train_region_bounds = data_df['Region'].str.split('-', expand=True)
data_df['Start'], data_df['end'] = train_region_bounds[0], train_region_bounds[1]

In [5]:
# Encode the chromosome label (e.g. "Chromosome06") as its integer number so
# that it can be used as a feature
train_chromosome_number = data_df['Chromosome'].str.replace('Chromosome', '')
data_df['Chromosome'] = train_chromosome_number.astype('int')
# Region has already been split into Start / end, so the raw column is dropped
data_df.drop(columns='Region', inplace=True)

In [6]:
# Load the unlabelled test table and preview its first rows
test_df = pd.read_csv(filepath_or_buffer="Test.csv")
test_df.head(n=5)

Unnamed: 0,ID,Sequence,Chromosome,Region
0,ID_3qQWny,TTTTCTTCATCATATAAAAAGTTTTATTTTCTCTTAATACTTTTAT...,Chromosome03,16571402-16572401
1,ID_S4Jc8H,CTGTTGTTAGGGATAGGAATTGTGAAACTTTAAGCATCCATCATCA...,Chromosome03,1882527-1883526
2,ID_pnVWrE,TTTCAGTATTAGGCGAGCGTCAGCCAAAGGTGAGTAGAACTGAACT...,Chromosome15,16728813-16729812
3,ID_3Sg4Pp,CAACCTTGACCAAACTAGTAACCCAAGGTGATGGGGAGATAAGATA...,Chromosome18,23291126-23292125
4,ID_3yXmKG,GCCAAAGCTGAACATTTTAATTCCGCCAATCGTTCATTGGCTAGAT...,Chromosome10,25292527-25293526


In [7]:
# The data description states that the middle 200 bp are essential, so keep
# them as their own column, and split Region ("start-end") into its two
# boundary columns
test_df['mid'] = test_df['Sequence'].str.slice(400, 600)
test_region_bounds = test_df['Region'].str.split('-', expand=True)
test_df['Start'], test_df['end'] = test_region_bounds[0], test_region_bounds[1]

In [8]:
# Encode the chromosome label (e.g. "Chromosome03") as its integer number so
# that it can be used as a feature
test_chromosome_number = test_df['Chromosome'].str.replace('Chromosome', '')
test_df['Chromosome'] = test_chromosome_number.astype('int')
# Region has already been split into Start / end, so the raw column is dropped
test_df.drop(columns='Region', inplace=True)

In [9]:
# Helpers measuring how frequent a given base is, either in the whole
# sequence or in its middle 200 bp window
def all_count_letter(text, letter, length=1000):
    """Return the proportion of `letter` in a full sequence.

    Parameters
    ----------
    text : str
        The sequence to inspect.
    letter : str
        The base (or any substring) to count; non-overlapping occurrences
        are counted, as done by ``str.count``.
    length : int, default 1000
        Denominator of the proportion. The default is the divisor this
        notebook has always used for full sequences; pass another value
        for sequences of a different size.

    Returns
    -------
    float
        ``text.count(letter) / length``.
    """
    return text.count(letter) / length

def mid_count_letter(text, letter, length=200):
    """Return the proportion of `letter` in a middle-window sequence.

    Same contract as ``all_count_letter`` but the default denominator is
    200, the size of the ``[400:600]`` window sliced out of each sequence.
    """
    return text.count(letter) / length


Apply the functions above to both kinds of sequence (i.e. the full sequence and the middle 200 bp) for the train and test data, creating a new feature for each base.

In [10]:

# Proportion of each base over the full test sequences
for base in ('A', 'G', 'C', 'T'):
    test_df['all_' + base] = test_df['Sequence'].apply(all_count_letter, args=(base,))

In [11]:
# Proportion of each base over the full training sequences
for base in ('A', 'G', 'C', 'T'):
    data_df['all_' + base] = data_df['Sequence'].apply(all_count_letter, args=(base,))

In [12]:
# Proportion of each base within the middle window of the test sequences
for base in ('A', 'G', 'C', 'T'):
    test_df['mid_' + base] = test_df['mid'].apply(mid_count_letter, args=(base,))

In [13]:
# Proportion of each base within the middle window of the training sequences
for base in ('A', 'G', 'C', 'T'):
    data_df['mid_' + base] = data_df['mid'].apply(mid_count_letter, args=(base,))

In [14]:
# Split the distinct chromosome numbers (not the rows) into a training and a
# validation group, so each chromosome ends up entirely on one side
chromosome_ids = data_df['Chromosome'].unique()
train_chromo, valid_chromo = train_test_split(
    chromosome_ids,
    test_size=0.2,
    random_state=42,
)
train_chromo.shape, valid_chromo.shape

((9,), (3,))

In [15]:
# Materialise the training / validation rows from the chromosome split
is_train_row = data_df['Chromosome'].isin(train_chromo)
is_valid_row = data_df['Chromosome'].isin(valid_chromo)
train_df = data_df.loc[is_train_row]
valid_df = data_df.loc[is_valid_row]
train_df.shape, valid_df.shape

((10328, 15), (2897, 15))

# Modelling with InstaDeep's **AgroNT**!

Modelling with the power of InstaDeep's AgroNT.

In [16]:
embeds = np.load("Train_embeddings.npy")
test_embed = np.load("Test_embeddings.npy")

# Split embeddings and targets with the same chromosome masks.
# NOTE(review): this assumes row i of Train_embeddings.npy belongs to row i of
# Train.csv - confirm against the data description.
train_rows = data_df['Chromosome'].isin(train_chromo)
valid_rows = data_df['Chromosome'].isin(valid_chromo)

train_embed, train_target = embeds[train_rows], data_df.loc[train_rows, 'Target'].values
valid_embed, valid_target = embeds[valid_rows], data_df.loc[valid_rows, 'Target'].values

In [17]:
# Wrap each embedding matrix in a DataFrame so extra features can be appended
# as columns: df <- training rows, test <- validation rows,
# sub_test <- rows of the submission (test) set
df, test, sub_test = (
    pd.DataFrame(matrix) for matrix in (train_embed, valid_embed, test_embed)
)

Adding important features from the original data to the dataframe containing the embeddings

In [18]:
# Append the full-sequence base proportions to the submission features
for column in ('all_A', 'all_G', 'all_T', 'all_C'):
    sub_test[column] = test_df[column]

In [19]:
# Append the middle-window base proportions to the submission features
for column in ('mid_A', 'mid_G', 'mid_T', 'mid_C'):
    sub_test[column] = test_df[column]

In [20]:
# Append the full-sequence base proportions to the validation (test) and
# training (df) features; .values drops the index so rows are matched by position
valid_rows = data_df['Chromosome'].isin(valid_chromo)
train_rows = data_df['Chromosome'].isin(train_chromo)
for column in ('all_A', 'all_G', 'all_T', 'all_C'):
    test[column] = data_df.loc[valid_rows, column].values
    df[column] = data_df.loc[train_rows, column].values

In [21]:
# Append the middle-window base proportions to the validation (test) and
# training (df) features; .values drops the index so rows are matched by position
valid_rows = data_df['Chromosome'].isin(valid_chromo)
train_rows = data_df['Chromosome'].isin(train_chromo)
for column in ('mid_A', 'mid_G', 'mid_T', 'mid_C'):
    test[column] = data_df.loc[valid_rows, column].values
    df[column] = data_df.loc[train_rows, column].values

In [22]:
# Region start coordinate for the validation and training features
test['Start'] = data_df.loc[data_df['Chromosome'].isin(valid_chromo), 'Start'].values
df['Start'] = data_df.loc[data_df['Chromosome'].isin(train_chromo), 'Start'].values

In [23]:
# Region end coordinate for the validation and training features
test['end'] = data_df.loc[data_df['Chromosome'].isin(valid_chromo), 'end'].values
df['end'] = data_df.loc[data_df['Chromosome'].isin(train_chromo), 'end'].values

In [24]:
# Integer chromosome identifier for the validation and training features
test['chromo'] = data_df.loc[data_df['Chromosome'].isin(valid_chromo), 'Chromosome'].values
df['chromo'] = data_df.loc[data_df['Chromosome'].isin(train_chromo), 'Chromosome'].values


In [25]:
# Add the region start / end coordinates and the chromosome identifier to the
# submission features.
# Start / end come from a string split, so they are cast to int here as well:
# df and test are cast just below, and leaving sub_test as strings would make
# the prediction input depend on implicit string-to-number coercion.
sub_test['Start'] = test_df['Start'].astype('int')
sub_test['end'] = test_df['end'].astype('int')
sub_test['chromo'] = test_df['Chromosome']


# Same cast for the training features ...
df['Start'] = df['Start'].astype('int')
df['end'] = df['end'].astype('int')

# ... and for the validation features
test['Start'] = test['Start'].astype('int')
test['end'] = test['end'].astype('int')

In [26]:
# The embedding columns are labelled with integers while the added features
# are labelled with strings; make every column label a str
for feature_frame in (sub_test, df, test):
    feature_frame.columns = feature_frame.columns.astype('str')

# Model Development

In [27]:
# Baseline LightGBM binary classifier with default hyperparameters
lgb = LGBMClassifier(
    objective='binary',
    n_jobs=-1,
    random_state=42,
)

In [28]:
# Defining the loss function for classification (accuracy in this case)
# for a hyperopt tuning

# Running this Cell is time-consuming so the run portions have been commented
# out but the resulting hyperparameters have been extracted and used in the
# cell below
def classification_loss(params):
    """Hyperopt objective: validation error of a scaled, feature-selected LGBM.

    Builds a StandardScaler -> SelectKBest -> LGBMClassifier pipeline from the
    sampled hyperparameters, fits it on the notebook-level training features
    (`df`, `train_target`) and scores it on the validation features
    (`test`, `valid_target`).

    Parameters
    ----------
    params : dict
        One sample from the hyperopt search space. Expected keys:
        boosting_type, learning_rate, n_estimators, min_child_samples,
        reg_alpha, reg_lambda, max_depth, min_child_weight, min_split_gain,
        bagging_freq, bagging_fraction, feature_fraction, path_smooth, k.

    Returns
    -------
    dict
        ``{'loss': 1 - accuracy, 'status': STATUS_OK}`` as expected by
        ``hyperopt.fmin``.
    """
    # hp.quniform yields floats, so every integer-valued setting is cast
    # explicitly before it reaches scikit-learn / LightGBM.
    k_best = int(params['k'])

    # `max_iter` is deliberately NOT passed: LightGBM treats it as an alias of
    # the number of boosting rounds, so it would override the sampled
    # `n_estimators` and make that search dimension meaningless.
    classifier = LGBMClassifier(
        boosting_type=params['boosting_type'],  # 'gbdt' or 'dart'
        learning_rate=float(params['learning_rate']),
        n_estimators=int(params['n_estimators']),
        min_child_samples=int(params['min_child_samples']),
        reg_alpha=float(params['reg_alpha']),
        reg_lambda=float(params['reg_lambda']),
        max_depth=int(params['max_depth']),
        min_child_weight=float(params['min_child_weight']),
        min_split_gain=float(params['min_split_gain']),
        bagging_freq=int(params['bagging_freq']),
        bagging_fraction=float(params['bagging_fraction']),
        feature_fraction=float(params['feature_fraction']),
        path_smooth=float(params['path_smooth']),
        random_state=2,
        n_jobs=-1)

    # Create the pipeline with the given hyperparameters
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('feature_selection', SelectKBest(k=k_best)),
        ('classifier', classifier)])

    # Model fitting and scoring (accuracy) on the held-out chromosomes
    pipeline.fit(df, train_target)
    accuracy = pipeline.score(test, valid_target)

    # hyperopt minimises, so the loss is the error rate
    return {'loss': 1 - accuracy, 'status': STATUS_OK}

# The search space for the hyperparameters.
# Note: hp.loguniform(label, low, high) draws exp(uniform(low, high)), so its
# bounds are natural-log exponents; hp.quniform returns floats.
space = {
    'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart']),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),  # exp(-3) to exp(0): roughly 0.05 to 1.0
    'n_estimators': hp.quniform('n_estimators', 50, 500, 10),
    'min_child_samples': hp.quniform('min_child_samples', 5, 50, 5),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'max_depth': hp.quniform('max_depth', -1,50,1),  # -1 represents no limit
    'min_child_weight': hp.loguniform('min_child_weight', -3, 1),  # exp(-3) to exp(1): roughly 0.05 to 2.72
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.2),
    'bagging_freq': hp.quniform('bagging_freq', 0, 5, 1),
    'bagging_fraction': hp.uniform('bagging_fraction', 0.6, 1.0),
    'feature_fraction': hp.uniform('feature_fraction', 0.6, 1.0),
    'path_smooth': hp.uniform('path_smooth', 0, 1),
    'k': hp.quniform('k',500,1200,50)
}


# Trials object that keeps track of the optimization process
trials = Trials()

# Hyperparameter optimization using the Tree-structured Parzen Estimator (TPE) algorithm

#best = fmin(fn=classification_loss, space=space, algo=tpe.suggest, max_evals=6, trials=trials)

# Printing the best hyperparameters found

#print("Best hyperparameters:")
#print(best)


In [29]:
# Final pipeline, built from the best hyperparameters reported by the
# hyperopt search.
# NOTE(review): LightGBM documents `max_iter` as an alias of the number of
# boosting rounds; when it is passed together with `n_estimators` the alias is
# expected to win, so this model likely trains 5000 rounds rather than 120 -
# confirm which value is intended before changing either.
tuned_lgbm_params = dict(
    boosting_type='gbdt',
    learning_rate=0.06025942784583524,
    n_estimators=120,
    min_child_samples=10,
    reg_alpha=0.5292571452457728,
    reg_lambda=0.5860506586879405,
    max_depth=38,
    min_child_weight=1.4489276427214004,
    min_split_gain=0.07353635639056559,
    bagging_freq=2,
    bagging_fraction=0.7144619043935099,
    feature_fraction=0.7377391018850801,
    path_smooth=0.8754542801363957,
    random_state=2,
    n_jobs=-1,
    max_iter=5000,
)

lgbtuned = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('feature_selection', SelectKBest(k=1150)),
    ('classifier', LGBMClassifier(**tuned_lgbm_params)),
])

In [30]:
# Fit the tuned pipeline on the training features and targets
lgbtuned.fit(X=df, y=train_target)



[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m


In [31]:
# Predict the target for every row of the submission (test) features
results = lgbtuned.predict(X=sub_test)



In [32]:
# Build the submission table (one predicted Target per test ID) and check
# the balance of the predicted classes
sub_df = test_df[['ID']].assign(Target=results)
sub_df['Target'].value_counts()

0    2947
1    2721
Name: Target, dtype: int64

In [34]:
# Write the submission file, leaving out the DataFrame index
sub_df.to_csv(path_or_buf='lgbm_hyp_dftrained_submit.csv', index=False)

#END