In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the phenotype table, then build one (Sample_ID, label) table per
# antibiotic, keeping only the samples whose label for that drug is not missing.
resistance_data = pd.read_csv('antibio_dataset/resist.csv')
azithromycin_res = resistance_data[['Sample_ID', 'azm_sr']].dropna(subset=['azm_sr'])
cefixime_res = resistance_data[['Sample_ID', 'cfx_sr']].dropna(subset=['cfx_sr'])
ciprofloxacin_res = resistance_data[['Sample_ID', 'cip_sr']].dropna(subset=['cip_sr'])

In [3]:
# One tab-separated k-mer table per antibiotic (read_table defaults to sep='\t').
kmers_azithromycin = pd.read_table('antibio_dataset/kmers_azithromycin.tsv')
kmers_cefixime = pd.read_table('antibio_dataset/kmers_cefixime.tsv')
kmers_ciprofloxacin = pd.read_table('antibio_dataset/kmers_ciprofloxacin.tsv')

#### Azithromycin

In [4]:
# Swap rows and columns of the k-mer table.
kmers_azithromycin = kmers_azithromycin.T

# Promote the first row of the transposed frame to column headers and keep
# only the rows below it.
new_header = kmers_azithromycin.iloc[0]
kmers_azithromycin = kmers_azithromycin.iloc[1:]
kmers_azithromycin.columns = new_header

# Turn the index into a regular 'Sample_ID' column so the layout matches the
# resistance tables.
kmers_azithromycin = kmers_azithromycin.reset_index().rename(columns={'index': 'Sample_ID'})
# NOTE(review): after reset_index() the row index is already unnamed, so this
# line looks like a no-op; the header name lives on `.columns.name` - confirm
# which one was meant.
kmers_azithromycin.index.name = None

#### Cefixime


In [5]:
# Swap rows and columns of the k-mer table.
kmers_cefixime = kmers_cefixime.T

# Promote the first row of the transposed frame to column headers and keep
# only the rows below it.
new_header = kmers_cefixime.iloc[0]
kmers_cefixime = kmers_cefixime.iloc[1:]
kmers_cefixime.columns = new_header

# Turn the index into a regular 'Sample_ID' column so the layout matches the
# resistance tables.
kmers_cefixime = kmers_cefixime.reset_index().rename(columns={'index': 'Sample_ID'})
# NOTE(review): after reset_index() the row index is already unnamed, so this
# line looks like a no-op; the header name lives on `.columns.name` - confirm
# which one was meant.
kmers_cefixime.index.name = None

#### Ciprofloxacin


In [6]:
# Swap rows and columns of the k-mer table.
kmers_ciprofloxacin = kmers_ciprofloxacin.T

# Promote the first row of the transposed frame to column headers and keep
# only the rows below it.
new_header = kmers_ciprofloxacin.iloc[0]
kmers_ciprofloxacin = kmers_ciprofloxacin.iloc[1:]
kmers_ciprofloxacin.columns = new_header

# Turn the index into a regular 'Sample_ID' column so the layout matches the
# resistance tables.
kmers_ciprofloxacin = kmers_ciprofloxacin.reset_index().rename(columns={'index': 'Sample_ID'})
# NOTE(review): after reset_index() the row index is already unnamed, so this
# line looks like a no-op; the header name lives on `.columns.name` - confirm
# which one was meant.
kmers_ciprofloxacin.index.name = None

### Merging the data


In [7]:
# Attach the resistance label to each sample's k-mer row. A right join keeps
# every row of the resistance table; a sample without a matching k-mer row
# would therefore appear with NaN in all k-mer columns.
azithromycin_data = pd.merge(kmers_azithromycin, azithromycin_res, how='right', on='Sample_ID')
cefixime_data = pd.merge(kmers_cefixime, cefixime_res, how='right', on='Sample_ID')
ciprofloxacin_data = pd.merge(kmers_ciprofloxacin, ciprofloxacin_res, how='right', on='Sample_ID')

In [8]:
# In each merged table the last column is the label taken from the resistance
# table; every column before it (Sample_ID included) goes into X.
X_azithromycin, y_azithromycin = azithromycin_data.iloc[:, :-1], azithromycin_data.iloc[:, -1]

X_cefixime, y_cefixime = cefixime_data.iloc[:, :-1], cefixime_data.iloc[:, -1]

X_ciprofloxacin, y_ciprofloxacin = ciprofloxacin_data.iloc[:, :-1], ciprofloxacin_data.iloc[:, -1]

### Feature selection

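We drop the k-mers that are present in almost all samples or absent from almost all samples, and keep only those whose occurrence count lies between a lower and an upper fraction of the number of samples.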

In [9]:
def delete_columns(df, u, l):
    """Drop feature columns whose column sum is too high or too low.

    The first column of ``df`` is treated as an identifier: it is never
    inspected and is always kept. Every other column is removed when its sum
    is strictly greater than ``len(df) * u`` or strictly smaller than
    ``len(df) * l``. Sums exactly on a threshold are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first column is left untouched and whose remaining
        columns are summable.
    u : float
        Upper bound, as a fraction of the number of rows.
    l : float
        Lower bound, as a fraction of the number of rows.

    Returns
    -------
    pandas.DataFrame
        A new frame with the surviving columns in their original order;
        ``df`` itself is not modified.
    """
    # Nothing to filter when there is no column after the identifier.
    if df.shape[1] <= 1:
        return df.copy()

    n_rows = len(df)
    # A single vectorised reduction replaces the per-column Python loop. It
    # works by position, so duplicated column labels no longer make
    # `df[col].sum()` return a Series and crash the `if` test.
    col_sums = df.iloc[:, 1:].sum(axis=0)
    out_of_range = ((col_sums > n_rows * u) | (col_sums < n_rows * l)).to_numpy(dtype=bool)

    # Position 0 (the identifier column) is always retained.
    keep_positions = [0] + [
        pos for pos, drop in enumerate(out_of_range, start=1) if not drop
    ]
    return df.iloc[:, keep_positions]

In [12]:
# Drop k-mer columns whose sum is above 95% or below 5% of the sample count.
X_azithromycin_selected = delete_columns(X_azithromycin, u=0.95, l=0.05)

In [13]:
# Drop k-mer columns whose sum is above 95% or below 5% of the sample count.
X_cefixime_selected = delete_columns(X_cefixime, u=0.95, l=0.05)

In [14]:
# Drop k-mer columns whose sum is above 80% or below 20% of the sample count.
# NOTE(review): these bounds are tighter than the 95% / 5% used for the other
# two antibiotics - confirm this is intentional.
X_ciprofloxacin_selected = delete_columns(X_ciprofloxacin, u=0.8, l=0.2)

# Model selection

### Train-test split

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 70/30 train/test split per antibiotic with a fixed seed. `.iloc[:, 1:]`
# strips the leading Sample_ID column so that only k-mer features reach the model.
(X_train_azithromycin, X_test_azithromycin,
 y_train_azithromycin, y_test_azithromycin) = train_test_split(
    X_azithromycin_selected.iloc[:, 1:],
    y_azithromycin,
    test_size=0.3,
    random_state=42,
)
(X_train_cefixime, X_test_cefixime,
 y_train_cefixime, y_test_cefixime) = train_test_split(
    X_cefixime_selected.iloc[:, 1:],
    y_cefixime,
    test_size=0.3,
    random_state=42,
)
(X_train_ciprofloxacin, X_test_ciprofloxacin,
 y_train_ciprofloxacin, y_test_ciprofloxacin) = train_test_split(
    X_ciprofloxacin_selected.iloc[:, 1:],
    y_ciprofloxacin,
    test_size=0.3,
    random_state=42,
)

## Linear SVM

### Azithromycin

In [26]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score as acc
# Fit a linear-kernel SVM on the azithromycin training split, then score it on
# both the training and the held-out split.
model = SVC(kernel='linear')
model.fit(X_train_azithromycin, y_train_azithromycin)
y_pred_train_azithromycin = model.predict(X_train_azithromycin)
y_pred_test_azithromycin = model.predict(X_test_azithromycin)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the documented order avoids silently wrong numbers if
# the metric is later swapped for an asymmetric one (precision, recall, ...).
accuracy_train = acc(y_train_azithromycin, y_pred_train_azithromycin)
accuracy_test = acc(y_test_azithromycin, y_pred_test_azithromycin)
print(f"Accuray on training set: {accuracy_train} ")
print(f"Accuray on test set: {accuracy_test} ")

Accuray on training set: 0.9753492193919474 
Accuray on test set: 0.9636015325670498 


### Cefixime

In [27]:
# Fit a linear-kernel SVM on the cefixime training split, then score it on
# both the training and the held-out split.
model = SVC(kernel='linear')
model.fit(X_train_cefixime, y_train_cefixime)
y_pred_train_cefixime = model.predict(X_train_cefixime)
y_pred_test_cefixime = model.predict(X_test_cefixime)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the documented order avoids silently wrong numbers if
# the metric is later swapped for an asymmetric one (precision, recall, ...).
accuracy_train = acc(y_train_cefixime, y_pred_train_cefixime)
accuracy_test = acc(y_test_cefixime, y_pred_test_cefixime)
print(f"Accuray on training set: {accuracy_train} ")
print(f"Accuray on test set: {accuracy_test} ")

Accuray on training set: 0.9983193277310924 
Accuray on test set: 0.9990205680705191 


### Ciprofloxacin

In [28]:
# Fit a linear-kernel SVM on the ciprofloxacin training split, then score it on
# both the training and the held-out split.
model = SVC(kernel='linear')
model.fit(X_train_ciprofloxacin, y_train_ciprofloxacin)
y_pred_train_ciprofloxacin = model.predict(X_train_ciprofloxacin)
y_pred_test_ciprofloxacin = model.predict(X_test_ciprofloxacin)
# accuracy_score is documented as (y_true, y_pred). Accuracy itself is
# symmetric, but keeping the documented order avoids silently wrong numbers if
# the metric is later swapped for an asymmetric one (precision, recall, ...).
accuracy_train = acc(y_train_ciprofloxacin, y_pred_train_ciprofloxacin)
accuracy_test = acc(y_test_ciprofloxacin, y_pred_test_ciprofloxacin)
print(f"Accuray on training set: {accuracy_train} ")
print(f"Accuray on test set: {accuracy_test} ")

Accuray on training set: 0.999537251272559 
Accuray on test set: 0.9546925566343042 
