# Project Setup

You must have access to the "PROJECT 2" folder on Google Drive for this notebook to run properly. The notebook must also be run in Google Colab.

https://drive.google.com/drive/folders/1VdeCGZHseDDQfzhV2jRnSmbxW39bp2_g?usp=sharing


## Mounting Google Drive



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Imports

In [3]:
!pip install biopython
import numpy as np
import pandas as pd
import sklearn.model_selection
import matplotlib.pyplot as plt
from Bio import SeqIO
from Bio.SeqIO.FastaIO import SimpleFastaParser

Collecting biopython
[?25l  Downloading https://files.pythonhosted.org/packages/5a/42/de1ed545df624180b84c613e5e4de4848f72989ce5846a74af6baa0737b9/biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3MB)
[K     |████████████████████████████████| 2.3MB 6.3MB/s 
Installing collected packages: biopython
Successfully installed biopython-1.79


# Helper Functions

## FASTA to DataFrame of Sequences

In [4]:
def fastaDF (input_file,gene_n,drug):
  """converts a FASTA file to a pandas DataFrame. 
    parameters:
    input_file is the file path from current directory to the file. should be a .fa or .fasta file (string)
    gene_n is the name of the gene for which the sequences are in the FASTA file (string)
    drug : name of the antibiotic that the species have resistance for (string)
  """

  #import cipro gyrA FASTA and make DataFrame
  with open(input_file) as fasta_file: #cipro
      identifiers = []
      bacteria_IDs = []
      names = []
      lengths = []
      sequences = []
      gene = []
      for seq_record in SeqIO.parse(fasta_file, 'fasta'):  # (generator)
          identifiers.append(seq_record.id)
          lengths.append(len(seq_record.seq))
          s,d = seq_record.description.split('[')
          d = d[0:len(d)-1]
          bacteria_IDs.append(d)
          bact_name = d.split(' ')[0:2]
          n = ' '.join(bact_name[0:2])
          names.append(n)
          sequences.append(seq_record.seq)
      gene.append(names)
      gene.append(bacteria_IDs)
      gene.append(sequences)
      gene_df = pd.DataFrame(gene,index=['Bacteria Name','Strain',f'{gene_n} Sequence {drug}']).transpose()
  df1=gene_df.set_index('Bacteria Name')
  return df1

## Bag of Words (K-mers)

In [5]:
def makeKmers(sequ,gene,drug):
  '''
  create list of k-mer words in DataFrame, along with bacteria name as index
  gene is string name
  drug is single letter abbreviation of drug name
  '''

  def getKmers(sequence,ksize=6):
    '''
    function to convert sequence strings into k-mer words, default size = 6 (hexamer words)
    sequence input must be in String format
    '''
    return [sequence[x:x+ksize] for x in range(len(sequence) - ksize + 1)]

  ind1 = []
  seq1 = []
  df_test = []
  for i, s in sequ.iterrows():
      ind1.append(i)
      seq1.append(str(s[f'{gene} Sequence {drug}']))
  df_test.append(ind1)
  df_test.append(seq1)
  df_test = pd.DataFrame(df_test,index=['Bacteria Name',f'{gene}-{drug} Seq']).transpose()
  df_test=df_test.set_index(['Bacteria Name'])
  df_test=df_test.sort_index()
  df=df_test
  df[f'{gene}-{drug}']=df.apply(lambda x: getKmers(x[f'{gene}-{drug} Seq']), axis=1)
  df = df.drop([f'{gene}-{drug} Seq'],axis=1)
  return df

# Data Processing

In [6]:
#Cipro gyrA FASTA to DF
gyr_path = '/content/drive/Shareddrives/Project2_Drive/PROJECT 2/combined_sequences/combined_gyra.fasta'
df_gyrA_cipro=fastaDF(gyr_path,'gyrA','C')
df_parC_cipro=fastaDF('/content/drive/Shareddrives/Project2_Drive/PROJECT 2/combined_sequences/combined_parc.fasta','parC','C')
df_gyrA_moxi = fastaDF('/content/drive/Shareddrives/Project2_Drive/PROJECT 2/combined_sequences/gyrA_moxi.fasta','gyrA','M')
df_parC_moxi = fastaDF('/content/drive/Shareddrives/Project2_Drive/PROJECT 2/combined_sequences/parC_moxi.fasta', 'parC', 'M')

In [7]:
df_cipro = pd.merge(df_gyrA_cipro,df_parC_cipro,how="inner", on=['Bacteria Name','Strain'])
df_cipro = df_cipro.drop("Strain",axis=1)
df_cipro = df_cipro.drop("Staphylococcus hominis",axis=0)
df_cipro.sort_index()

Unnamed: 0_level_0,gyrA Sequence C,parC Sequence C
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Acinetobacter lwoffii,"(a, t, g, a, g, c, g, t, a, t, c, g, g, a, a, ...","(a, t, g, a, c, a, a, g, c, c, t, t, g, c, g, ..."
Aerococcus urinae,"(a, t, g, g, t, t, g, a, a, g, a, a, c, a, t, ...","(a, t, g, g, c, g, a, t, t, g, a, t, a, t, t, ..."
Alcaligenes faecalis,"(a, t, g, g, a, t, t, c, c, t, t, t, g, c, c, ...","(a, t, g, g, a, c, a, g, c, a, a, t, c, a, a, ..."
Citrobacter braakii,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."
Citrobacter freundii,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."
Clostridium perfringens,"(a, t, g, g, c, t, a, a, g, a, a, g, a, a, t, ...","(a, t, g, a, g, c, t, t, a, a, a, t, g, a, g, ..."
Enterobacter cloacae,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."
Enterococcus faecalis,"(a, t, g, a, g, t, g, a, a, g, a, a, a, t, t, ...","(t, t, g, g, a, a, a, a, a, c, g, c, c, a, a, ..."
Haemophilus influenzae,"(a, t, g, a, c, a, a, a, t, a, t, c, a, a, c, ...","(a, t, g, a, c, g, g, a, t, t, c, a, a, t, c, ..."
Haemophilus parainfluenzae,"(a, t, g, a, c, g, g, a, t, t, c, a, a, t, c, ...","(a, t, g, a, g, c, a, a, t, a, t, t, a, a, c, ..."


In [8]:
df_moxi = pd.merge(df_gyrA_moxi,df_parC_moxi,how="inner", on=['Bacteria Name'],suffixes=(': gyrA moxi',': parC moxi'))
df_moxi = df_moxi.sort_index()
df_moxi=df_moxi.drop_duplicates("Strain: gyrA moxi",keep='first')
df_moxi=df_moxi.drop_duplicates("Strain: parC moxi",keep='first')
df_moxi=df_moxi.drop(["Strain: parC moxi",'Strain: gyrA moxi'],axis=1)
df_moxi.sort_index()

Unnamed: 0_level_0,gyrA Sequence M,parC Sequence M
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Bacteroides caccae,"(a, t, g, c, t, t, g, a, a, c, a, a, g, a, c, ...","(a, t, g, a, g, t, g, a, c, g, a, a, a, t, t, ..."
Bacteroides fragilis,"(a, t, g, c, t, t, g, a, a, c, a, a, g, a, c, ...","(a, t, g, a, g, c, g, a, a, g, a, g, a, a, c, ..."
Bacteroides thetaiotaomicron,"(a, t, g, c, t, t, g, a, a, c, a, a, g, a, c, ...","(a, t, g, a, g, t, g, a, c, g, a, a, a, t, c, ..."
Bacteroides vulgatus,"(a, t, g, a, t, t, g, a, a, c, t, g, g, g, a, ...","(a, t, g, a, g, c, g, a, c, g, a, t, t, t, t, ..."
Citrobacter freundii,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."
Clostridium perfringens,"(a, t, g, a, g, c, t, t, a, a, a, t, g, a, g, ...","(a, t, g, g, c, t, a, a, g, a, a, g, a, a, t, ..."
Enterobacter cloacae,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."
Enterococcus faecalis,"(a, t, g, a, g, t, g, a, a, g, a, a, a, t, t, ...","(t, t, g, g, a, a, a, a, a, c, g, c, c, a, a, ..."
Enterococcus faecium,"(a, t, g, a, g, t, g, a, a, g, a, a, a, t, c, ...","(a, t, g, g, a, a, a, a, t, c, g, a, c, a, a, ..."
Escherichia coli,"(a, t, g, a, g, c, g, a, c, c, t, t, g, c, g, ...","(a, t, g, a, g, c, g, a, t, a, t, g, g, c, a, ..."


In [9]:
df = pd.concat([df_moxi,df_cipro],join='outer')
# df[['Bacteria Name'],[Bacterioides]]

## Make a DataFrame of Kmers

In [10]:
#k-mer DF for cipro genes
df_1 = makeKmers(df_cipro,'gyrA','C')
df_2 = makeKmers(df_cipro,'parC','C')
df_words = df_1.merge(df_2,on='Bacteria Name')

In [11]:
#k-mer DF for cipro genes
df_1 = makeKmers(df_moxi,'gyrA','M')
df_2 = makeKmers(df_moxi,'parC','M')
df_words_m = df_1.merge(df_2,on='Bacteria Name')

In [12]:
df_words_m.to_csv('/content/drive/Shareddrives/Project2_Drive/PROJECT 2/df_m.csv')

In [13]:
gyrA_C_word = list(df_words['gyrA-C'])
for item in range(len(gyrA_C_word)):
    gyrA_C_word[item] = ' '.join(gyrA_C_word[item])
C_spec = df_words.index.values                         #C_ for C species
parC_C_word = list(df_words['parC-C'])
for item in range(len(parC_C_word)):
    parC_C_word[item] = ' '.join(parC_C_word[item])

In [14]:
gyrA_M_word = list(df_words_m['gyrA-M'])
for item in range(len(gyrA_M_word)):
    gyrA_M_word[item] = ' '.join(gyrA_M_word[item])
M_spec = df_words_m.index.values                         #M_ for moxi species
parC_M_word = list(df_words_m['parC-M'])
for item in range(len(parC_M_word)):
    parC_M_word[item] = ' '.join(parC_M_word[item])

In [15]:
spec=np.append(C_spec,M_spec)
gyrA_words=gyrA_C_word+gyrA_M_word
parC_words=parC_C_word+parC_M_word
cipro_words = gyrA_C_word+parC_C_word

In [16]:
df_kmer = pd.concat([df_words,df_words_m],join='outer')

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4)) #The n-gram size of 4 is previously determined by testing
X_gyrA = cv.fit_transform(gyrA_words)
X_parC = cv.transform(parC_words)

## Make a DataFrame of MIC Values

In [18]:
weighted_MIC = pd.read_csv("/content/drive/Shareddrives/Project2_Drive/PROJECT 2/MIC_resistance_all.csv")
weighted_MIC = weighted_MIC.set_index("Bacteria Name")
weighted_MIC.dropna(axis=0,how="all",inplace=True)
weighted_MIC=weighted_MIC.drop('Staphylococcus hominis',axis=0)

In [19]:
MIC_df_cipro = weighted_MIC[["Ciprofloxacin Weighted MIC"]]
MIC_df_cipro = MIC_df_cipro.dropna(how='any')
MIC_df_cipro = MIC_df_cipro.rename(columns={"Ciprofloxacin Weighted MIC":"cipro MIC"})
y_cipro = MIC_df_cipro.iloc[:, 0].values # y_cipro for Cipro MIC

In [20]:
MIC_df_moxi = weighted_MIC[["Moxifloxacin Weighted MIC"]]
MIC_df_moxi = MIC_df_moxi.dropna(how='any')
MIC_df_moxi = MIC_df_moxi.rename(columns={"Moxifloxacin Weighted MIC":"moxi MIC"})
MIC_df_moxi
y_moxi = MIC_df_moxi.iloc[:, 0].values # y_moxi for moxi MIC
y_moxi

array([3.68396226e+00, 2.18169348e+00, 3.05313351e+00, 5.60311189e+00,
       2.36377850e+00, 4.28879310e-01, 1.41813830e+00, 1.75797577e+01,
       8.09083187e+00, 6.01519416e+00, 3.22200040e-02, 1.49605634e-01,
       2.30359903e+00, 1.20064915e+00, 1.51173925e+00, 5.64555838e-01,
       7.14902520e-02, 2.01465860e+00, 3.10024390e+00, 1.53618109e+00,
       6.19968672e-01, 1.44829224e+01, 8.52192177e-01, 1.33350206e+01,
       2.81250000e-01, 2.97509487e+02, 3.12475170e+02, 8.81431325e+01,
       1.06800000e-01, 2.96607774e-01, 4.60535685e+00, 2.17774814e+01,
       1.86748120e-01])

In [21]:
y_MIC=np.append(y_cipro,y_moxi)

In [22]:
MIC_df = pd.concat([MIC_df_cipro,MIC_df_moxi],join='outer')
MIC_df

Unnamed: 0_level_0,cipro MIC,moxi MIC
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1
Acinetobacter lwoffii,4.356195,
Aerococcus urinae,1.133333,
Alcaligenes faecalis,3.826087,
Citrobacter braakii,5.358667,
Citrobacter freundii,1.689,
Clostridium perfringens,0.578696,
Enterobacter cloacae,2.289254,
Enterococcus faecalis,7.509937,
Haemophilus influenzae,0.033003,
Haemophilus parainfluenzae,0.014325,


## Make a DataFrame of Percent Identity Values

In [23]:
PIM_df_cipro = pd.read_csv("/content/drive/Shareddrives/Project2_Drive/PROJECT 2/ciprofloaxin_PIM_singleval.csv")
PIM_df_cipro = PIM_df_cipro.set_index("Bacteria Name")
PIM_df_cipro = PIM_df_cipro.sort_index()
PIM_df_cipro = PIM_df_cipro.rename(columns={"gyrA":"c-gyrA", "parC": "c-parC"})

In [24]:
PIM_df_moxi = pd.read_csv("/content/drive/Shareddrives/Project2_Drive/PROJECT 2/moxi_PIM.csv")
PIM_df_moxi = PIM_df_moxi.set_index("Bacteria Name")
PIM_df_moxi = PIM_df_moxi.sort_index()
PIM_df_moxi = PIM_df_moxi.rename(columns={"gyrA PIM":"m-gyrA", "parC PIM": "m-parC"})

In [25]:
PIM_df=pd.concat([PIM_df_cipro,PIM_df_moxi],join='outer',sort=True)
PIM_df.groupby(level=0).sum()

Unnamed: 0_level_0,c-gyrA,c-parC,m-gyrA,m-parC
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Acinetobacter lwoffii,89.0,93.0,0.0,0.0
Aerococcus urinae,87.0,94.0,0.0,0.0
Alcaligenes faecalis,88.0,95.0,0.0,0.0
Bacteroides caccae,0.0,0.0,96.0,62.0
Bacteroides fragilis,0.0,0.0,96.0,62.0
Bacteroides thetaiotaomicron,0.0,0.0,96.0,62.0
Bacteroides vulgatus,0.0,0.0,97.0,76.0
Citrobacter braakii,89.0,96.0,0.0,0.0
Citrobacter freundii,89.0,96.0,96.0,82.0
Clostridium perfringens,83.0,91.0,97.0,83.0


## Combine all data into two DataFrames

In [26]:
cipro_df = pd.merge(MIC_df_cipro,PIM_df_cipro,how="inner",on=["Bacteria Name"])
cipro_df.drop_duplicates()
cipro_df2=pd.merge(cipro_df,df_words,how="inner",on=["Bacteria Name"])
cipro_df2.drop_duplicates(subset='cipro MIC')
cipro_df2 #ALL CIPRO DATA

Unnamed: 0_level_0,cipro MIC,c-gyrA,c-parC,gyrA-C,parC-C
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Acinetobacter lwoffii,4.356195,89,93,"[atgagc, tgagcg, gagcgt, agcgta, gcgtat, cgtat...","[atgaca, tgacaa, gacaag, acaagc, caagcc, aagcc..."
Aerococcus urinae,1.133333,87,94,"[atggtt, tggttg, ggttga, gttgaa, ttgaag, tgaag...","[atggcg, tggcga, ggcgat, gcgatt, cgattg, gattg..."
Alcaligenes faecalis,3.826087,88,95,"[atggat, tggatt, ggattc, gattcc, attcct, ttcct...","[atggac, tggaca, ggacag, gacagc, acagca, cagca..."
Citrobacter braakii,5.358667,89,96,"[atgagc, tgagcg, gagcga, agcgac, gcgacc, cgacc...","[atgagc, tgagcg, gagcga, agcgat, gcgata, cgata..."
Citrobacter freundii,1.689,89,96,"[atgagc, tgagcg, gagcga, agcgac, gcgacc, cgacc...","[atgagc, tgagcg, gagcga, agcgat, gcgata, cgata..."
Clostridium perfringens,0.578696,83,91,"[atggct, tggcta, ggctaa, gctaag, ctaaga, taaga...","[atgagc, tgagct, gagctt, agctta, gcttaa, cttaa..."
Enterobacter cloacae,2.289254,89,94,"[atgagc, tgagcg, gagcga, agcgac, gcgacc, cgacc...","[atgagc, tgagcg, gagcga, agcgat, gcgata, cgata..."
Enterococcus faecalis,7.509937,87,95,"[atgagt, tgagtg, gagtga, agtgaa, gtgaag, tgaag...","[ttggaa, tggaaa, ggaaaa, gaaaaa, aaaaac, aaaac..."
Haemophilus influenzae,0.033003,80,89,"[atgaca, tgacaa, gacaaa, acaaat, caaata, aaata...","[atgacg, tgacgg, gacgga, acggat, cggatt, ggatt..."
Haemophilus parainfluenzae,0.014325,88,94,"[atgacg, tgacgg, gacgga, acggat, cggatt, ggatt...","[atgagc, tgagca, gagcaa, agcaat, gcaata, caata..."


In [27]:
moxi_df = pd.merge(MIC_df_moxi,PIM_df_moxi,how="inner",on=["Bacteria Name"])
moxi_df.groupby(level=0)
moxi_df=moxi_df.drop_duplicates()

In [28]:
moxi_df2 = pd.merge(moxi_df,df_words_m,how="inner",on=["Bacteria Name"])
moxi_df2.drop_duplicates(subset='moxi MIC') #ALL MOXI DATA

Unnamed: 0_level_0,moxi MIC,m-gyrA,m-parC,gyrA-M,parC-M
Bacteria Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Bacteroides caccae,3.683962,96,62,"[atgctt, tgcttg, gcttga, cttgaa, ttgaac, tgaac...","[atgagt, tgagtg, gagtga, agtgac, gtgacg, tgacg..."
Bacteroides fragilis,2.181693,96,62,"[atgctt, tgcttg, gcttga, cttgaa, ttgaac, tgaac...","[atgagc, tgagcg, gagcga, agcgaa, gcgaag, cgaag..."
Bacteroides thetaiotaomicron,3.053134,96,62,"[atgctt, tgcttg, gcttga, cttgaa, ttgaac, tgaac...","[atgagt, tgagtg, gagtga, agtgac, gtgacg, tgacg..."
Bacteroides vulgatus,5.603112,97,76,"[atgatt, tgattg, gattga, attgaa, ttgaac, tgaac...","[atgagc, tgagcg, gagcga, agcgac, gcgacg, cgacg..."
Citrobacter freundii,2.363779,96,82,"[atgagc, tgagcg, gagcga, agcgac, gcgacc, cgacc...","[atgagc, tgagcg, gagcga, agcgat, gcgata, cgata..."
Clostridium perfringens,0.428879,97,83,"[atgagc, tgagct, gagctt, agctta, gcttaa, cttaa...","[atggct, tggcta, ggctaa, gctaag, ctaaga, taaga..."
Enterococcus faecalis,17.579758,98,86,"[atgagt, tgagtg, gagtga, agtgaa, gtgaag, tgaag...","[ttggaa, tggaaa, ggaaaa, gaaaaa, aaaaac, aaaac..."
Enterococcus faecium,8.090832,98,86,"[atgagt, tgagtg, gagtga, agtgaa, gtgaag, tgaag...","[atggaa, tggaaa, ggaaaa, gaaaat, aaaatc, aaatc..."
Escherichia coli,6.015194,96,82,"[atgagc, tgagcg, gagcga, agcgac, gcgacc, cgacc...","[atgagc, tgagcg, gagcga, agcgat, gcgata, cgata..."
Haemophilus influenzae,0.03222,96,82,"[atgacg, tgacgg, gacgga, acggat, cggatt, ggatt...","[atgaca, tgacaa, gacaaa, acaaat, caaata, aaata..."


## Data Separation

In [29]:
MIC_val_cipro = np.array(cipro_df['cipro MIC'])
columns_list = list(cipro_df.columns)
species_list = list(cipro_df.index)
cipro_df1 = cipro_df.drop("cipro MIC", axis=1)

In [30]:
cipro_df1 = np.array(cipro_df1)

In [31]:
MIC_val_moxi = np.array(moxi_df['moxi MIC'])
columns_list1 = list(moxi_df.columns)
species_list1 = list(moxi_df.index)
moxi_df1 = moxi_df.drop("moxi MIC", axis=1)

In [32]:
moxi_df1 = np.array(moxi_df1)

# Random Forest Regression using cipro MIC

In [33]:
train_features, test_features, train_labels, test_labels = sklearn.model_selection.train_test_split(cipro_df, MIC_val_cipro, test_size = 0.4,random_state=38,shuffle=False)

In [34]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (16, 3)
Training Labels Shape: (16,)
Testing Features Shape: (11, 3)
Testing Labels Shape: (11,)


In [35]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 18,warm_start=True)

# Train the model on training data
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=18, verbose=0, warm_start=True)

In [36]:
predictions = rf.predict(test_features)

# Calculate the absolute errors
errors = abs(predictions - test_labels)
print(predictions)
print(test_labels)

# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))

[5.17279808 1.73286418 6.81096128 3.44742671 4.0973224  6.73872015
 0.62853459 6.78013271 1.76458131 6.84113125 5.46721843]
[ 4.98138461  1.85117178  7.32254717  3.17259649  4.12379357 32.86144977
  0.79964474 27.47911776  2.17220482  9.89460455  5.11393   ]
Mean Absolute Error: 4.72


In [37]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)

# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 76.43 %.


# Random Forest Regression using Moxi k-mer

In [38]:
train_featuresm, test_featuresm, train_labelsm, test_labelsm = sklearn.model_selection.train_test_split(moxi_df, MIC_val_moxi, test_size = 0.4,random_state=38,shuffle=False)

In [39]:
print('Training Features Shape:', train_featuresm.shape)
print('Training Labels Shape:', train_labelsm.shape)
print('Testing Features Shape:', test_featuresm.shape)
print('Testing Labels Shape:', test_labelsm.shape)
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 2000, random_state = 58,warm_start=True)
# Train the model on training data
rf.fit(train_featuresm, train_labelsm)
predictionsm = rf.predict(test_featuresm)
# Calculate the absolute errors
errorsm = abs(predictionsm - test_labelsm)
print(predictionsm)
print(test_labelsm)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errorsm), 2))
# Calculate mean absolute percentage error (MAPE)
mapem = 100 * (errorsm / test_labelsm)
# Calculate and display accuracy
accuracym = 100 - np.mean(mapem)
print('Accuracy:', round(accuracym, 2), '%.')

Training Features Shape: (19, 3)
Training Labels Shape: (19,)
Testing Features Shape: (13, 3)
Testing Labels Shape: (13,)
[ 0.37283994 12.02669702  2.80493733 13.96822453  0.29964925 13.96822453
 13.96822453 13.96822453  2.46291117  2.53998392  6.27414088 12.460764
  2.47587832]
[6.19968672e-01 1.44829224e+01 8.52192177e-01 1.33350206e+01
 2.81250000e-01 2.97509487e+02 3.12475170e+02 8.81431325e+01
 1.06800000e-01 2.96607774e-01 4.60535685e+00 2.17774814e+01
 1.86748120e-01]
Mean Absolute Error: 52.26
Accuracy: -272.27 %.


# Support Vector Regression

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import r2_score,mean_squared_error, mean_absolute_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(ngram_range=(4,4)) #The n-gram size of 4 is previously determined by testing
X_gyrAC = cv.fit_transform(gyrA_C_word)
X_parCC = cv.transform(parC_C_word)
X_gyrAM = cv.fit_transform(gyrA_M_word)
X_parCM = cv.transform(parC_M_word)
X_cipro = cv.fit_transform(cipro_words)
print(X_gyrAC.shape)
print(X_parCC.shape)
print(X_gyrAM.shape)
print(X_parCM.shape)
print(X_cipro.shape)

(27, 42740)
(27, 42740)
(33, 48975)
(33, 48975)
(54, 68857)


In [42]:
X_train, X_test, Y_train, Y_test = sklearn.model_selection.train_test_split(X_gyrAC, y_cipro, test_size = 0.4,random_state=38,shuffle=False)

In [50]:
SupportVectorRegModel = SVR()
SupportVectorRegModel.fit(X_train,Y_train)
Y_pred = SupportVectorRegModel.predict(X_test)
print(Y_pred)
print(Y_test)
mae = mean_absolute_error(Y_test,Y_pred,multioutput='raw_values')
er=abs(Y_pred-Y_test)
print(mae)
perr = 100 * (er/Y_test)
print('Accuracy: ', 100- np.mean(perr), '%') #CIPRO PERCENT ACCURACY

[2.08576253 2.11395013 2.12801082 2.13672888 2.00762582 2.01230012
 2.00683144 2.01352654 2.01385161 2.01380248 2.02696643]
[ 4.98138461  1.85117178  7.32254717  3.17259649  4.12379357 32.86144977
  0.79964474 27.47911776  2.17220482  9.89460455  5.11393   ]
[7.28663805]
Accuracy:  35.26862549965166 %


In [44]:
y_cipro2 = np.concatenate((y_cipro,y_cipro))
len(y_cipro2)

54

In [45]:
X_train2, X_test2, Y_train2, Y_test2 = sklearn.model_selection.train_test_split(X_cipro, y_cipro2, test_size = 0.4,random_state=38,shuffle=False)

In [46]:
SupportVectorRegModel = SVR(kernel='rbf')
SupportVectorRegModel.fit(X_train2,Y_train2)
Y_pred2 = SupportVectorRegModel.predict(X_test2)
mae2 = mean_absolute_error(Y_test2,Y_pred2,multioutput='raw_values')
er2=abs(Y_pred2-Y_test2)
perr2 = 100 * (er2/Y_test2)
print(100- np.mean(perr2))

-4291.011721159459


In [47]:
X_gyrAC.shape
y_cipro.shape

(27,)

# Model Comparison - Random Forest vs. SVR

This plot is interactive! You can select the models you want to compare using the key on the right.

In [48]:
# Imports
import plotly.graph_objects as go

# Grab the data
bacteria_names = ['Species 1', 'Species 2', 'Species 3', 'Species 4', 'Species 5', 'Species 6', 'Species 7', 'Species 8', 'Species 9', 'Species 10', 'Species 11']
model1_predictions = predictions
model2_predictions = Y_pred
actual_values = test_labels

# Plot the chart
fig = go.Figure(
  data=[
    go.Bar(name='Random Forest Predicted MIC', x=bacteria_names, y=model1_predictions),
    go.Bar(name='SVR Predicted MIC', x=bacteria_names, y=model2_predictions),
    # More models could go here
    go.Bar(name='Actual MIC', x=bacteria_names, y=actual_values)
  ],
  layout={
    'yaxis': {'title': 'Predicted MIC Value'},
    'xaxis': {'title': 'Test Species'},
    'title': 'Comparing Predicted MIC Values'
  }
)
fig.update_layout(barmode='group', xaxis_tickangle=-45)
fig.show()