In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import minmax_scale

from scipy.stats import norm

# Download the data
https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip


In [None]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip

In [None]:
# !unzip codon_usage.csv.zip

In [None]:
def read_data(file_name):
    """
    Load the CSV file at `file_name` and return its contents as a DataFrame.

    `low_memory=False` is passed to `pd.read_csv`, which turns off pandas'
    chunked low-memory parsing for this read.
    """
    frame = pd.read_csv(file_name, low_memory=False)
    return frame

# Read the table once; `df` and `dataset` are two independent frames with the
# same content, so later edits to `dataset` leave `df` untouched.
df = read_data('codon_usage.csv')
dataset = df.copy()

In [None]:
# Summary statistics of the loaded table.
dataset.describe()

In [None]:
# Show the loaded table as the cell output.
dataset

In [None]:
# Number of rows per kingdom code (rows are not dropped here; the dropna call
# below is intentionally left disabled).
# dataset.dropna(inplace=True)

dataset['Kingdom'].value_counts()

In [None]:
# Count of missing values in each column.
print(dataset.isna().sum())

In [None]:
# Bar chart of how many rows carry each Kingdom value.
dataset['Kingdom'].value_counts().plot.bar()

In [None]:
# Distinct Kingdom values present in the table.
dataset['Kingdom'].unique()

In [None]:
# dataset['organism_kingdom'] = dataset['Kingdom'].copy()

In [None]:
# Binary class label for each kingdom code.
kingdom_to_label = {
    'arc': 0, 'bct': 0, 'plm': 0, 'vrl': 0,
    'phg': 1, 'pln': 1, 'inv': 1, 'vrt': 1, 'mam': 1, 'rod': 1, 'pri': 1,
}

# Map from the untouched copy `df` instead of from `dataset` itself.
# Series.map turns every value that is not a key of the dict into NaN, so
# mapping the already-mapped 0/1 column on a second run of this cell would wipe
# all labels. The assignment aligns on the index, so it also works after rows
# have been dropped from `dataset`.
dataset['Kingdom'] = df['Kingdom'].map(kingdom_to_label)

In [None]:
# Bar chart of how many rows carry each Kingdom value.
dataset['Kingdom'].value_counts().plot.bar()

In [None]:
# Remove the records for SpeciesID 353569 and 1238.
# NOTE(review): presumably these are the rows whose UUU/UUC entries cannot be
# cast to float in the next cell; confirm against the raw CSV.
dataset = dataset.drop(
    index=dataset.index[dataset['SpeciesID'].isin([353569, 1238])]
)

In [None]:
# Cast UUU and UUC to float so that they count as numeric columns below.
dataset = dataset.astype({'UUU': float, 'UUC': float})

# Start from every numeric column ...
cols = dataset.select_dtypes(np.number).columns
num_columns_list = list(cols)

# ... and take out the four columns that must not be rescaled.
for excluded in ('Kingdom', 'DNAtype', 'SpeciesID', 'Ncodons'):
    num_columns_list.remove(excluded)

In [None]:
# Min-max scale each column named in num_columns_list on its own; all other
# columns are carried over unchanged.
dataset = dataset.assign(
    **{column: minmax_scale(dataset[column]) for column in num_columns_list}
)

In [None]:
# Show the table after the preprocessing cells above.
dataset

In [None]:
# Select features and label by name instead of by position. The positional
# slice `iloc[:, 5:-1]` drops whatever column happens to be last; with no helper
# column appended to the frame, that is one of the scaled feature columns.
# num_columns_list holds exactly the columns that were min-max scaled.
X = dataset[num_columns_list].values.astype(float)
y = dataset['Kingdom']


In [None]:
# Show the label vector.
y

In [None]:
# Show the feature matrix.
X

In [None]:
class SingleVsMultiModel:
  """Two-class naive Bayes classifier for the labels 0 and 1.

  Continuous mode (``discrete=False``) fits one normal distribution per feature
  and class. Discrete mode (``discrete=True``) tabulates, per feature and class,
  the fraction of training rows whose value equals 0 or 1, so it only supports
  binary 0/1 features.

  Fitted state is kept in ``self.features``. Keys are ``'X<i><c>'`` in
  continuous mode and ``'X<i><v><c>'`` in discrete mode, where ``i`` is the
  feature index, ``v`` the feature value (0 or 1) and ``c`` the class label.

  NOTE: per-feature likelihoods are multiplied directly, so with many features
  the product can underflow to 0.0 for both classes.
  """

  # Floor for a fitted standard deviation: norm(mean, 0) yields NaN densities,
  # which would make every comparison in predict() meaningless.
  _MIN_STD = 1e-9

  def __init__(self, X, y, laplace_smoothing_value, discrete, laplace, total_classes=2):
    """Store the data and the configuration; nothing is fitted here.

    X: feature matrix, indexed as ``X[rows, column]`` (a 2-D numpy array).
    y: labels, one per row of X, with values 0 and 1.
    laplace_smoothing_value: additive constant used by laplaceSmoothing().
    discrete: True for binary 0/1 features, False for continuous features.
    laplace: True to score with laplaceProbability(), False for probability().
    total_classes: stored, but not read by any method of this class.
    """
    self.X = X
    self.y = y
    self.laplace_smoothing_value = laplace_smoothing_value
    self.discrete = discrete
    self.laplace = laplace
    self.total_classes = total_classes

  def splitData(self):
    """Return (X_train, X_test, y_train, y_test) from a fixed-seed 70/30 split."""
    X_train, X_test, y_train, y_test = train_test_split(self.X, self.y, test_size=0.3, random_state=1)
    return X_train, X_test, y_train, y_test

  def fitDistribution(self, data):
    """Return a frozen normal distribution with the mean and std of `data`.

    A standard deviation that is not strictly positive (constant feature) is
    replaced by a tiny positive floor so that pdf() stays finite.
    """
    mean = np.mean(data)
    std = np.std(data)
    if not std > 0:
      std = self._MIN_STD
    return norm(mean, std)

  def laplaceSmoothing(self, class0, class1):
    """Apply the additive adjustment to one likelihood value per class.

    Each value v becomes (v + a) / (size(v) + a * n_features), with
    a = laplace_smoothing_value. Requires fit() to have set self.n_features.
    np.size is used so that plain Python floats are accepted as well.
    """
    alpha = self.laplace_smoothing_value
    smooth0 = (class0 + alpha) / (np.size(class0) + alpha * self.n_features)
    smooth1 = (class1 + alpha) / (np.size(class1) + alpha * self.n_features)
    return smooth0, smooth1

  def evaluate(self, y, y_predicted):
    '''
      Takes original classes and predicted classes as input

      Return the values of precision, recall and accuracy, with class 1 as the
      positive class. A ratio whose denominator is zero is reported as 0.0
      instead of dividing by zero.
    '''
    actual_pos = (np.asarray(y) == 1)
    predicted_pos = (np.asarray(y_predicted) == 1)

    true_pos = (actual_pos & predicted_pos).sum()
    n_predicted_pos = predicted_pos.sum()
    n_actual_pos = actual_pos.sum()

    precision = true_pos / n_predicted_pos if n_predicted_pos else 0.0
    recall = true_pos / n_actual_pos if n_actual_pos else 0.0
    accuracy = (actual_pos == predicted_pos).sum() / actual_pos.size if actual_pos.size else 0.0

    return precision, recall, accuracy

  def _featureLikelihoods(self, data, i):
    """Return the class-0 and class-1 likelihood of feature `i` of sample `data`."""
    if self.discrete:
      # Look up the entry for the value this sample actually has; the table
      # built by fit() only covers the values 0 and 1.
      value = str(int(data[i]))
      return (self.features['X' + str(i) + value + '0'],
              self.features['X' + str(i) + value + '1'])

    return (self.features['X' + str(i) + '0'].pdf(data[i]),
            self.features['X' + str(i) + '1'].pdf(data[i]))

  def probability(self, data):
    """Return the unnormalised scores (class 0, class 1) for one sample.

    Each score is the class prior times the product of the per-feature
    likelihoods of `data`.
    """
    py0 = 1
    py1 = 1
    for i in range(self.n_features):
      like0, like1 = self._featureLikelihoods(data, i)
      py0 *= like0
      py1 *= like1

    return py0 * self.prior_0, py1 * self.prior_1

  def laplaceProbability(self, data):
    """Same as probability(), with each likelihood passed through laplaceSmoothing()."""
    py0 = 1
    py1 = 1
    for i in range(self.n_features):
      like0, like1 = self._featureLikelihoods(data, i)
      smooth0, smooth1 = self.laplaceSmoothing(like0, like1)
      py0 *= smooth0
      py1 *= smooth1

    return py0 * self.prior_0, py1 * self.prior_1

  def fit(self):
    """Split the data and estimate class priors and per-feature likelihoods.

    Sets X_train, X_test, y_train, y_test, prior_0, prior_1, n_features and
    features on the instance.
    """
    self.X_train, self.X_test, self.y_train, self.y_test = self.splitData()

    # Plain boolean arrays, so that row selection does not depend on the label
    # container (pandas Series or numpy array).
    labels = np.asarray(self.y_train)
    X0_train = self.X_train[labels == 0]
    X1_train = self.X_train[labels == 1]

    self.prior_1 = len(X1_train) / len(self.X_train)
    self.prior_0 = len(X0_train) / len(self.X_train)

    self.n_features = self.X_train.shape[1]
    self.features = {}

    if self.discrete:
      print('Discrete Data')

      for i in range(self.n_features):
        for value in (0, 1):
          # Fraction of the class's rows whose feature i equals `value`.
          self.features['X' + str(i) + str(value) + '0'] = np.mean(X0_train[:, i] == value)
          self.features['X' + str(i) + str(value) + '1'] = np.mean(X1_train[:, i] == value)

    else:
      print('Continuous Data')

      for i in range(self.n_features):
        self.features['X' + str(i) + '0'] = self.fitDistribution(X0_train[:, i])
        self.features['X' + str(i) + '1'] = self.fitDistribution(X1_train[:, i])

  def predict(self):
    """Classify every test sample, print the metrics, return (y_predicted, y).

    Prints the number of misclassified samples followed by precision, recall
    and accuracy. Both returned values are numpy arrays; `y` holds the true
    test labels in the same order as `y_predicted`.
    """
    score = self.laplaceProbability if self.laplace else self.probability

    y_predicted = np.array([np.argmax(score(sample)) for sample in self.X_test], dtype=int)
    y = np.array(list(self.y_test))

    # Number of misclassified test samples.
    count = int((y_predicted != y).sum())
    print(count)

    precision, recall, accuracy = self.evaluate(y, y_predicted)
    print("precision:", precision)
    print("recall:", recall)
    print("accuracy:", accuracy)

    return y_predicted, y


In [None]:
# Continuous-feature model scored through the smoothed likelihood path.
clf = SingleVsMultiModel(
    X,
    y,
    laplace_smoothing_value=0.5,
    discrete=False,
    laplace=True,
)

In [None]:
# Split the data and estimate priors and per-feature likelihoods.
clf.fit()

In [None]:
y_predicted, y = clf.predict()

In [None]:
print(y_predicted)
print(y)