In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import minmax_scale

from scipy.stats import norm

# Download the data
https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip


In [None]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip

In [None]:
# !unzip codon_usage.csv.zip

In [None]:
def read_data(file_name):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table. ``low_memory=False`` makes pandas read the file in
        one pass instead of in chunks.
    """
    frame = pd.read_csv(file_name, low_memory=False)
    return frame

# Load the codon-usage table from the current working directory.
dataset = read_data('codon_usage.csv')
# Independent copy taken before any of the later cells modify `dataset`.
df = dataset.copy()

In [None]:
# Summary statistics of the loaded table (rich cell output).
dataset.describe()

In [None]:
# Show the DataFrame itself as the cell output.
dataset

In [None]:
# Number of rows per Kingdom code (rows are not dropped for missing values here).
dataset['Kingdom'].value_counts()

In [None]:
# Count of missing values in each column.
print(dataset.isnull().sum())

In [None]:
# Bar chart of the number of rows per Kingdom value.
dataset['Kingdom'].value_counts().plot(kind='bar')

In [None]:
# Distinct Kingdom values present in the table.
dataset['Kingdom'].unique()

In [None]:
# dataset['organism_kingdom'] = dataset['Kingdom'].copy()

In [None]:
# Collapse the Kingdom codes into a binary label (0 / 1).
# NOTE(review): 'phg' is mapped to 1 while 'vrl' is mapped to 0 -- confirm this
# grouping is intended.
kingdom_to_label = {
    'arc': 0, 'bct': 0, 'plm': 0, 'vrl': 0,
    'phg': 1, 'pln': 1, 'inv': 1, 'vrt': 1,
    'mam': 1, 'rod': 1, 'pri': 1,
}
# Map from the untouched copy `df` rather than from `dataset` itself, so that
# re-running this cell does not turn the already-encoded 0/1 values into NaN.
# Column assignment aligns on the index, so rows already removed from
# `dataset` are ignored.
dataset['Kingdom'] = df['Kingdom'].map(kingdom_to_label)

In [None]:
# Bar chart of the number of rows per Kingdom value after the binary mapping.
dataset['Kingdom'].value_counts().plot(kind='bar')

In [None]:
# Remove the rows belonging to two specific SpeciesID values.
# NOTE(review): presumably these rows hold malformed values (UUU/UUC are cast
# to float in the next cell) -- confirm against the raw file.
rows_to_drop = dataset.index[dataset['SpeciesID'].isin([353569, 1238])]
dataset = dataset.drop(index=rows_to_drop)

# dataset[dataset['SpeciesID']==353569]

In [None]:
# Cast the two codon columns that need it to float.
dataset = dataset.astype({'UUU': float, 'UUC': float})

# Every numeric column currently in the frame.
cols = dataset.select_dtypes(np.number).columns

# Columns to be min-max scaled: all numeric columns except the label and the
# identifier / count columns.
num_columns_list = list(cols)
for excluded_column in ('Kingdom', 'DNAtype', 'SpeciesID', 'Ncodons'):
    num_columns_list.remove(excluded_column)

In [None]:
# Rescale every selected column to [0, 1]; minmax_scale works column-wise on
# 2-D input, so one call replaces the per-column loop.
dataset[num_columns_list] = minmax_scale(dataset[num_columns_list])


In [None]:
# Preview the first rows after scaling.
dataset.head()

In [None]:
# Feature matrix: columns from position 5 up to, but not including, the last.
# NOTE(review): `-1` also leaves out the final column; if that column is a
# feature (the 'organism_kingdom' helper column is commented out above), this
# should probably be `5:` -- confirm against the column layout.
feature_frame = dataset.iloc[:, 5:-1]
X = feature_frame.to_numpy().astype(float)

# Target: the first column (holds the binary Kingdom label after the mapping cell).
y = dataset.iloc[:, 0]


In [None]:
# Show the target column.
y

In [None]:
class DataGenerator:
    """Holds a feature matrix and its targets and splits them into
    train / test / validation parts.

    Parameters
    ----------
    X : array-like
        Feature rows; must support ``len``.
    y : array-like
        Targets, one per row of ``X``.
    sampleSize : int, default 16
        Intended batch size. It is validated and stored, but no method of this
        class currently reads it.

    Raises
    ------
    ValueError
        If ``sampleSize`` is not in the range ``1 .. len(X)``.
    """

    def __init__(self, X, y, sampleSize=16):
        self.X = X
        self.y = y
        self.sampleSize = sampleSize
        # The bounds must be combined with `or` (a value cannot be both <= 0 and
        # > len(X)), and the error has to be raised, not returned, from __init__.
        if self.sampleSize <= 0 or self.sampleSize > len(self.X):
            raise ValueError(
                "sampleSize must be between 1 and {}, got {}".format(len(self.X), self.sampleSize)
            )

    def split_data(self, test_size=0.3, random_state=1):
        """Split the stored data into training, test and validation parts.

        ``test_size`` is the fraction held out from training. That held-out
        remainder is then split in half: the first half becomes the test set,
        the second the validation set. With the default 0.3 this gives roughly
        a 70 / 15 / 15 split. Both splits use the same ``random_state``.

        All parts are stored on the instance (``X_train``, ``X_rem``,
        ``X_test``, ``X_val`` and the matching ``y_*`` attributes).

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``; the validation part is only
            available through ``self.X_val`` / ``self.y_val``.
        """
        self.X_train, self.X_rem, self.y_train, self.y_rem = tts(
            self.X, self.y, test_size=test_size, random_state=random_state
        )
        # Second split: halve the remainder into test and validation parts.
        self.X_test, self.X_val, self.y_test, self.y_val = tts(
            self.X_rem, self.y_rem, test_size=0.5, random_state=random_state
        )
        return self.X_train, self.X_test, self.y_train, self.y_test
    

In [None]:
# Wrap the feature matrix and target; sampleSize is left at its default of 16.
data_generator = DataGenerator(y=y, X=X)

In [None]:
# Split with the defaults (test_size=0.3, random_state=1). The validation part
# is not returned; it stays on data_generator.X_val / data_generator.y_val.
X_training, X_testing, y_training, y_testing = data_generator.split_data()

In [None]:
class SingleVsMultiModel:
  """Two-class Gaussian naive Bayes classifier over continuous features.

  ``fit`` models every feature, separately for class 0 and class 1, with a
  normal distribution estimated from the training rows of that class.
  ``predict`` scores each test row as ``prior * product of per-feature
  densities`` and picks the class with the larger score.

  Parameters
  ----------
  X_train, X_test : array-like, 2-D
    Rows are samples, columns are features.
  y_train, y_test : array-like
    Labels (0 or 1) aligned with the rows of the matrices above.
  laplace_smoothing_value : float
    Additive constant used by ``laplaceSmoothing``.
  laplace : bool
    When truthy, ``predict`` scores with ``laplaceProbability``; otherwise
    with ``probability``.
  total_classes : int, default 2
    Stored only; every method here is hard-wired to the labels 0 and 1.
  """

  # Substituted for a standard deviation of exactly zero (a feature that is
  # constant within a class). scipy's normal distribution needs a positive
  # scale; with scale 0 every density is NaN and the argmax is meaningless.
  _MIN_STD = 1e-9

  def __init__(self, X_train, X_test, y_train, y_test, laplace_smoothing_value, laplace, total_classes=2):
    self.laplace_smoothing_value = laplace_smoothing_value
    self.laplace = laplace
    self.total_classes = total_classes
    self.X_train = X_train
    self.X_test = X_test
    self.y_train = y_train
    self.y_test = y_test


  def fitDistribution(self, data):
    """Return a frozen normal distribution fitted to the 1-D sample ``data``.

    The mean and (population) standard deviation of ``data`` are used; a
    standard deviation of exactly zero is replaced by ``_MIN_STD``.
    """
    mean = np.mean(data)
    std = np.std(data)
    if std == 0:
      std = self._MIN_STD
    return norm(mean, std)


  def laplaceSmoothing(self, class0, class1):
    """Apply additive smoothing to the two per-class density values.

    Each value ``v`` becomes
    ``(v + alpha) / (size(v) + alpha * n_features)`` with
    ``alpha = laplace_smoothing_value``. Requires ``fit`` to have been called
    (it sets ``n_features``).

    NOTE(review): the values passed in by ``laplaceProbability`` are
    probability densities, not counts -- confirm this formula is the intended
    smoothing.

    Returns
    -------
    tuple
      ``(smoothed_class0, smoothed_class1)``.
    """
    alpha = self.laplace_smoothing_value
    # np.size equals the original `.size` attribute for numpy values and also
    # accepts plain Python floats.
    smooth0 = (class0 + alpha) / (np.size(class0) + alpha * self.n_features)
    smooth1 = (class1 + alpha) / (np.size(class1) + alpha * self.n_features)

    return smooth0, smooth1


  def evaluate(self, y, y_predicted):
    '''
    Compute precision, recall and accuracy, treating label 1 as positive.

    Parameters
    ----------
    y : array-like of true labels
    y_predicted : array-like of predicted labels, same length as ``y``

    Returns
    -------
    tuple
      ``(precision, recall, accuracy)``. A metric whose denominator is zero
      (no predicted positives, no actual positives, or no samples at all) is
      reported as 0.0 instead of NaN.
    '''
    actual = (np.asarray(y) == 1)
    predicted = (np.asarray(y_predicted) == 1)

    true_positives = (actual & predicted).sum()
    predicted_positives = predicted.sum()
    actual_positives = actual.sum()

    precision = true_positives / predicted_positives if predicted_positives else 0.0
    recall = true_positives / actual_positives if actual_positives else 0.0
    accuracy = (actual == predicted).sum() / actual.size if actual.size else 0.0

    return precision, recall, accuracy


  def probability(self, data):
    """Unnormalised class scores for one sample.

    ``data`` is indexed by feature position. Each score is the class prior
    times the product of the fitted per-feature densities at ``data[i]``.

    NOTE(review): a product over many densities can underflow to 0.0;
    working in log space would be more robust but would change the returned
    values.

    Returns
    -------
    tuple
      ``(score_class0, score_class1)``.
    """
    py0 = 1
    py1 = 1

    for i in range(self.n_features):
      key = 'X' + str(i)
      py0 *= self.features[key + '0'].pdf(data[i])
      py1 *= self.features[key + '1'].pdf(data[i])

    return py0 * self.prior_0, py1 * self.prior_1


  def laplaceProbability(self, data):
    """Like ``probability``, but every per-feature density is passed through
    ``laplaceSmoothing`` before being multiplied in.

    Returns
    -------
    tuple
      ``(score_class0, score_class1)``.
    """
    py0 = 1
    py1 = 1

    for i in range(self.n_features):
      key = 'X' + str(i)
      pdf0 = self.features[key + '0'].pdf(data[i])
      pdf1 = self.features[key + '1'].pdf(data[i])
      smooth0, smooth1 = self.laplaceSmoothing(pdf0, pdf1)
      py0 *= smooth0
      py1 *= smooth1

    return py0 * self.prior_0, py1 * self.prior_1


  def fit(self):
    """Estimate class priors and one normal distribution per (feature, class).

    Sets ``prior_0``, ``prior_1``, ``n_features`` and ``features``; the
    latter is a dict keyed ``'X<i>0'`` / ``'X<i>1'`` for feature index ``i``
    and class 0 / 1.

    Raises
    ------
    ValueError
      If either class has no training rows (its distributions would be NaN).
    """
    # Plain arrays so that lists, DataFrames and Series (whatever their index)
    # all behave the same under boolean masking and column slicing.
    X = np.asarray(self.X_train)
    labels = np.asarray(self.y_train)

    X0_train = X[labels == 0]
    X1_train = X[labels == 1]

    if len(X0_train) == 0 or len(X1_train) == 0:
      raise ValueError("fit() needs at least one training row for each of the classes 0 and 1")

    self.prior_1 = len(X1_train) / len(X)
    self.prior_0 = len(X0_train) / len(X)

    self.n_features = X.shape[1]
    self.features = {}

    for i in range(self.n_features):
      self.features['X'+str(i)+'0'] = self.fitDistribution(X0_train[:, i])
      self.features['X'+str(i)+'1'] = self.fitDistribution(X1_train[:, i])


  def predict(self):
    """Classify every row of ``X_test`` and print evaluation metrics.

    Prints the number of misclassified rows, then precision, recall and
    accuracy as computed by ``evaluate``. ``fit`` must have been called first.

    Returns
    -------
    tuple of numpy.ndarray
      ``(y_predicted, y)``: the predicted labels and the true labels taken
      from ``y_test``, in test-row order.
    """
    score = self.laplaceProbability if self.laplace else self.probability

    count = 0
    y_predicted = []
    y = []
    for sample, target in zip(np.asarray(self.X_test), np.asarray(self.y_test)):
      # Index of the larger score is the predicted class; computed once per row.
      label = np.argmax(score(sample))
      y_predicted.append(label)
      y.append(target)
      if label != target:
        count += 1

    print(count)
    y_predicted = np.array(y_predicted)
    y = np.array(y)

    precision, recall, accuracy = self.evaluate(y, y_predicted)
    print("precision:", precision)
    print("recall:", recall)
    print("accuracy:", accuracy)

    return y_predicted, y


In [None]:
# Build the classifier on the train/test split, scoring with the smoothed
# probabilities (laplace=True, smoothing constant 0.5).
clf = SingleVsMultiModel(
    X_train=X_training,
    X_test=X_testing,
    y_train=y_training,
    y_test=y_testing,
    laplace_smoothing_value=0.5,
    laplace=True,
)

In [None]:
# Estimate the class priors and per-feature distributions from the training split.
clf.fit()

In [None]:
# Classify the test split; predict() also prints the misclassification count
# and precision / recall / accuracy. Note this rebinds `y` to the test labels.
y_predicted, y = clf.predict()

In [None]:
# Predicted labels followed by the true test labels.
print(y_predicted)
print(y)