In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as tts
from sklearn.preprocessing import minmax_scale

from scipy.stats import norm, gaussian_kde

from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN

# Download the data
https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip


In [None]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip

In [None]:
# !unzip codon_usage.csv.zip

# Data Preprocessing

In [None]:
def read_data(file_name):
    """
    Load the CSV file at ``file_name`` and hand back its contents as a
    pandas DataFrame.

    ``low_memory=False`` makes pandas parse the file in a single pass
    instead of in internal chunks.
    """
    frame = pd.read_csv(file_name, low_memory=False)
    return frame

# Load the codon-usage table from the working directory.
# `df` is an independent copy taken right after loading, before later cells modify `dataset`.
dataset = read_data('codon_usage.csv')
df = dataset.copy()

In [None]:
# dataset = read_data('codon_usage.csv')
# Descriptive statistics of the frame as produced by DataFrame.describe().
dataset.describe()

In [None]:
# Show the loaded frame as the cell output.
dataset

In [None]:
# dataset.dropna(inplace=True)

# Number of rows for each distinct value of the 'Kingdom' column.
dataset['Kingdom'].value_counts()

In [None]:
# Number of missing values in each column.
print(dataset.isnull().sum())

In [None]:
# Bar chart of the per-Kingdom row counts.
dataset['Kingdom'].value_counts().plot(kind='bar')

In [None]:
# Integer id assigned to each Kingdom abbreviation; the id is the label's
# position in this list (0 .. 10).
classes_dict = {
    label: code
    for code, label in enumerate(
        ['arc', 'bct', 'phg', 'plm', 'pln', 'inv', 'vrt', 'mam', 'rod', 'pri', 'vrl']
    )
}

In [None]:
# Encode the Kingdom abbreviations as the integer ids defined in `classes_dict`.
# Series.map turns every value that is missing from the mapping into NaN, so running
# this cell a second time would replace the already-encoded ids with NaN. Only encode
# while the column is still non-numeric, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(dataset['Kingdom']):
    dataset['Kingdom'] = dataset['Kingdom'].map(classes_dict)

In [None]:
# dataset['Kingdom'].value_counts().plot(kind='bar')

In [None]:
# Remove every row whose SpeciesID is one of the two excluded ids.
# NOTE(review): the reason for excluding these two species is not recorded here —
# presumably they carry malformed values; confirm.
dataset = dataset.drop(dataset[dataset['SpeciesID'].isin([353569, 1238])].index)


In [None]:
# Cast the 'UUU' and 'UUC' columns to float; all other columns keep their dtype.
dataset = dataset.astype({'UUU': float, 'UUC': float})
# dataset = minmax_scale(dataset)
# cols = dataset.select_dtypes(np.number).columns

# num_columns_list = list(cols)
# # num_columns_list

# num_columns_list.remove('Kingdom')
# # num_columns_list.remove('DNAtype')
# num_columns_list.remove('SpeciesID')
# # num_columns_list.remove('Ncodons')
# # num_columns_list

# Min-max scaling of the columns was removed: it is not necessary here, and it can produce 0 values.
# for num_column in num_columns_list:
#     dataset[num_column] = minmax_scale(dataset[num_column]) 

In [None]:
# Preview the first rows of the frame.
dataset.head()

In [None]:
# Feature frame: every column after the first one, minus the identifier/metadata columns.
# drop() returns a new frame here, so the positional slice of `dataset` is never mutated
# in place (dropping in place on that slice triggers pandas' SettingWithCopyWarning).
X = dataset.iloc[:, 1:].drop(columns=['SpeciesName', 'SpeciesID', 'DNAtype', 'Ncodons'])
X

In [None]:
# Convert the feature frame to a float NumPy array.
# NOTE(review): `X` is rebound from a DataFrame to an ndarray here, so re-running this
# cell without first re-running the cell that builds `X` raises AttributeError
# (an ndarray has no `.values`).
X = X.values.astype(float)
# Labels are taken from the first column by position — presumably 'Kingdom'; confirm
# against the CSV column order.
y = dataset.iloc[:, 0]

In [None]:
class DataGenerator:

    def __init__(self, X, y, sampleSize=16):
        self.X = X
        self.y = y
        self.sampleSize = sampleSize
        if self.sampleSize <= 0 and self.sampleSize > len(self.X):
            return TypeError("sampleSize must be between 0 and ", len(X))

    def split_data(self, test_size=0.3, random_state=1):
        self.X_train, self.X_rem, self.y_train, self.y_rem = tts(self.X, self.y, test_size=test_size, random_state=random_state)
        return self.X_train, self.X_rem, self.y_train, self.y_rem
    

In [None]:
# Shape of the feature array.
X.shape

In [None]:
# Shape of the label vector.
y.shape

In [None]:
# Wrap the arrays and split them with split_data's defaults (test_size=0.3, random_state=1).
data_generator = DataGenerator(y=y, X=X)
X_training, X_testing, y_training, y_testing = data_generator.split_data()

In [None]:
def sampler(X, y, technique):
    """Over-sample ``(X, y)`` with the chosen imbalanced-learn technique.

    Args:
        X: feature matrix.
        y: labels aligned with the rows of ``X``.
        technique: ``'ros'`` (RandomOverSampler), ``'smote'`` (SMOTE) or
            ``'adasyn'`` (ADASYN). Every resampler is created with ``random_state=1``.

    Returns:
        The ``(X_resampled, y_resampled)`` pair produced by ``fit_resample``.

    Raises:
        ValueError: if ``technique`` is not one of the supported names.
    """
    # Reject unknown names up front; otherwise no branch assigns the result and the
    # function fails with an UnboundLocalError at the return statement.
    if technique not in ('ros', 'smote', 'adasyn'):
        raise ValueError(
            "technique must be one of 'ros', 'smote' or 'adasyn', got {!r}".format(technique))

    if technique == 'ros':
        resampler = RandomOverSampler(random_state=1)
    elif technique == 'smote':
        resampler = SMOTE(random_state=1)
    else:
        resampler = ADASYN(random_state=1)

    X_resampled, y_resampled = resampler.fit_resample(X, y)
    return X_resampled, y_resampled

In [None]:
class MultiClassClassification:
    """Naive-Bayes-style classifier with one fitted density per (feature, class) pair.

    ``fit`` estimates a class prior and, for every feature of every class, a
    one-dimensional ``gaussian_kde``. ``predict`` scores a sample for each class as
    the product of the per-feature densities times the class prior and picks the
    class with the highest score.

    Attributes set by ``fit``:
        prior: array of class frequencies in the training data.
        n_features: number of columns of ``X_train``.
        features: dict mapping a (feature, class) key to its fitted density.
    """

    def __init__(self, X_train, y_train, laplace_smoothing_value, laplace, total_classes=11):
        """Store the training data and configuration; nothing is fitted here.

        Args:
            X_train: 2-D array of training features (indexed as ``X[rows][:, col]``).
            y_train: class ids aligned with the rows of ``X_train``; classes are the
                integers ``0 .. total_classes - 1``.
            laplace_smoothing_value: constant used by ``laplace_smoothing``.
            laplace: when truthy, ``predict`` uses ``laplace_probability``.
            total_classes: number of classes.
        """
        self.laplace_smoothing_value = laplace_smoothing_value
        self.laplace = laplace
        self.total_classes = total_classes
        self.X_train = X_train
        self.y_train = y_train
        self.fitFeatures = []

    @staticmethod
    def _feature_key(feature_index, class_index):
        """Return the ``features`` dict key for one (feature, class) pair.

        The two indices are separated explicitly: plain concatenation of the numbers
        is ambiguous (feature 1 / class 10 and feature 11 / class 0 would both give
        'X110'), which lets one fitted density silently overwrite another.
        """
        return 'X{}_{}'.format(feature_index, class_index)

    def _density(self, feature_index, class_index, value):
        """Evaluate the fitted density of one (feature, class) pair at ``value``.

        The result is converted to a plain float: ``gaussian_kde.pdf`` returns a
        one-element array for a scalar input, and assigning such an array into a
        scalar slot is deprecated in recent NumPy releases.
        """
        dist = self.features[self._feature_key(feature_index, class_index)]
        return float(np.asarray(dist.pdf(value)).item())

    def fit_distribution(self, data):
        """Fit and return a ``gaussian_kde`` for the 1-D sample ``data``."""
        return gaussian_kde(data)

    def laplace_smoothing(self, classes, length):
        """Apply the additive smoothing formula to the first ``length`` entries.

        Each entry ``v`` becomes
        ``(v + laplace_smoothing_value) / (v.size + laplace_smoothing_value * n_features)``.

        Returns:
            An object array of length ``total_classes`` holding the smoothed values.
        """
        # NOTE(review): `classes[i].size` is the size of a single array element
        # (1 for a NumPy scalar) — confirm this is the intended denominator.
        smooths = np.empty(self.total_classes, dtype=object)
        for i in range(length):
            smooths[i] = (classes[i] + self.laplace_smoothing_value) / \
                ((classes[i].size) +
                 self.laplace_smoothing_value * self.n_features)

        return smooths

    def probability(self, data):
        """Return the unnormalised score of every class for one sample.

        Args:
            data: one sample, indexable by feature position.

        Returns:
            Array of length ``total_classes``: product of the per-feature densities
            multiplied by the class prior.
        """
        py = np.ones(self.total_classes)

        for i in range(self.n_features):
            for j in range(self.total_classes):
                py[j] *= self._density(i, j, data[i])

        for j in range(self.total_classes):
            py[j] *= self.prior[j]

        return py

    def laplace_probability(self, data):
        """Return the per-class scores for one sample using the smoothing formula.

        Args:
            data: one sample, indexable by feature position.

        Returns:
            Array of length ``total_classes``.
        """
        py = np.ones(self.total_classes)
        pdf = np.ones(self.total_classes)

        # NOTE(review): `pdf` holds the running product over the features seen so
        # far, and its smoothed value is multiplied into `py` after every feature —
        # confirm this is the intended smoothing scheme.
        for i in range(self.n_features):
            for j in range(self.total_classes):
                pdf[j] *= self._density(i, j, data[i])

            smooths = self.laplace_smoothing(pdf, self.total_classes)
            for j in range(self.total_classes):
                py[j] *= smooths[j]

        for j in range(self.total_classes):
            py[j] *= self.prior[j]

        return py

    def fit(self):
        """Estimate the class priors and one density per (feature, class) pair.

        Sets ``prior``, ``n_features`` and ``features`` and prints the number of
        features. Every class needs training rows that ``gaussian_kde`` can fit.
        """
        labels = np.asarray(self.y_train)

        # Rows of the training matrix grouped by class id.
        X_training_class = np.empty(self.total_classes, dtype=object)
        for j in range(self.total_classes):
            X_training_class[j] = self.X_train[labels == j]

        self.prior = np.zeros(self.total_classes)
        for j in range(self.total_classes):
            self.prior[j] = len(X_training_class[j]) / len(self.X_train)

        self.n_features = self.X_train.shape[1]
        print('Number of features: ', self.n_features)
        self.features = {}

        for i in range(self.n_features):
            for j in range(self.total_classes):
                self.features[self._feature_key(i, j)] = self.fit_distribution(
                    X_training_class[j][:, i])

    def predict(self, X_test, y_test):
        """Predict a class for every sample of ``X_test``.

        Args:
            X_test: iterable of samples.
            y_test: true class of each sample; passed through unchanged.

        Returns:
            ``(y_predicted, y_second_predicted, y)`` where ``y_predicted`` is an
            array with the highest-scoring class per sample, ``y_second_predicted``
            is an empty list (the runner-up is not computed) and ``y`` is
            an array of the targets from ``y_test``.
        """
        y_predicted = []
        y_second_predicted = []
        y = []
        for sample, target in zip(X_test, y_test):

            if self.laplace:
                py = self.laplace_probability(sample)

            else:
                py = self.probability(sample)

            y_predicted.append(np.argmax(py))
            y.append(target)

        y_predicted = np.array(y_predicted)
        y = np.array(y)
        return y_predicted, y_second_predicted, y


In [None]:
import matplotlib.pyplot as plt


def draw_matrix(true_positive, false_positive, false_negative, true_negative):
    '''
      Show the four counts of a binary confusion matrix as a 2x2 heat map.

      The top row holds (true_positive, false_negative) and the bottom row
      holds (false_positive, true_negative).
    '''
    top_row = [true_positive, false_negative]
    bottom_row = [false_positive, true_negative]
    grid = np.array([top_row, bottom_row])

    plt.imshow(grid, cmap=plt.cm.inferno_r, interpolation='nearest')
    plt.colorbar()
    plt.show()


def confusion_matrix(y, y_predicted, desired_class):
    '''
      One-vs-rest confusion counts for ``desired_class``.

      Args:
        y: array of true classes.
        y_predicted: array of predicted classes, aligned with ``y``.
        desired_class: the class treated as "positive".

      Returns:
        (true_positive, false_positive, false_negative, true_negative)

      Also prints the sizes of ``y`` and ``y_predicted``.
    '''
    print('total y', y.size)
    print('total y_predicted', y_predicted.size)
    actual = (y == desired_class)
    predicted = (y_predicted == desired_class)

    true_positive = (actual & predicted).sum()
    # False positive: predicted as the class although the truth is another class.
    false_positive = (~actual & predicted).sum()
    # False negative: the truth is the class but something else was predicted.
    false_negative = (actual & ~predicted).sum()
    true_negative = (~actual & ~predicted).sum()

    return true_positive, false_positive, false_negative, true_negative


In [None]:
def validation(given_y, y_predicted, classification_class='Kingdom'):
    '''
      Print one-vs-rest metrics for every class listed in ``classes_dict``.

      For each class this prints its name, the number of rows of the
      module-level ``dataset`` whose ``classification_class`` column equals the
      class id, the four confusion counts returned by ``confusion_matrix`` for
      ``given_y`` / ``y_predicted``, and the precision, recall, specificity and
      accuracy derived from those counts.

      Nothing is returned; all results go to standard output.
    '''
    for class_name, class_code in classes_dict.items():
        class_id = int(class_code)

        print(class_name)
        rows_of_class = dataset[dataset[classification_class] == class_id]
        print(rows_of_class[classification_class].count())

        counts = confusion_matrix(given_y, y_predicted, class_id)
        true_positive, false_positive, false_negative, true_negative = counts

        print("true positive:", true_positive)
        print("false negative:", false_negative)
        print("false positive:", false_positive)
        print("true negative:", true_negative)

        precision = true_positive / (true_positive + false_positive)
        print("Precision:", precision)

        recall = true_positive / (true_positive + false_negative)
        print("True Positive rate or Recall:", recall)

        specificity = true_negative / (true_negative + false_positive)
        print("specificity, selectivity or True Negative Rate:", specificity)

        accuracy = (true_positive + true_negative) / \
            (true_positive + false_positive + false_negative + true_negative)
        print("Accuracy:", accuracy)
        print("\n")

In [None]:
def original_accuracy(y, y_predicted):
    '''
      Fraction of positions where the predicted class equals the true class.

      Both arguments are compared element-wise with ``==``; the number of
      matches is divided by ``len(y)``.
    '''
    n_correct = (y == y_predicted).sum()
    return n_correct / len(y)
