In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.preprocessing import minmax_scale


In [None]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip

In [None]:
# !unzip codon_usage.csv.zip

In [None]:
def read_data(file_name):
    """
    Load a CSV file into a pandas DataFrame.

    file_name: path (or file-like object) handed straight to ``pd.read_csv``.

    ``low_memory=False`` makes pandas parse the file in one pass instead of
    in chunks, so each column gets a single inferred dtype.
    """
    frame = pd.read_csv(file_name, low_memory=False)
    return frame

# Load the codon-usage CSV (expected in the working directory) into the
# notebook-wide `dataset` frame that every later cell reads and mutates.
dataset = read_data('codon_usage.csv')

In [None]:
# Dimensions of the loaded frame as (rows, columns).
dataset.shape

In [None]:
# Per-column dtype and non-null count summary.
dataset.info()

In [None]:
# Summary statistics for every column (numeric and non-numeric alike).
dataset.describe(include='all')

In [None]:
# Remove every row whose SpeciesID is 353569.
# NOTE(review): presumably this record holds malformed values — confirm and
# record the reason here.
rows_to_drop = dataset.loc[dataset['SpeciesID'] == 353569].index
dataset = dataset.drop(index=rows_to_drop)

In [None]:
# Summary statistics again, now on the frame with SpeciesID 353569 removed.
dataset.describe(include='all')

In [None]:
# Number of missing values in each column.
dataset.isnull().sum()

In [None]:
# Distinct Kingdom labels, followed by how many there are.
kingdom_labels = dataset['Kingdom'].unique()
print(kingdom_labels)
len(kingdom_labels)

In [None]:
# Display the frame (Jupyter renders a truncated preview of large frames).
dataset

In [None]:
# Rows with missing values are intentionally kept (dropna is disabled):
# dataset.dropna(inplace=True)

# Number of rows per Kingdom label.
dataset['Kingdom'].value_counts()


In [None]:
# Plain-text printout of the per-column missing-value counts.
print(dataset.isnull().sum())

In [None]:
# Array of the distinct Kingdom labels present in the frame.
dataset['Kingdom'].unique()

In [None]:
# Display the frame again to inspect its current state.
dataset

In [None]:
# Cast the two codon columns to float before the numeric-column selection
# below.
# NOTE(review): presumably these are read as object dtype — confirm with
# dataset.info().
for codon in ('UUU', 'UUC'):
    dataset[codon] = dataset[codon].astype(float)

cols = dataset.select_dtypes(np.number).columns

# These three numeric columns are excluded from scaling (presumably they are
# identifiers / counts rather than frequencies — verify).
num_columns_list = cols.tolist()
for unscaled in ('DNAtype', 'SpeciesID', 'Ncodons'):
    num_columns_list.remove(unscaled)

# Rescale each remaining numeric column independently with min-max scaling.
for num_column in num_columns_list:
    dataset[num_column] = minmax_scale(dataset[num_column])

In [None]:
# Distinct Kingdom labels, kept in a variable for later use.
species_kingdoms = dataset['Kingdom'].unique()

In [None]:
# Feature names are every column from position 5 onward.
# NOTE(review): presumably the first five columns are metadata (Kingdom,
# DNAtype, SpeciesID, Ncodons, ...) — verify against the CSV header.
features = dataset.columns[5:].to_numpy()

In [None]:
# Show the selected feature column names.
features

In [None]:
def hist_plot(feature, classes='Kingdom'):
    """
    Draw the distribution of one column of the global ``dataset``.

    feature: column name placed on the x axis; also used as the plot title.
    classes: column name used to colour the distribution (default 'Kingdom').

    The figure is shown immediately; nothing is returned.
    """
    sns.displot(x=feature, hue=classes, data=dataset)
    # displot creates its own figure, so the current axes is the new plot.
    plt.gca().set_title(feature)
    plt.show()
    


def boxplot_plot(feature, classes='Kingdom', axes=None):
    """
    Draw a boxplot of one column of the global ``dataset``, grouped by class.

    feature: column name plotted on the y axis; also used as the title.
    classes: column name whose values form the x-axis groups
             (default 'Kingdom').
    axes:    optional matplotlib Axes to draw into. When omitted, the plot
             goes onto the current axes and is shown immediately. When
             supplied, the title is set on that Axes and the figure is NOT
             shown, so the caller can keep composing a multi-panel figure
             and call ``plt.show()`` once.
    """
    # seaborn returns the Axes it drew on (the supplied one, or the current
    # axes when ``axes`` is None), so the title always lands on the same
    # Axes that holds the boxplot.
    drawn_on = sns.boxplot(x=classes, y=feature, data=dataset, ax=axes)
    drawn_on.set_title(feature)
    if axes is None:
        plt.show()

In [None]:
# Features listed here get a histogram only (no boxplot); none are skipped
# at the moment. Example: anomaly_lister = ['UUU', 'UUC']
anomaly_lister = []
for feature in features:
    hist_plot(feature, classes='Kingdom')
    if feature in anomaly_lister:
        continue
    print(feature)
    boxplot_plot(feature, classes='Kingdom')

In [None]:
# Collapse the Kingdom labels into a binary Kingdom_Class column.
# Labels not listed in either group map to NaN.
class_zero_kingdoms = ('arc', 'bct', 'phg', 'plm', 'vrl')
class_one_kingdoms = ('pln', 'inv', 'vrt', 'mam', 'rod', 'pri')
kingdom_to_class = {
    **dict.fromkeys(class_zero_kingdoms, 0),
    **dict.fromkeys(class_one_kingdoms, 1),
}
dataset['Kingdom_Class'] = dataset['Kingdom'].map(kingdom_to_class)

In [None]:
# Same per-feature plots as before, now grouped by the binary Kingdom_Class.
# Features listed here get a histogram only (no boxplot); none are skipped
# at the moment. Example: anomaly_lister = ['UUU', 'UUC']
anomaly_lister = []
for feature in features:
    hist_plot(feature, classes='Kingdom_Class')
    if feature in anomaly_lister:
        continue
    print(feature)
    boxplot_plot(feature, classes='Kingdom_Class')

In [None]:
# Correlation heatmap, saved to heatmap.pdf.
# Only numeric columns are correlated: the frame still contains string
# columns (e.g. 'Kingdom'), and DataFrame.corr() raises on those in
# pandas >= 2.0, where non-numeric columns are no longer dropped silently.
plt.figure(figsize=[100,80])
numeric_data = dataset.select_dtypes(include=np.number)
sns.heatmap(numeric_data.corr(), annot=True, cmap = 'viridis_r', fmt = '.2f')
plt.savefig('heatmap.pdf')

In [None]:
# Pairwise relationship grid of the frame's variables, coloured by Kingdom,
# saved to pairplot.pdf.
# NOTE(review): the grid grows quadratically with the number of numeric
# columns — this is likely very slow on the full frame; confirm it is wanted.
pair_grid = sns.pairplot(data=dataset, hue='Kingdom')
plt.savefig('pairplot.pdf')