In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
%matplotlib inline
from sklearn.preprocessing import minmax_scale


In [None]:
# !wget https://archive.ics.uci.edu/ml/machine-learning-databases/00577/codon_usage.csv.zip

In [None]:
# !unzip codon_usage.csv.zip

In [3]:
def read_data(file_name):
    """
    Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    file_name : str, path or file-like object
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    # low_memory=False: parse the file in one go rather than in internal
    # chunks, so a column's dtype is inferred from all of its values.
    frame = pd.read_csv(file_name, low_memory=False)
    return frame

# Load the table; the relative path means codon_usage.csv must sit in the
# current working directory (see the commented-out wget/unzip cells above).
dataset = read_data('codon_usage.csv')

In [None]:
# (number of rows, number of columns) of the table as loaded.
dataset.shape

In [None]:
# Per-column dtype and non-null count summary.
dataset.info()

In [None]:
# Summary statistics for every column; include='all' adds the non-numeric
# columns (count/unique/top/freq) next to the numeric ones.
dataset.describe(include='all')

In [7]:
# Remove every row whose SpeciesID is 353569.
# NOTE(review): the reason for excluding this ID is not recorded in the
# notebook - presumably a malformed record; confirm and document.
is_excluded = dataset['SpeciesID'] == 353569
rows_to_remove = dataset.loc[is_excluded].index
dataset = dataset.drop(index=rows_to_remove)

In [8]:
# Same all-column summary, recomputed after the SpeciesID 353569 rows were
# dropped in the previous cell.
dataset.describe(include='all')

Unnamed: 0,Kingdom,DNAtype,SpeciesID,Ncodons,SpeciesName,UUU,UUC,UUA,UUG,CUU,...,CGG,AGA,AGG,GAU,GAC,GAA,GAG,UAA,UAG,UGA
count,13027,13027.0,13027.0,13027.0,13027,13027.0,13027.0,13027.0,13027.0,13027.0,...,13027.0,13027.0,13027.0,13027.0,13027.0,13027.0,13027.0,13027.0,13027.0,13027.0
unique,11,,,,13015,4790.0,4120.0,,,,...,,,,,,,,,,
top,bct,,,,Escherichia coli O157,0.0,0.01911,,,,...,,,,,,,,,,
freq,2919,,,,4,81.0,43.0,,,,...,,,,,,,,,,
mean,,0.367237,130433.978583,79611.74,,,,0.020637,0.014104,0.017821,...,0.005452,0.009929,0.006422,0.024179,0.021166,0.02829,0.021683,0.001642,0.00059,0.006178
std,,0.688745,124776.561488,719728.3,,,,0.02071,0.00928,0.010587,...,0.006601,0.008574,0.006387,0.013827,0.01304,0.014343,0.015018,0.001794,0.000882,0.010345
min,,0.0,7.0,1000.0,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.0,28850.5,1602.0,,,,0.00561,0.007105,0.01089,...,0.00122,0.00169,0.00117,0.012385,0.01186,0.01736,0.00971,0.00056,0.0,0.00041
50%,,0.0,81971.0,2929.0,,,,0.01526,0.01336,0.01613,...,0.00353,0.00927,0.00454,0.02542,0.01907,0.02608,0.02054,0.00138,0.00042,0.00113
75%,,1.0,222890.0,9120.0,,,,0.02949,0.019805,0.02273,...,0.00715,0.015925,0.01025,0.03419,0.02769,0.0368,0.031125,0.00237,0.00083,0.00289


In [9]:
# Number of missing values in each column.
dataset.isnull().sum()

Kingdom        0
DNAtype        0
SpeciesID      0
Ncodons        0
SpeciesName    0
              ..
GAA            0
GAG            0
UAA            0
UAG            0
UGA            0
Length: 69, dtype: int64

In [15]:
# Print the distinct Kingdom labels, then display how many there are.
kingdom_labels = dataset['Kingdom'].unique()
print(kingdom_labels)
len(kingdom_labels)

['vrl' 'arc' 'bct' 'phg' 'plm' 'pln' 'inv' 'vrt' 'mam' 'rod' 'pri']


11

In [None]:
# Display the table itself.
# NOTE(review): dataset.head() would keep the saved output smaller.
dataset

In [17]:
# Number of rows per Kingdom label, most frequent first.
# NOTE(review): a dropna() call was left commented out here; the isnull()
# summary above shows 0 for the columns it prints.

dataset['Kingdom'].value_counts()


bct    2919
vrl    2832
pln    2523
vrt    2077
inv    1345
mam     572
phg     220
rod     215
pri     180
arc     126
plm      18
Name: Kingdom, dtype: int64

In [None]:
# Per-column count of missing values, printed as plain text.
print(dataset.isnull().sum())

In [None]:
# Distinct Kingdom labels present in the table.
dataset['Kingdom'].unique()

In [None]:
# Display the table itself.
# NOTE(review): dataset.head() would keep the saved output smaller.
dataset

In [None]:
# describe() reports unique/top/freq for UUU and UUC, i.e. they were not
# parsed as numbers on load; convert both to float before picking numerics.
for codon_column in ('UUU', 'UUC'):
    dataset[codon_column] = dataset[codon_column].astype(float)

cols = dataset.select_dtypes(np.number).columns

# Every numeric column is rescaled except these three metadata columns.
num_columns_list = list(cols)
for metadata_column in ('DNAtype', 'SpeciesID', 'Ncodons'):
    num_columns_list.remove(metadata_column)

# Min-max scale each remaining column independently, overwriting it in place.
for num_column in num_columns_list:
    scaled_values = minmax_scale(dataset[num_column])
    dataset[num_column] = scaled_values

In [None]:
# Array of the distinct Kingdom labels.
species_kingdoms = dataset['Kingdom'].unique()

In [None]:
# Column names from position 5 onward, as a NumPy array. Per the describe()
# header the first five columns are Kingdom, DNAtype, SpeciesID, Ncodons and
# SpeciesName, so this leaves the codon columns. Run this before
# 'Kingdom_Class' is appended to the frame, or that column is picked up too.
features = dataset.columns[5:].values

In [None]:
# Show the selected feature names.
features

In [None]:
def hist_plot(feature,  classes='Kingdom'):
    """
    Show the distribution of one column of the module-level ``dataset``.

    Parameters
    ----------
    feature : str
        Column plotted on the x axis; also used as the plot title.
    classes : str, default 'Kingdom'
        Column whose values determine the hue of the plotted groups.
    """
    displot_kwargs = {'data': dataset, 'x': feature, 'hue': classes}
    sns.displot(**displot_kwargs)
    # displot opens its own figure, so the current Axes is the one just drawn.
    plt.gca().set_title(feature)
    plt.show()
    


def boxplot_plot(feature, classes='Kingdom', axes=None):
    """
    Draw a boxplot of one column of the module-level ``dataset``.

    Parameters
    ----------
    feature : str
        Column plotted on the y axis; also used as the plot title.
    classes : str, default 'Kingdom'
        Column whose values define the groups along the x axis.
    axes : matplotlib Axes, optional
        Axes to draw into. When omitted, seaborn draws on the current Axes
        and the figure is shown immediately.
    """
    target_axes = sns.boxplot(x=classes, y=feature, data=dataset, ax=axes)
    # Title the Axes that was actually drawn on; plt.title() would title the
    # *current* Axes, which need not be the one passed in through ``axes``.
    target_axes.set_title(feature)
    if axes is None:
        # Only flush the figure when this function owns it. A caller who
        # supplied ``axes`` is composing a larger figure and shows it itself.
        plt.show()

In [None]:
# First pass over the features: histogram and boxplot grouped by the original
# Kingdom labels. The 'Kingdom_Class' column is only added to `dataset` by a
# later cell, so grouping by it here fails when the notebook is run top to
# bottom on a fresh kernel.
# Features listed in anomaly_lister (e.g. ['UUU', 'UUC']) get the histogram
# but no boxplot; the list is currently empty.
anomaly_lister = []
for feature in features:
    hist_plot(feature, classes='Kingdom')
    if feature not in anomaly_lister:
        print(feature)
        boxplot_plot(feature, classes='Kingdom')

In [None]:
# Correlation heatmap over the numeric columns only. Calling corr() on the
# whole frame also feeds it the string columns (Kingdom, SpeciesName), which
# pandas >= 2.0 rejects with an error instead of silently skipping them.
numeric_frame = dataset.select_dtypes(np.number)
plt.figure(figsize=[100,80])
sns.heatmap(numeric_frame.corr(), annot=True, cmap = 'viridis_r', fmt = '.2f')
# Written to the working directory as a PDF.
plt.savefig('heatmap.pdf')

In [None]:
# Collapse the eleven Kingdom labels into a binary Kingdom_Class column.
# NOTE(review): the grouping looks like non-eukaryotic (0) vs eukaryotic (1)
# - confirm the intended meaning and record it here.
class_zero_kingdoms = ('arc', 'bct', 'phg', 'plm', 'vrl')
class_one_kingdoms = ('pln', 'inv', 'vrt', 'mam', 'rod', 'pri')

kingdom_to_class = {}
kingdom_to_class.update(dict.fromkeys(class_zero_kingdoms, 0))
kingdom_to_class.update(dict.fromkeys(class_one_kingdoms, 1))

# Labels missing from the mapping would become NaN in the new column.
dataset['Kingdom_Class'] = dataset['Kingdom'].map(kingdom_to_class)

In [None]:
# Per-feature histogram and boxplot grouped by the binary Kingdom_Class
# column. Features listed in anomaly_lister (e.g. ['UUU', 'UUC']) get the
# histogram but no boxplot; the list is currently empty.
anomaly_lister = []
for feature in features:
    hist_plot(feature, classes='Kingdom_Class')
    if feature in anomaly_lister:
        continue
    print(feature)
    boxplot_plot(feature, classes='Kingdom_Class')

In [None]:
# Correlation heatmap over the numeric columns only. Calling corr() on the
# whole frame also feeds it the string columns (Kingdom, SpeciesName), which
# pandas >= 2.0 rejects with an error instead of silently skipping them.
numeric_frame = dataset.select_dtypes(np.number)
plt.figure(figsize=[100,80])
sns.heatmap(numeric_frame.corr(), annot=True, cmap = 'viridis_r', fmt = '.2f')
# NOTE(review): this writes to the same 'heatmap.pdf' as the earlier heatmap
# cell and therefore replaces that file.
plt.savefig('heatmap.pdf')