# Exploratory Data Analysis

Exploring the dataset characteristics

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

## Load Data

In [None]:
# Name of the dataset under study. It selects the input folder below and is
# later used to pick the matching rows out of the truth table.
dataset = 'Alkaline phosphatase PafA'

# Both splits sit in the same per-dataset folder; the folder name (including
# its double space) is reproduced verbatim.
data_dir = os.path.join('input', f'{dataset}  (In Silico_ Supervised)')
train_csv = os.path.join(data_dir, 'train.csv')
test_csv = os.path.join(data_dir, 'test (with values).csv')

# Gzip-compressed truth table, read from the output folder.
truth_csv = os.path.join('output', 'truth.csv.gz')

In [None]:
# Load the two splits with default CSV settings, then the truth table, which
# is stored gzip-compressed.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))
truth_df = pd.read_csv(truth_csv, compression='gzip')

In [None]:
# Preview the first rows of the training split loaded from train.csv.
train_df.head()

In [None]:
# Preview the first rows of the test split (the file that includes values).
test_df.head()

In [None]:
# Fraction of all rows (train + test) that belong to the training split.
n_train, n_test = len(train_df), len(test_df)
n_train / (n_train + n_test)

In [None]:
# Preview the first rows of the truth table read from truth.csv.gz.
truth_df.head()

In [None]:
# Restrict the truth table to the rows whose 'dataset' column matches the
# dataset being explored, then preview the result.
is_current_dataset = truth_df['dataset'] == dataset
truth_ds_df = truth_df[is_current_dataset]
truth_ds_df.head()

In [None]:
# True if at least one 'truth' value is missing for this dataset.
truth_ds_df['truth'].isna().any()

In [None]:
# Column labels of the training split as a plain Python list.
col_names = train_df.columns.tolist()
col_names

In [None]:
# The first two columns are treated as inputs; every remaining column is a
# target.
input_cols, target_cols = col_names[:2], col_names[2:]

(input_cols, target_cols)

In [None]:
# Report, for every split and every target column, whether any value is
# missing. Each line is labelled with the split and column name so the
# booleans can be attributed without counting output lines.
dfs = [train_df, test_df]
for split_name, df in zip(('train', 'test'), dfs):
    for col in target_cols:
        print(split_name, col, df[col].isna().any())

## Merge DataFrames

In [None]:
# Tag every training row with its split name (adds the column in place) so
# the rows stay identifiable after the frames are concatenated.
train_df['dataset'] = 'train'
train_df.head()

In [None]:
# Tag every test row with its split name (adds the column in place) so the
# rows stay identifiable after the frames are concatenated.
test_df['dataset'] = 'test'
test_df.head()

In [None]:
# Columns kept in the merged frame: the sequence, every target column, and
# the split tag. Unpacking replaces the element-by-element copy of
# target_cols.
merge_cols = ['mutated_sequence', *target_cols, 'dataset']
merge_cols

In [None]:
# Stack the train and test rows and keep only the merge columns. Each split
# keeps its own row index here; no re-indexing happens at this step.
merge_df = pd.concat([train_df, test_df])[merge_cols]
merge_df.head()

In [None]:
# Preview the last rows of the merged train + test frame.
merge_df.tail()

In [None]:
# Reshape the long truth table (one row per sequence/target pair) into a wide
# one: one row per sequence and one column per target. The leftover 'target'
# label on the column axis is cleared, and the sequence goes back to being an
# ordinary column.
merge2 = (
    truth_ds_df[['mutated_sequence', 'target', 'truth']]
    .pivot(index='mutated_sequence', columns='target', values='truth')
    .rename_axis(None, axis=1)
    .reset_index()
)

# Tag the rows so they can be told apart from the train and test rows.
merge2['dataset'] = 'truth'
merge2.head()

In [None]:
# Append the wide truth rows to the merged frame and renumber the rows
# 0..n-1, discarding the old index.
merge_df = pd.concat([merge_df, merge2], ignore_index=True)
merge_df.head()

In [None]:
# Preview the last rows of the merged frame after the truth rows were added.
merge_df.tail()

## Target Values

In [None]:
# sns.pairplot(data=merge_df, hue='dataset',vars=target_cols)
# plt.show()

In [None]:
# Pair plot of the target columns for the train and test rows only, coloured
# by split. The mask gets a descriptive name instead of `filter`, which would
# shadow the Python builtin for the rest of the session.
not_truth = merge_df['dataset'] != 'truth'
sns.pairplot(data=merge_df[not_truth], hue='dataset', vars=target_cols)
plt.show()

In [None]:
# test_df and truth_df appear to be the same
# filter = merge_df['dataset'] != 'train'
# sns.pairplot(data=merge_df[filter], hue='dataset',vars=target_cols)
# plt.show()

In [None]:
# Show training rows whose MecMUP value exceeds 500 (presumably the extreme
# outliers seen in the pair plot). The mask is named descriptively rather
# than `filter`, which would shadow the Python builtin.
mecmup_above_500 = train_df['methyl phosphodiester (MecMUP)'] > 500
train_df[mecmup_above_500].head()

In [None]:
# Pair plot of train and test rows restricted to MecMUP values below 500.
# Rows with a missing MecMUP value compare False and are dropped as well.
# The mask is named descriptively rather than `filter`, which would shadow
# the Python builtin.
not_truth_below_500 = (
    (merge_df['dataset'] != 'truth')
    & (merge_df['methyl phosphodiester (MecMUP)'] < 500)
)
sns.pairplot(data=merge_df[not_truth_below_500], hue='dataset', vars=target_cols)
plt.show()

In [None]:
# Show training rows whose MecMUP value exceeds 75, the tighter cut-off used
# by the next plot. The mask is named descriptively rather than `filter`,
# which would shadow the Python builtin.
mecmup_above_75 = train_df['methyl phosphodiester (MecMUP)'] > 75
train_df[mecmup_above_75].head()

In [None]:
# Pair plot of train and test rows restricted to MecMUP values below 75.
# Rows with a missing MecMUP value compare False and are dropped as well.
# The mask is named descriptively rather than `filter`, which would shadow
# the Python builtin.
not_truth_below_75 = (
    (merge_df['dataset'] != 'truth')
    & (merge_df['methyl phosphodiester (MecMUP)'] < 75)
)
sns.pairplot(data=merge_df[not_truth_below_75], hue='dataset', vars=target_cols)
plt.show()

In [None]:
# Print the mean and standard deviation of every target column in the
# training split. np.std uses its default ddof=0 (population standard
# deviation), which differs from pandas' Series.std() default of ddof=1.
for name in target_cols:
    # Convert once and reuse; `name` already is the column label, so no
    # index-based lookup into target_cols is needed.
    values = train_df[name].to_numpy()
    mu = np.mean(values)
    std = np.std(values)
    print(name, mu, std)

In [None]:
# TODO: what are the mean and std with the training outliers removed?

## Input Sequences

In [None]:
# TODO: analyze the variation in sequence lengths

In [None]:
# TODO: analyze the patterns in the protein/DNA sequences...somehow