# Exploratory Data Analysis

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns

### Overview of Data

In [22]:
# Load the training features and labels. Per the original note, every row has
# a unique 'id', so that column carries no useful information and is removed
# straight after reading.
# NOTE(review): once 'id' is gone, X and y can only be matched by row position;
# this assumes both files list the samples in the same order — verify.
X = pd.read_csv('./data/X_train.csv').drop(columns='id')
y = pd.read_csv('./data/y_train.csv').drop(columns='id')

# Dataset dimensions
print(f'X has {len(X)} data points, each with {len(X.columns)} features')
print(f'y has {len(y)} labels')
print()

# Summary statistics of the label column 'y'
print('max(y): ', y['y'].max())
print('min(y): ', y['y'].min())
print(f'mean(y): {y["y"].mean():.2f}')
print(f'std(y): {y["y"].std():.2f}')

X has 1212 data points, each with 832 features
y has 1212 labels

max(y):  97.0
min(y):  42.0
mean(y): 69.89
std(y): 9.72


#### Missing Data

In [23]:
# Missing-value summary for the features.
# X_nan_count holds, for each feature column of X, the number of NaN entries.
X_nan_count = X.isna().sum()
print(f'Maximum missing values: {X_nan_count.max()}/{X.shape[0]}')
print(f'Minimum missing values: {X_nan_count.min()}/{X.shape[0]}')
print('Average missing values: ', X_nan_count.mean())
# Same "label: value" layout as the average line above
print('Std missing values: ', X_nan_count.std())

print()

# Missing-value summary for the labels.
# The column is selected with ['y'] rather than attribute access, which can
# silently collide with DataFrame attributes/methods of the same name.
print(f'There are {int(y["y"].isna().sum())} missing labels')

Maximum missing values: 121/1212
Minimum missing values: 65/1212
Average missing values:  92.43028846153847
Std missing values:,  9.138719462730808

There are 0 missing labels


#### Feature Selection

Drop every feature column that contains only zero or missing entries.

Drop every duplicate feature column.

Drop every duplicate sample row, together with its label.


In [34]:
# Step 1: find feature columns in which every entry is either zero or missing,
# report how many there are, and remove them.
zero_feature_cols = X.columns[(X.eq(0.0) | X.isna()).all(axis=0)]
print(f'Dropping {zero_feature_cols.size} feature columns with only zero or empty entries.')
X_select = X.drop(columns=zero_feature_cols)

# Step 2: flag repeated feature columns (detected on the transpose) and
# repeated sample rows. Both masks are computed on the same frame, before any
# duplicate is removed.
dup_feats = X_select.T.duplicated()
dup_samples = X_select.duplicated()
print(f'Dropping {dup_feats.sum()} duplicate feature columns and {dup_samples.sum()} duplicate sample rows.')

# Step 3: keep only the feature columns not flagged as duplicates.
X_select = X_select.loc[:, ~dup_feats]
print(f'After feature selection, X has {len(X_select.columns)} features remaining.')

# Step 4: keep only the sample rows not flagged as duplicates, applying the
# same row mask to the labels so X_select and y_select stay aligned.
X_select = X_select.loc[~dup_samples]
y_select = y.loc[~dup_samples]
print(f'After dropping duplicate samples, X has {len(X_select)} data points remaining.')

Dropping 4 feature columns with only zero or empty entries.
Dropping 0 duplicate feature columns and 0 duplicate sample rows.
After feature selection, X has 828 features remaining.
After dropping duplicate samples, X has 1212 data points remaining.
