# Overview
- Work in progress...

# Exploratory Data Analysis

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import datatable as dt
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans

import plotly.express as px

from scipy import stats
from scipy.stats import norm, skew

import gc
plt.style.use('ggplot')

cust_color = ['#fdc029',
'#f7c14c',
'#f0c268',
'#e8c381',
'#dfc498',
'#d4c5af',
'#c6c6c6',
'#a6a6a8',
'#86868a',
'#68686d',
'#4b4c52',
'#303138',
'#171820',
]

plt.rcParams['figure.figsize'] = (18,14)
plt.rcParams['figure.dpi'] = 300
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.color"] = cust_color[3]
plt.rcParams["grid.alpha"] = 0.5
plt.rcParams["grid.linestyle"] = '--'
plt.rcParams["font.family"] = "monospace"

plt.rcParams['axes.edgecolor'] = 'black'
plt.rcParams['figure.frameon'] = False
plt.rcParams['axes.spines.left'] = False
plt.rcParams['axes.spines.bottom'] = False
plt.rcParams['axes.spines.top'] = False
plt.rcParams['axes.spines.right'] = False
plt.rcParams['axes.linewidth'] = 1.5

import warnings
warnings.filterwarnings("ignore")



ModuleNotFoundError: No module named 'datatable'

In [None]:
df = dt.fread('../input/ubiquant-market-prediction/train.csv').to_pandas()

## First Look

We have train, test and submission .csv's, let's take a look our train data first.

In [None]:
df.head()

Number of instances

In [None]:
print(f'Train df number of instance: {df.shape[0]}')

Missing Values:

In [None]:
print(f'Train df missing value count: {df.isna().sum().sum()}')

Investments:

In [None]:
print(f'Train df number of unique investments: {df.investment_id.nunique()}')

In [None]:
print(f'Train df number of unique investments: {df.time_id.nunique()}')

It seems most of the investments ID's having 800+ timestamp records, while there are some having much less with left skewed distribution...

In [None]:
time_count=df.groupby("investment_id")['time_id'].count()
fig, ax = plt.subplots(figsize=(12,9))
sns.histplot(time_count, color=cust_color[-1], kde=True)
plt.title('Number of time_id\'s per Investment Distribution')
plt.show()

# Random Sampling

Since we have high number of instances (3141410) let's take some samples representing the actual population.

In [None]:
sampled_df = df.sample(frac=0.05, random_state=42)

In [None]:
# from statsmodels.stats.weightstats import ztest
# diff = np.mean(df.target) - np.mean(sampled_df.target)
# t, p = ztest(df.target, x2=sampled_df.target, value=diff)
# (np.nanmean(sampled_df.target) - np.nanmean(df.target)) / df.target.std()

In [None]:
del df
gc.collect()

Converting features "float16" to save some memory.

In [None]:
features = [f'f_{i}' for i in range(300)]

for f in features:
    sampled_df[f] = sampled_df[f].astype('float16')

# Target Distribution

In [None]:
def plot_dist3(df, feature, title):
    
    # Creating a customized chart. and giving in figsize and everything.
    
    fig = plt.figure(constrained_layout=True)
    
    # creating a grid of 3 cols and 3 rows.
    
    grid = gridspec.GridSpec(ncols=3, nrows=2, figure=fig)

    # Customizing the histogram grid.
    
    ax1 = fig.add_subplot(grid[0, :2])
    
    # Set the title.
    
    ax1.set_title('Histogram')
    
    # plot the histogram.
    
    sns.distplot(df.loc[:, feature],
                 hist=True,
                 kde=True,
                 fit=norm,
                  hist_kws={
                 'rwidth': 0.85,
                 'edgecolor': 'black',
                 'linewidth':.5,
                 'alpha': 0.8},
                 ax=ax1,
                 color=cust_color[0])
    
    ax1.axvline(df.loc[:, feature].mean(), color='Green', linestyle='dashed', linewidth=3)

    min_ylim, max_ylim = plt.ylim()
    ax1.text(df.loc[:, feature].mean()*2, max_ylim*0.95, 'Mean: {:.2f}'.format(df.loc[:, feature].mean()), color='Green', fontsize='12',
             bbox=dict(boxstyle='round',facecolor='red', alpha=0.5))
    ax1.legend(labels=['Actual','Normal'])
    ax1.xaxis.set_major_locator(MaxNLocator(nbins=12))
    
    ax2 = fig.add_subplot(grid[1, :2])
    
    # Set the title.
    
    ax2.set_title('Probability Plot')
    
    # Plotting the QQ_Plot.
    stats.probplot(df.loc[:, feature],
                   plot=ax2)
    ax2.get_lines()[0].set_markerfacecolor('#e74c3c')
    ax2.get_lines()[0].set_markersize(12.0)
    ax2.xaxis.set_major_locator(MaxNLocator(nbins=16))

    # Customizing the Box Plot:
    
    ax3 = fig.add_subplot(grid[:, 2])
    # Set title.
    
    ax3.set_title('Box Plot')
    
    # Plotting the box plot.
    
    sns.boxplot(y=feature, data=df, ax=ax3, color=cust_color[0])
    ax3.yaxis.set_major_locator(MaxNLocator(nbins=24))
    #ax3.set_ylim(0,clip_value)

    plt.suptitle(f'{title}', fontsize=24, fontname = 'monospace', weight='bold')

In [None]:
plot_dist3(sampled_df, 'target', 'Survey Duration Distribution')

Target has decent distribution centered around 0 with a peak in the middle.

# Some 'Odd' Feature Distributions

These are the top features where their distribution doesn't fit "normal" standards. We mighty something useful four our models by looking at them.

In [None]:
features_std = sampled_df.iloc[:,4:].apply(lambda x: x.std()).sort_values(
    ascending=False)
f_std = sampled_df[features_std.iloc[:20].index.tolist()]

features_skew = np.abs(sampled_df.iloc[:,4:].apply(lambda x: skew(x)).sort_values(
    ascending=False))
skewed = sampled_df[features_skew.iloc[:20].index.tolist()]

In [None]:
def feat_dist(df, cols, rows=3, columns=3, title=None):
    
    '''A function for displaying skew feat distribution'''
    
    fig, axes = plt.subplots(rows, columns, figsize=(30, 25), constrained_layout=True)
    axes = axes.flatten()

    for i, j in zip(cols, axes):
        sns.distplot(
                    df[i],
                    ax=j,
                    fit=norm,
                    hist=False,
                    color=cust_color[3],
                    kde_kws={'linewidth':3}
        )   
        
        (mu, sigma) = norm.fit(df[i])
        j.set_title('Dist of {0} Norm Fit: $\mu=${1:.2g}, $\sigma=${2:.2f}'.format(i, mu, sigma), weight='bold')
        j.legend(labels=[f'{i}', 'Normal Dist'])
        fig.suptitle(f'{title}', fontsize=24, weight='bold')

In [None]:
feat_dist(sampled_df, f_std.columns.tolist(), rows=5, columns=4, title='Distribution of High Std Features')

In [None]:
# Creating distplot of features which has high skewness

feat_dist(sampled_df, skewed.columns.tolist(), rows=5, columns=4, title='Distribution of Skewed Features')

# Feature Target Correlation

In [None]:
correlations = sampled_df.corrwith(sampled_df['target']).iloc[:-1].to_frame()
correlations['Abs Corr'] = correlations[0].abs()
sorted_correlations = correlations.sort_values('Abs Corr', ascending=False)['Abs Corr']
fig, ax = plt.subplots(figsize=(6,8))
sns.heatmap(sorted_correlations.iloc[1:].to_frame()[sorted_correlations>=.04], cmap='coolwarm', annot=True, vmin=-1, vmax=1, ax=ax)
plt.title('Feature Correlations With Target')
plt.show()

Almost no linear correlation between features and target...

# Correlation Between Features

In [None]:
corr = sampled_df.iloc[:, 4:].corr()
sns.clustermap(corr, metric="correlation", cmap="Reds", figsize=(20, 20))
plt.suptitle('Correlations Between Features', fontsize=24, weight='bold')
plt.show()


# Correlations Between Features

In [None]:
corr = corr.abs()

corrs = corr.unstack()
pair = corrs.sort_values(ascending=False)
pair = pair.reset_index(name='correlation').rename(columns={'level_0': 'feature_a', 'level_1': 'feature_b', 0: 'correlation'})
pair = pair[pair['feature_a'] != pair['feature_b']].iloc[::2,:]
pair = pair[:10]
pair

Looks like there are some strongly correlated features. Let's take a closer look:

In [None]:
sns.jointplot(sampled_df[pair['feature_a'].iloc[0]], sampled_df[pair['feature_b'].iloc[0]], kind="reg", color=cust_color[0], height=8,
              joint_kws={'scatter_kws':dict(alpha=0.5, edgecolor="r", linewidth=0.5)})
plt.show()

Yeah linearity there for some features, let's take a look at the general picture with hexbins since there are many points to scatter this might show it better...

In [None]:
def hex_plot(df, rows=3, columns=3, title=None):
    
    '''A function for displaying skew feat distribution'''
    
    fig, axes = plt.subplots(rows, columns, figsize=(30, 25), constrained_layout=True)
    axes = axes.flatten()

    for i,j in enumerate(axes):
        j.hexbin(sampled_df[pair['feature_a'].iloc[i]], sampled_df[pair['feature_b'].iloc[i]],  gridsize=100, cmap='Reds', bins='log')
        j.set_xlabel(pair['feature_a'].iloc[i])
        j.set_ylabel(pair['feature_b'].iloc[i])

        fig.suptitle(f'{title}', fontsize=24, weight='bold')

In [None]:
hex_plot(sampled_df, rows=5, columns=2, title='Highly Correlated Features')

We can clearly see there are strong linear correlations between some features either negative or positive. Since we have kind of a regression problem in our hands we should take a closer look to these variables to prevent multicollinearity...

# Dimension Reduction and Clusters

Since the data is anonymized and lacking categorical variables we might want to look at some reduced dimension plots and use some unsupervised techniques to see if we can find some patterns.

In [None]:
features = sampled_df.iloc[:, 4:].columns.tolist()


pipe = Pipeline([('scaler', StandardScaler()),('pca', PCA())])
pipe.fit(sampled_df[features])
pca_samples = pipe.transform(sampled_df[features])

# explaining variance ratio:

fig, ax = plt.subplots(figsize=(14, 5))
plt.plot(range(sampled_df[features].shape[1]), pipe.named_steps['pca'].explained_variance_ratio_.cumsum(), linestyle='--', drawstyle='steps-mid', color=cust_color[-1],
         label='Cumulative Explained Variance', linewidth = 1.5)
sns.barplot(np.arange(1,sampled_df[features].shape[1]+1), pipe.named_steps['pca'].explained_variance_ratio_, alpha=0.85, color=cust_color[0],
            label='Individual Explained Variance', edgecolor='black', saturation = 2, linewidth = 0.5)

plt.ylabel('Explained Variance Ratio', fontsize = 14, fontname = 'monospace', weight='semibold')
plt.xlabel('Number of Principal Components', fontsize = 14, fontname = 'monospace', weight='semibold')
ax.set_title('Explained Variance', fontsize = 20, fontname = 'monospace', weight='bold')
plt.xticks(fontsize=8, rotation=90)
plt.legend(fontsize = 13)
plt.axis([0,99,0,1])

We do have many features but it seems we cannot reduce them to lower value without losing some signals. To even explain the 80% variance we might have to use 100 principal components.

Let's try our luck with clustering, maybe we can fit some instances into specific clusters so it can allow us to breakdown the problem and inspect different groups individualy. Let's see how many clusters we would need...

In [None]:
kmeans_per_k = [Pipeline([('scaler', StandardScaler()),('km', KMeans(n_clusters=k, random_state=42, max_iter=100, n_init=5, tol=1e-4))]).fit(sampled_df[features])
                for k in range(1, 8)]
inertias = [model.named_steps['km'].inertia_ for model in kmeans_per_k]

plt.figure(figsize=(6, 3))
sns.lineplot(range(1, 8), inertias, color=cust_color[0], linewidth = 1.5)
plt.xlabel("k", fontsize=15)
plt.ylabel("Inertia", fontsize=15)

plt.title('Inertias and n_clusters', fontname = 'monospace', weight='bold')
plt.show()

Hmm, Doesn't look good... Anyways we have the sharpest elbow at k=2 but let's try k=4 it has also somewhat decent curve.

In [None]:
z  = Pipeline([('scaler', StandardScaler()),('km', KMeans(n_clusters=4, random_state=42, max_iter=100, tol=1e-4))]).fit(sampled_df[features])
clusters = z.fit_predict(sampled_df[features])
clusters=[str(number) for number in clusters]

In [None]:
pipe = Pipeline([('scaler', StandardScaler()),('pca', PCA(n_components=2))])
pipe.fit(sampled_df[features])
pca_samples = pipe.transform(sampled_df[features])
sns.scatterplot(pca_samples[:,0], pca_samples[:,1], hue=clusters)
plt.title("Clusters on Reduced Dimension")
plt.show()

Well we have clusters but they don't mean much... Clusters are looking pretty close to each other.

In [None]:
pipe = Pipeline([('scaler', StandardScaler()),('pca', PCA(n_components=4))])
pipe.fit(sampled_df[features])
pca_samples = pipe.transform(sampled_df[features])

total_var = pipe.named_steps['pca'].explained_variance_ratio_.sum() * 100

labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pipe.named_steps['pca'].explained_variance_ratio_ * 100)
}
labels['color'] = 'Cluster'

fig = px.scatter_matrix(
    pca_samples,
    color=clusters,
    dimensions=range(4),
    labels=labels,
    title=f'Total Explained Variance: {total_var:.2f}% by Clusters',
    opacity=0.5
)
fig.update_traces(diagonal_visible=False)
fig.show()

The picture doesn't change much when we plot components against each other too. Oh well...

Next we should decide what method we can use to get some more insights using classical EDA techniques.

## Work in Progress...
