# Data Analysis

## Load and Inspect Data

In [None]:
import sys
import os

# Make the project's ``src`` directory importable. The path is resolved against
# the current working directory. The membership check keeps the cell idempotent:
# re-running it no longer appends the same entry to ``sys.path`` again.
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from data_processing import load_train_and_test_data

train_data, test_data = load_train_and_test_data()

# Number of rows (instances) in the training set.
instances_count = train_data.shape[0]
print(f"Instances: {instances_count}")

In [None]:
# Preview the first rows of the training set.
train_data.head()

In [None]:
# Report how many rows (instances) the test set contains.
instances_count = len(test_data)
print(f"Instances: {instances_count}")

In [None]:
# Preview the first rows of the test set.
test_data.head()

In [None]:
import pandas as pd

# Tabulate the dtype of every training column in a single "Type" column.
data_types = train_data.dtypes
pd.DataFrame({'Type': data_types})

Observations:
- Target variables are casual, registered and count
- Possible categorical features based on sample values are season, holiday, workingday and weather
- Datetime requires conversion from the datetime64[ns] dtype to an int64 timestamp

## Data Cleaning and Preprocessing

### Handling Missing Values

In [None]:
# Count the missing entries in each column of the training set.
missing_values = train_data.isna().sum()
pd.DataFrame({"Count": missing_values})

In [None]:
# Count the missing entries in each column of the test set.
missing_values = test_data.isna().sum()
pd.DataFrame({"Count": missing_values})

### Handling Duplicate Rows

In [None]:
# Flag every training row that repeats an earlier row, then count the flags.
duplicate_mask = train_data.duplicated()
duplicates_count = duplicate_mask.sum()
print(f"Duplicate rows: {duplicates_count}")

In [None]:
# Flag every test row that repeats an earlier row, then count the flags.
duplicate_mask = test_data.duplicated()
duplicates_count = duplicate_mask.sum()
print(f"Duplicate rows: {duplicates_count}")

### Adjusting Data Types

In [None]:
from data_processing import convert_datetime_to_timestamp

# Convert the datetime column, then show only that column to check the result.
# NOTE(review): whether the helper returns a copy or mutates ``train_data``
# in place is not visible from this notebook - confirm in data_processing.
train_data_converted = convert_datetime_to_timestamp(train_data)
train_data_converted.loc[:, ["datetime"]].head()

In [None]:
# Number of distinct values per column; low counts hint at categorical features.
unique_values = train_data.nunique()
pd.DataFrame({"Unique Values Count": unique_values})

Observations:
- Season, holiday, workingday and weather require conversion from numerical to categorical
- Datetime is unique for each instance

In [None]:
from data_processing import perform_categorical_conversion

# Apply the categorical conversion and list the resulting dtypes for review.
train_data_processed = perform_categorical_conversion(train_data_converted)
data_types_after_processing = train_data_processed.dtypes
pd.DataFrame({"Type": data_types_after_processing})

## Exploratory Data Analysis (EDA)

### Univariate Analysis of Numerical Features

#### Summary Statistics

In [None]:
from config import TARGET_VARIABLES

# Numerical predictors: the float64/int64 columns minus the target variables.
numeric_frame = train_data_processed.select_dtypes(include=['float64', 'int64'])
numerical = numeric_frame.drop(columns=TARGET_VARIABLES).columns

def calculate_numerical_statistics(columns, data):
    """Build a per-feature table of descriptive statistics.

    Parameters
    ----------
    columns : list-like of column labels
        Columns of ``data`` to summarise.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    pandas.DataFrame
        One row per feature. The columns are the transposed output of
        ``describe()`` followed by ``median``, ``variance``, ``range``
        (max - min), ``iqr`` (75th - 25th percentile), ``skewness`` and
        ``kurtosis``.
    """
    selected = data[columns]

    # describe() puts the statistics in rows; transpose to one row per feature.
    summary = selected.describe().T

    return summary.assign(
        median=selected.median(),
        variance=selected.var(),
        range=selected.max() - selected.min(),
        iqr=selected.quantile(0.75) - selected.quantile(0.25),
        skewness=selected.skew(),
        kurtosis=selected.kurtosis(),
    )


# Summarise the numerical predictors of the processed training set.
calculate_numerical_statistics(columns=numerical, data=train_data_processed)

Observations:
- Datetime scale is significantly larger compared to other features
- Datetime shows the highest variability, followed by humidity
- Features likely to have outliers (range > 2 * IQR) are temp, atemp, humidity and windspeed
- Windspeed has the largest range relative to its IQR, indicating significant outlier influence
- Datetime has the smallest range relative to its IQR, indicating minimal outlier influence
- All features have acceptable levels of skewness and kurtosis, with atemp being the most skewed

#### Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_subplots_layout(columns, num_cols=3):
    """Return the ``(rows, cols)`` grid that gives every column its own subplot.

    Parameters
    ----------
    columns : sized collection
        Features to plot; only its length is used.
    num_cols : int, default 3
        Number of subplots per row.

    Returns
    -------
    tuple of (int, int)
        ``(num_rows, num_cols)`` where ``num_rows`` is ``len(columns) / num_cols``
        rounded up (0 when ``columns`` is empty).

    Raises
    ------
    ValueError
        If ``num_cols`` is smaller than 1.
    """
    if num_cols < 1:
        raise ValueError("num_cols must be a positive integer")

    num_features = len(columns)
    # Integer ceiling division: a partially filled last row still counts.
    num_rows = (num_features + num_cols - 1) // num_cols

    return num_rows, num_cols

def rotate_xticklabels_if_long(ax, label_length_threshold=5, rotation_angle=45):
    """Rotate the x tick labels of ``ax`` when at least one of them is long.

    Parameters
    ----------
    ax : axes-like object
        Must provide ``get_xticklabels()`` and ``tick_params()``.
    label_length_threshold : int, default 5
        A label counts as long when its text has more characters than this.
    rotation_angle : int, default 45
        Rotation passed to ``tick_params`` as ``labelrotation``.
    """
    for tick_label in ax.get_xticklabels():
        if len(tick_label.get_text()) > label_length_threshold:
            # One long label is enough; rotate the whole axis and stop looking.
            ax.tick_params(axis='x', labelrotation=rotation_angle)
            return

def plot_numerical_distributions(columns, data):
    """Plot a histogram with a KDE overlay for each of the given features.

    The subplots are arranged on the grid returned by
    ``calculate_subplots_layout``; grid cells without a feature are hidden.

    Parameters
    ----------
    columns : sized iterable of column labels
        Features to plot, one subplot each.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``columns`` is empty.
    """
    if len(columns) == 0:
        # Nothing to plot; avoids asking matplotlib for a zero-row grid.
        return

    num_rows, num_cols = calculate_subplots_layout(columns)
    # squeeze=False always yields a 2-D array, so flatten() works for any grid.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 3*num_rows),
                             squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, columns):
        sns.histplot(data[feature], ax=ax, kde=True, edgecolor=None)
        ax.set_title(feature)
        ax.set_xlabel('')
        rotate_xticklabels_if_long(ax)

    # Hide the trailing grid cells that did not receive a feature.
    for ax in axes[len(columns):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()


# Distribution of each numerical predictor in the processed training set.
plot_numerical_distributions(columns=numerical, data=train_data_processed)

#### Outliers

In [None]:
def plot_numerical_outliers(columns, data):
    """Draw one box plot per feature to expose outliers.

    The subplots are arranged on the grid returned by
    ``calculate_subplots_layout``; grid cells without a feature are hidden.
    Outlier points are drawn as small black diamond markers.

    Parameters
    ----------
    columns : sized iterable of column labels
        Features to plot, one subplot each.
    data : pandas.DataFrame
        Frame holding the columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when
        ``columns`` is empty.
    """
    if len(columns) == 0:
        # Nothing to plot; avoids asking matplotlib for a zero-row grid.
        return

    flierprops = dict(marker='d', markerfacecolor='black', markersize=5)

    num_rows, num_cols = calculate_subplots_layout(columns)
    # squeeze=False always yields a 2-D array, so flatten() works for any grid.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 3*num_rows),
                             squeeze=False)
    axes = axes.flatten()

    for ax, feature in zip(axes, columns):
        sns.boxplot(data=data, y=feature, ax=ax, flierprops=flierprops)
        ax.set_title(feature)
        ax.set_ylabel('')

    # Hide the trailing grid cells that did not receive a feature.
    for ax in axes[len(columns):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Box plots of each numerical predictor in the processed training set.
plot_numerical_outliers(columns=numerical, data=train_data_processed)

### Univariate Analysis of Categorical Features

#### Summary Statistics

In [None]:
# Columns stored with the pandas ``category`` dtype.
categorical_frame = train_data_processed.select_dtypes(include=['category'])
categorical = categorical_frame.columns

train_data_processed[categorical].describe()

#### Distributions

In [None]:
# Two count plots per row. The row count follows the number of categorical
# features, so the cell keeps working if that number changes instead of
# indexing past a fixed 2x2 grid.
plots_per_row = 2
row_count = max(1, -(-len(categorical) // plots_per_row))  # ceiling division
fig, axes = plt.subplots(nrows=row_count, ncols=plots_per_row,
                         figsize=(8, 3 * row_count), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, categorical):
    # Bars are ordered from the most to the least frequent category.
    sns.countplot(data=train_data_processed, x=col,
                  ax=ax, order=train_data_processed[col].value_counts().index)
    ax.set_title(col)
    ax.set_xlabel('')
    ax.set_ylabel('Count')

# Hide grid cells that did not receive a feature.
for ax in axes[len(categorical):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### Univariate Analysis of Target Variables

#### Summary Statistics

In [None]:
# Summarise the untransformed target variables.
calculate_numerical_statistics(columns=TARGET_VARIABLES, data=train_data_processed)

Observations:
- Range > 2 * IQR across all target variables indicating the strong influence of outliers
- Target data requires transformation to reduce skewness and kurtosis

#### Transform Data

In [None]:
import numpy as np

# Work on a copy so the untransformed frame stays available to other cells.
train_data_transformed = train_data_processed.copy()

# log1p(x) computes log(1 + x) in one vectorised call. It replaces the
# column-wise ``apply`` of ``np.log(x + 1)`` and is more accurate for values
# of x close to zero.
train_data_transformed[TARGET_VARIABLES] = np.log1p(
    train_data_transformed[TARGET_VARIABLES])
calculate_numerical_statistics(TARGET_VARIABLES, train_data_transformed)

#### Distributions

In [None]:
# Distribution of each target variable after the log transform.
plot_numerical_distributions(columns=TARGET_VARIABLES, data=train_data_transformed)

#### Outliers

In [None]:
# Box plots of each target variable after the log transform.
plot_numerical_outliers(columns=TARGET_VARIABLES, data=train_data_transformed)

### Bivariate Analysis of Numerical - Numerical Features

In [None]:
# Pairwise scatter plots of the numerical features, KDE curves on the diagonal.
pairplot = sns.pairplot(
    train_data_transformed[numerical],
    diag_kind='kde',
    plot_kws={'s': 5, 'alpha': 0.5},
)
pairplot.figure.set_size_inches(8, 8)

# Strip ticks and tick labels from every panel of the grid.
for panel in pairplot.axes.flat:
    panel.set_xticklabels([])
    panel.set_yticklabels([])
    panel.set_xticks([])
    panel.set_yticks([])

plt.show()

### Bivariate Analysis of Numerical - Categorical Features

In [None]:
def plot_numerical_to_categorical(numerical_features, categorical_features, data):
    """Draw a grid of box plots: one numerical feature per category level.

    Rows correspond to numerical features and columns to categorical features.
    Each cell shows the numerical feature (y) grouped by the categorical
    feature (x). Y ticks are removed from every cell.

    Parameters
    ----------
    numerical_features : sized iterable of column labels
        Features plotted on the y axis, one grid row each.
    categorical_features : sized iterable of column labels
        Features plotted on the x axis, one grid column each.
    data : pandas.DataFrame
        Frame holding all referenced columns.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``. Nothing is drawn when either
        feature collection is empty.
    """
    num_numerical = len(numerical_features)
    num_categorical = len(categorical_features)
    if num_numerical == 0 or num_categorical == 0:
        # Nothing to plot; avoids asking matplotlib for an empty grid.
        return

    flierprops = dict(marker='d', markerfacecolor='black', markersize=5)

    # squeeze=False keeps ``axes`` 2-D even for a single row or column, so
    # axes[i, j] is valid for every grid shape.
    fig, axes = plt.subplots(num_numerical, num_categorical, figsize=(
        8, 2 * num_numerical), squeeze=False)

    for i, num_feature in enumerate(numerical_features):
        for j, cat_feature in enumerate(categorical_features):
            ax = axes[i, j]
            sns.boxplot(x=cat_feature, y=num_feature, data=data, ax=ax, flierprops=flierprops)
            ax.set_title(f'{num_feature} vs {cat_feature}')
            ax.set_yticklabels([])
            ax.set_yticks([])

    plt.tight_layout()
    plt.show()


# Numerical predictors broken down by each categorical feature.
plot_numerical_to_categorical(
    numerical_features=numerical,
    categorical_features=categorical,
    data=train_data_transformed,
)

Observations:
- Large median and IQR variations indicate that season significantly influences datetime, temp and atemp
- Weather significantly influences humidity
- Holiday appears to be the least influential categorical feature in relation to numerical features
- Outlier presence suggests that season category 3 introduces the most variability in temp, atemp and windspeed
- Weather category 3 introduces the most variability in atemp, humidity and windspeed
- Holiday category 0 introduces variability in humidity and windspeed
- A tight IQR across all numerical features in weather category 4 indicates significantly low variability

### Bivariate Analysis of Categorical - Categorical Features

In [None]:
from itertools import combinations

# Every unordered pair of categorical features, in column order.
pairs = list(combinations(categorical, 2))

# Three heatmaps per row. The row count follows the number of pairs, so no
# pair is silently dropped when there are more than a fixed 2x3 grid can hold.
heatmaps_per_row = 3
row_count = max(1, -(-len(pairs) // heatmaps_per_row))  # ceiling division
fig, axes = plt.subplots(row_count, heatmaps_per_row,
                         figsize=(8, 3 * row_count), squeeze=False)
axes = axes.flatten()

for ax, (feature_i, feature_j) in zip(axes, pairs):
    # Co-occurrence counts of the two features' categories.
    crosstab = pd.crosstab(
        train_data_transformed[feature_i], train_data_transformed[feature_j])
    sns.heatmap(crosstab, cmap='PuBu', ax=ax,
                cbar=True, annot=False, fmt="d")
    ax.set_title(f'{feature_i} vs {feature_j}')

# Hide grid cells that did not receive a pair.
for ax in axes[len(pairs):]:
    ax.axis('off')

plt.tight_layout()
plt.show()

### Bivariate Analysis of Numerical Features - Target Variables

In [None]:
# Scatter every numerical feature (x) against every target variable (y).
pairplot = sns.pairplot(
    data=train_data_transformed,
    x_vars=numerical,
    y_vars=TARGET_VARIABLES,
    kind='scatter',
    plot_kws={'s': 5, 'alpha': 0.5},
)
pairplot.figure.set_size_inches(8, 6)

# Only the x ticks and their labels are removed; the y axes keep theirs.
for panel in pairplot.axes.flat:
    panel.set_xticklabels([])
    panel.set_xticks([])

plt.show()