# Data Analysis

## Data Loading

In [None]:
import sys
import os

# Make the project's ``src`` directory (resolved relative to the current
# working directory) importable. The membership check keeps re-running this
# cell from appending the same path to sys.path again and again.
src_path = os.path.abspath(os.path.join('..', 'src'))
if src_path not in sys.path:
    sys.path.append(src_path)

from data_processing import load_train_and_test_data

# Load both splits once; later cells read these two names.
train_data, test_data = load_train_and_test_data()

## Data Inspection

### Counting Instances

#### Training Set

In [None]:
# shape[0] is the number of rows (instances) in the training set.
instances_count = train_data.shape[0]
print(f"Instances: {instances_count}")

#### Test Set

In [None]:
# shape[0] is the number of rows (instances) in the test set.
# Note: this rebinds instances_count, which the training-set cell also assigns.
instances_count = test_data.shape[0]
print(f"Instances: {instances_count}")

### Sampling Dataset

#### Training Set

In [None]:
# Preview the first rows of the training set (head() defaults to five rows).
train_data.head()

#### Test Set

In [None]:
# Preview the first rows of the test set (head() defaults to five rows).
test_data.head()

### Checking Feature and Target Variable Data Types

In [None]:
import pandas as pd

# Per-column dtypes of the training set, wrapped in a one-column DataFrame
# ("Type") so the notebook renders it as a table.
data_types = train_data.dtypes
pd.DataFrame(data_types, columns=['Type'])

## Data Cleaning and Preprocessing

### Handling Missing Values

#### Training Set

In [None]:
# Count null entries per column in the training set and show them as a table.
# This cell only reports the counts; it does not modify train_data.
missing_values = train_data.isnull().sum()
pd.DataFrame(missing_values, columns=["Count"])

#### Test Set

In [None]:
# Count null entries per column in the test set and show them as a table.
# This cell only reports the counts; it does not modify test_data.
missing_values = test_data.isnull().sum()
pd.DataFrame(missing_values, columns=["Count"])

### Handling Duplicate Rows

#### Training Set

In [None]:
# duplicated() flags rows that repeat an earlier row (the first occurrence is
# not flagged), so the sum is the number of redundant rows in the training set.
duplicates_count = train_data.duplicated().sum()
print(f"Duplicate rows: {duplicates_count}")

#### Test Set

In [None]:
# duplicated() flags rows that repeat an earlier row (the first occurrence is
# not flagged), so the sum is the number of redundant rows in the test set.
duplicates_count = test_data.duplicated().sum()
print(f"Duplicate rows: {duplicates_count}")

### Transforming Datetime Feature

In [None]:
from data_processing import transform_datetime

# Run the project's transform_datetime helper on the training set and preview
# the result. Only train_data is transformed in this cell.
# NOTE(review): the helper lives in data_processing and its output columns are
# not visible here — confirm what it adds or removes.
train_data_transformed = transform_datetime(train_data)
train_data_transformed.head()

### Converting Categorical Features

In [None]:
# Number of distinct values per column of the transformed training set, shown
# as a table.
unique_values = train_data_transformed.nunique()
pd.DataFrame(unique_values, columns=["Unique Values Count"])

In [None]:
from data_processing import perform_categorical_conversion

# Run the project's perform_categorical_conversion helper, then list the
# resulting dtypes so the conversion can be checked.
# NOTE(review): presumably the helper casts selected features to pandas
# 'category' dtype (a later cell selects on that dtype) — confirm in
# data_processing.
train_data_processed = perform_categorical_conversion(train_data_transformed)
data_types_after_processing = train_data_processed.dtypes
pd.DataFrame(data_types_after_processing, columns=["Type"])

## Exploratory Data Analysis (EDA)

### Univariate Analysis of Numerical Features

#### Summary Statistics

In [None]:
from config import TARGET_VARIABLES

# Names of the numerical feature columns: keep the float64/int64/int32 columns
# and leave out the target variables.
numerical = (
    train_data_processed
    .select_dtypes(include=['float64', 'int64', 'int32'])
    .drop(columns=TARGET_VARIABLES)
    .columns
)

def calculate_numerical_statistics(columns, data):
    """Build a per-column table of descriptive statistics.

    The table starts from ``describe()`` (transposed so each selected column
    becomes a row) and is extended with median, variance, range,
    inter-quartile range, skewness and kurtosis.

    Args:
        columns: Column labels of ``data`` to summarise.
        data: pandas DataFrame holding those columns.

    Returns:
        The transposed ``describe()`` output with the extra statistics
        appended as columns, in the order listed above.
    """
    subset = data[columns]
    stats = subset.describe().T

    # Compute every extra statistic from the same selection, then attach them
    # in a fixed order so the column layout stays stable.
    extras = {
        'median': subset.median(),
        'variance': subset.var(),
        'range': subset.max() - subset.min(),
        'iqr': subset.quantile(0.75) - subset.quantile(0.25),
        'skewness': subset.skew(),
        'kurtosis': subset.kurtosis(),
    }
    for stat_name, stat_values in extras.items():
        stats[stat_name] = stat_values

    return stats


# Display the extended statistics table for the numerical features.
calculate_numerical_statistics(numerical, train_data_processed)

#### Distributions

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def calculate_subplots_layout(columns):
    """Choose a subplot grid large enough for one plot per column.

    Fewer than five columns use a two-column grid; five or more use three.

    Args:
        columns: Sized collection of feature names.

    Returns:
        A ``(rows, cols)`` tuple. ``rows`` is 0 when ``columns`` is empty.
    """
    feature_count = len(columns)

    if feature_count < 5:
        grid_cols = 2
    else:
        grid_cols = 3

    # Ceiling division: a partially filled last row still needs its own row.
    grid_rows, leftover = divmod(feature_count, grid_cols)
    if leftover:
        grid_rows += 1

    return grid_rows, grid_cols

def rotate_xticklabels_if_long(ax, label_length_threshold=5, rotation_angle=45):
    """Rotate the x tick labels of ``ax`` when any of them is long.

    Args:
        ax: Axes-like object providing ``get_xticklabels()`` and
            ``tick_params()``.
        label_length_threshold: Labels strictly longer than this many
            characters trigger the rotation.
        rotation_angle: Rotation in degrees applied to the x tick labels.

    NOTE(review): matplotlib documents that tick label strings may not be
    populated until the figure has been drawn — confirm the labels are
    non-empty at the point this is called.
    """
    for tick_label in ax.get_xticklabels():
        if len(tick_label.get_text()) > label_length_threshold:
            # One long label is enough; rotate once and stop looking.
            ax.tick_params(axis='x', labelrotation=rotation_angle)
            return

def plot_numerical_distributions(columns, data):
    """Plot a histogram with a KDE overlay for each column in a subplot grid.

    Args:
        columns: Sequence of column labels of ``data`` to plot (for example a
            list or a pandas ``Index``).
        data: DataFrame-like object indexable by those labels.

    Returns:
        None. The figure is shown with ``plt.show()``. When ``columns`` is
        empty nothing is drawn.
    """
    # Use len() rather than truthiness: a pandas Index has no unambiguous
    # boolean value.
    feature_count = len(columns)
    if feature_count == 0:
        # The layout helper yields zero rows for no features, and a zero-row
        # grid cannot be created, so there is nothing to plot.
        return

    num_rows, num_cols = calculate_subplots_layout(columns)
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 3 * num_rows))
    axes = axes.flatten()

    for ax, feature in zip(axes, columns):
        sns.histplot(data[feature], ax=ax, kde=True, edgecolor=None)
        ax.set_title(feature)
        ax.set_xlabel('')
        rotate_xticklabels_if_long(ax)

    # Hide grid cells left over when the features do not fill the last row.
    # Slicing by the feature count avoids depending on a loop variable that
    # would be unbound if the loop body never ran.
    for unused_ax in axes[feature_count:]:
        unused_ax.axis('off')

    plt.tight_layout()
    plt.show()


# Draw the histogram/KDE grid for the numerical features.
plot_numerical_distributions(numerical, train_data_processed)

### Univariate Analysis of Categorical Features

#### Summary Statistics

In [None]:
# Names of the columns whose dtype is pandas 'category'.
categorical = train_data_processed.select_dtypes(
    include=['category']).columns

# Display the describe() summary for just those categorical columns.
train_data_processed[categorical].describe()

#### Distributions

In [None]:
# One count plot per categorical feature, laid out on the shared subplot grid.
num_rows, num_cols = calculate_subplots_layout(categorical)
fig, axes = plt.subplots(num_rows, num_cols, figsize=(8, 6))
axes = axes.flatten()

for i, col in enumerate(categorical):
    # Order the bars by descending frequency (value_counts() sorts that way).
    sns.countplot(data=train_data_processed, x=col,
                  ax=axes[i], order=train_data_processed[col].value_counts().index)
    axes[i].set_title(col)
    axes[i].set_xlabel('')
    axes[i].set_ylabel('Count')

# Hide grid cells with no feature so empty axes are not rendered, matching
# what plot_numerical_distributions does for its grid.
for unused_ax in axes[len(categorical):]:
    unused_ax.axis('off')

plt.tight_layout()
plt.show()

### Univariate Analysis of Target Variables

#### Summary Statistics

In [None]:
# Display the extended statistics table for the target variables, before any
# target transformation is applied.
calculate_numerical_statistics(TARGET_VARIABLES, train_data_processed)

#### Transform Data

In [None]:
from data_processing import transform_target_variable_data

# Apply the project's transform_target_variable_data helper, then recompute
# the target statistics on the result for comparison with the previous cell.
# NOTE(review): this rebinds train_data_transformed, which earlier held the
# output of transform_datetime; re-running the earlier cells that read that
# name after this one will see the new value.
train_data_transformed = transform_target_variable_data(train_data_processed)
calculate_numerical_statistics(TARGET_VARIABLES, train_data_transformed)

#### Distributions

In [None]:
# Draw the histogram/KDE grid for the transformed target variables.
plot_numerical_distributions(TARGET_VARIABLES, train_data_transformed)

### Bivariate Analysis of Numerical - Numerical Features

#### Pearson Correlation Coefficient

In [None]:
# Pairwise correlation between the numerical features; DataFrame.corr() uses
# the Pearson method by default.
corr_matrix = train_data_transformed[numerical].corr()
# corr() already returns a DataFrame; this wrap only provides a second name.
corr_df = pd.DataFrame(corr_matrix)

corr_df

### Bivariate Analysis of Numerical - Categorical Features

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA of every numerical feature against every categorical feature:
# does the mean of the numerical feature differ between the category levels?
results = []

for num_feature in numerical:
    for cat_feature in categorical:
        # A single groupby pass replaces one boolean-mask scan per level.
        # observed=True skips category levels that never occur, and groupby
        # drops rows whose category is missing, so no empty groups are built.
        grouped = train_data_transformed.groupby(
            cat_feature, observed=True)[num_feature]

        # Missing numerical values would turn the whole test result into NaN,
        # so drop them per group and discard groups left without data.
        groups = [values.dropna() for _, values in grouped]
        groups = [values for values in groups if len(values) > 0]

        # ANOVA compares groups; with fewer than two there is nothing to test.
        if len(groups) < 2:
            continue

        f_stat, p_value = f_oneway(*groups)

        results.append({
            'Numerical Feature': num_feature,
            'Categorical Feature': cat_feature,
            'F-statistic': f_stat,
            'P-value': p_value
        })

# Naming the columns keeps the filtering below valid even when no pair
# produced a result.
results_df = pd.DataFrame(
    results,
    columns=['Numerical Feature', 'Categorical Feature',
             'F-statistic', 'P-value'])

# Keep only the pairs significant at the 5% level, strongest effect first.
# NOTE: no multiple-comparison correction is applied to the p-values.
alpha = 0.05
filtered_results_df = results_df[results_df['P-value'] < alpha]
ordered_results_df = filtered_results_df.sort_values(by='F-statistic', ascending=False)

ordered_results_df

### Bivariate Analysis of Categorical - Categorical Features

### Bivariate Analysis of Numerical Features - Target Variables