# Pre-processing

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import PowerTransformer

def plot_transformations(data, feature, original, transformed):
    """Show before/after histograms for one feature and print shape statistics.

    Two histograms with KDE overlays are drawn side by side (original values
    on the left, transformed values on the right). Afterwards the skewness
    and kurtosis of each series, as computed by ``scipy.stats`` with its
    default settings, are printed with three decimals.

    Args:
        data: Accepted for call compatibility; this function does not read it.
        feature: Feature name used in the plot titles and printed headings.
        original: Values of the feature before the transformation.
        transformed: Values of the feature after the transformation.

    Note:
        The right-hand title is hard-coded to "Yeo-Johnson Transformed",
        regardless of how ``transformed`` was actually produced.
    """
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    panels = (
        (original, f'Original {feature}'),
        (transformed, f'Yeo-Johnson Transformed {feature}'),
    )
    for axis, (values, title) in zip(axes, panels):
        sns.histplot(values, kde=True, ax=axis)
        axis.set_title(title)
        # Blank x-label: the title already names the feature
        axis.set_xlabel('')

    plt.tight_layout()
    plt.show()

    # Same two statistics for each series; only the heading differs
    reports = (
        (f"Original {feature}:", original),
        (f"\nTransformed {feature}:", transformed),
    )
    for heading, values in reports:
        print(heading)
        print(f"Skewness: {stats.skew(values):.3f}")
        print(f"Kurtosis: {stats.kurtosis(values):.3f}")

def handle_outliers_by_class(data, feature, class_column, lower_percentile=1, upper_percentile=99):
    """Cap a feature at per-class percentile limits.

    For every distinct, non-missing value in ``class_column`` the lower and
    upper percentiles of ``feature`` are computed from that class's rows only,
    and the class's values are clipped to that range. The result is stored in
    a new column named ``f'{feature}_capped'``; the original ``feature``
    column is left untouched.

    Args:
        data: DataFrame holding ``feature`` and ``class_column``. It is
            modified in place (the capped column is added or overwritten).
        feature: Name of the numeric column to cap.
        class_column: Name of the column whose values define the groups.
        lower_percentile: Lower cut-off, in percent (0-100). Defaults to 1.
        upper_percentile: Upper cut-off, in percent (0-100). Defaults to 99.

    Returns:
        The same ``data`` object, with the ``f'{feature}_capped'`` column set.

    Notes:
        * Missing feature values are ignored when computing the limits and
          stay missing in the capped column.
        * Rows whose class label is missing, and classes that contain no
          valid feature value, are copied through uncapped.
    """
    capped_column = f'{feature}_capped'
    capped = data[feature].copy()

    # dropna(): a NaN label never compares equal to itself, so it would
    # select an empty group and break the percentile computation.
    for class_value in data[class_column].dropna().unique():
        class_mask = data[class_column] == class_value
        class_values = data.loc[class_mask, feature]

        if not class_values.notna().any():
            # Nothing to derive limits from; leave this class as it is.
            continue

        # nanpercentile: a single NaN must not turn both limits into NaN,
        # which would make clip() a silent no-op for the whole class.
        lower_limit = np.nanpercentile(class_values, lower_percentile)
        upper_limit = np.nanpercentile(class_values, upper_percentile)

        # Build the capped values on a Series and merge with mask() rather
        # than writing through .loc, so an integer feature can be upcast to
        # float limits without a dtype-incompatible in-place assignment.
        clipped = data[feature].clip(lower_limit, upper_limit)
        capped = capped.mask(class_mask, clipped)

    data[capped_column] = capped
    return data

def plot_class_specific_handling(data, original_feature, capped_feature, class_column):
    """Draw per-class box plots of a feature before and after capping.

    The left panel shows ``original_feature`` grouped by ``class_column``;
    the right panel shows ``capped_feature`` grouped the same way.

    Args:
        data: DataFrame containing all three referenced columns.
        original_feature: Column name plotted in the "Before" panel.
        capped_feature: Column name plotted in the "After" panel.
        class_column: Column name used for the x-axis grouping.
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    panels = (
        (original_feature, f'Before: {original_feature}'),
        (capped_feature, f'After: {capped_feature}'),
    )
    for axis, (column, title) in zip(axes, panels):
        sns.boxplot(x=class_column, y=column, data=data, ax=axis)
        axis.set_title(title)

    plt.tight_layout()
    plt.show()

# Load the raw feature table from the working directory
df = pd.read_csv('acoustic_features.csv')

# Features with high skewness and kurtosis (from previous analysis)
# NOTE(review): that analysis is not part of this cell — confirm the list is
# still current if the input file changes.
skewed_features = [
    '_Fluctuation_Mean',
    '_Roughness_Mean',
    '_AttackTime_Mean',
    '_Pulseclarity_Mean',
    '_Spectralskewness_Mean',
    '_Spectralkurtosis_Mean',
    '_Spectralflatness_Mean',
    '_Chromagram_Mean_2',
    '_Chromagram_Mean_4',
    '_Chromagram_Mean_6',
    '_Chromagram_Mean_7',
    '_HarmonicChangeDetectionFunction_PeriodEntropy'
]

# Initialize PowerTransformer with the Yeo-Johnson method (not Box-Cox).
# standardize=True also rescales each transformed feature to zero mean and
# unit variance.
pt = PowerTransformer(method='yeo-johnson', standardize=True)

# Apply the Yeo-Johnson transformation to the skewed features only; every
# other column is carried over unchanged from the copy of df.
df_transformed = df.copy()
df_transformed[skewed_features] = pt.fit_transform(df[skewed_features])

# Plot before and after transformation, and print skewness/kurtosis for each
for feature in skewed_features:
    plot_transformations(df, feature, df[feature], df_transformed[feature])

# For each transformed feature: cap outliers per 'Class', show the
# before/after box plots, then overwrite the feature with its capped values.
for feature in skewed_features:
    df_transformed = handle_outliers_by_class(df_transformed, feature, 'Class')
    plot_class_specific_handling(df_transformed, feature, f'{feature}_capped', 'Class')
    # pop() removes the temporary '<feature>_capped' column and returns its
    # values, so the replace and the drop happen in one step.
    df_transformed[feature] = df_transformed.pop(f'{feature}_capped')

# Write the processed table to disk without the index column
df_transformed.to_csv('fully_processed_data.csv', index=False)

print("Preprocessing completed. Fully processed data saved to 'fully_processed_data.csv'")

# Optional: Print summary statistics of the processed data
print("\nSummary statistics of processed data:")
print(df_transformed.describe())

# Optional: Check for any remaining high skewness or kurtosis.
# The 'Class' label column is excluded once and the result reused for both
# statistics. numeric_only=True keeps any other non-numeric column (if the
# CSV has one — not visible from here) from raising on pandas >= 2.0; for an
# all-numeric frame the output is unchanged.
feature_frame = df_transformed.drop('Class', axis=1)
skewness = feature_frame.skew(numeric_only=True)
kurtosis = feature_frame.kurtosis(numeric_only=True)

print("\nFeatures with remaining high skewness (>1 or <-1):")
print(skewness[skewness.abs() > 1])

# pandas reports excess (Fisher) kurtosis: a normal distribution scores 0.
print("\nFeatures with remaining high kurtosis (>7):")
print(kurtosis[kurtosis > 7])