# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import jaccard_score
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr
def load_and_examine_data(file_path):
    """Load a CSV file and print an exploratory summary of it.

    The summary covers the shape, column names, first five rows, the
    ``DataFrame.info()`` report, per-column missing-value counts and the
    descriptive statistics.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pandas.read_csv``.

    Returns:
        The loaded ``pandas.DataFrame``.
    """
    df = pd.read_csv(file_path)

    print("Dataset Shape:", df.shape)
    print("\nColumn Names:")
    print(df.columns.tolist())
    print("\nFirst 5 rows:")
    print(df.head())
    print("\nDataset Info:")
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would append a stray "None" line to the output.
    df.info()
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nDescriptive Statistics:")
    print(df.describe())

    return df

def select_relevant_features(df):
    """Return the candidate feature names that are present in ``df``.

    A fixed list of customer attributes is checked against ``df.columns``;
    names missing from the frame are silently skipped. The surviving names
    are printed as a bullet list and returned in candidate-list order.

    Args:
        df: DataFrame whose columns are inspected.

    Returns:
        List of column names found in ``df``.
    """
    candidate_columns = (
        'CustomerKey',
        'Age',
        'Gender',
        'MaritalStatus',
        'YearlyIncome',
        'Education',
        'Occupation',
        'CommuteDistance',
        'Region',
        'NumberCarsOwned',
        'TotalChildren',
        'BikeBuyer',
    )

    available_features = []
    for name in candidate_columns:
        if name in df.columns:
            available_features.append(name)

    print("Selected Features:")
    for name in available_features:
        print(f"- {name}")

    return available_features

def create_selected_dataframe(df, selected_features):
    """Build an independent DataFrame holding only the selected columns.

    Args:
        df: Source DataFrame.
        selected_features: Column names to keep.

    Returns:
        A copy of ``df`` restricted to ``selected_features``; modifying it
        does not affect ``df``.
    """
    feature_count = len(selected_features)
    subset = df[selected_features].copy()

    print(f"\nNew DataFrame created with {feature_count} features")
    print(f"Shape: {subset.shape}")

    return subset

def determine_data_types(df_selected):
    """Classify each column of ``df_selected`` by a dtype/cardinality heuristic.

    Rules, applied in order:
      * ``object`` dtype: nominal (flagged as binary when exactly two
        distinct values are present);
      * ``int64``/``int32`` with fewer than 20 distinct values:
        ordinal/nominal;
      * any ``int64``/``int32``/``float64``/``float32``: continuous when more
        than 50 distinct values are present, otherwise ordinal;
      * anything else: marked as needing investigation.

    The findings are printed and returned.

    Args:
        df_selected: DataFrame to analyse.

    Returns:
        Dict mapping column name to a dict with keys ``dtype`` (string),
        ``unique_values`` (distinct non-null count), ``category`` (label
        described above) and ``sample_values`` (up to ten non-null values).
    """
    integer_dtypes = ('int64', 'int32')
    numeric_dtypes = integer_dtypes + ('float64', 'float32')

    def classify(col_dtype, n_unique):
        # Guard clauses mirror the rule order documented above.
        if col_dtype in ('object',):
            if n_unique == 2:
                return "Discrete - Nominal (Binary)"
            return "Discrete - Nominal"
        if col_dtype in integer_dtypes and n_unique < 20:
            return "Discrete - Ordinal/Nominal"
        if col_dtype in numeric_dtypes:
            if n_unique > 50:
                return "Continuous - Ratio/Interval"
            return "Discrete - Ordinal"
        return "Unknown - Needs Investigation"

    data_types_analysis = {}
    for name in df_selected.columns:
        series = df_selected[name]
        distinct = series.nunique()
        data_types_analysis[name] = {
            'dtype': str(series.dtype),
            'unique_values': distinct,
            'category': classify(series.dtype, distinct),
            'sample_values': series.dropna().head(10).tolist(),
        }

    print("\nData Type Analysis:")
    print("-" * 80)
    for name in data_types_analysis:
        details = data_types_analysis[name]
        print(f"{name}:")
        print(f"  - Data Type: {details['dtype']}")
        print(f"  - Unique Values: {details['unique_values']}")
        print(f"  - Category: {details['category']}")
        print(f"  - Sample Values: {details['sample_values']}")
        print()

    return data_types_analysis

def handle_null_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Text-like columns (``object`` or pandas string dtype) are filled with
    their most frequent value, or with ``'Unknown'`` when the column has no
    non-null value at all. Every other column is filled with its median.
    A short report is printed for each column that contained nulls.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        A new DataFrame with the nulls filled.
    """
    print("Handling Null Values:")
    print("-" * 40)

    df_processed = df.copy()

    for column in df_processed.columns:
        series = df_processed[column]
        null_count = series.isnull().sum()
        if null_count == 0:
            continue

        print(f"{column}: {null_count} null values")

        is_text_like = (
            pd.api.types.is_object_dtype(series.dtype)
            or pd.api.types.is_string_dtype(series.dtype)
        )
        if is_text_like:
            # Categorical: fill with mode (computed once; empty when the
            # column is entirely null).
            modes = series.mode()
            mode_value = modes.iloc[0] if not modes.empty else 'Unknown'
            # Assign the result back instead of calling fillna(inplace=True)
            # on the column selection: that chained in-place form may act on
            # a temporary object and leave the frame unchanged.
            df_processed[column] = series.fillna(mode_value)
            print(f"  -> Filled with mode: {mode_value}")
        else:
            # Numerical: fill with median
            median_value = series.median()
            df_processed[column] = series.fillna(median_value)
            print(f"  -> Filled with median: {median_value}")

    print(f"\nAfter handling nulls - Remaining null values: {df_processed.isnull().sum().sum()}")
    return df_processed

def normalize_data(df, numerical_columns):
    """Produce min-max scaled and z-score standardized copies of ``df``.

    Only ``numerical_columns`` are transformed; every other column is
    carried over unchanged. ``df`` itself is not modified.

    Args:
        df: Source DataFrame.
        numerical_columns: Names of the columns to scale.

    Returns:
        Tuple ``(df_normalized, df_standardized, scaler_minmax,
        scaler_standard)``: the two transformed frames followed by the
        fitted ``MinMaxScaler`` and ``StandardScaler``.
    """
    print("\nNormalizing Numerical Data:")
    print("-" * 40)

    numeric_block = df[numerical_columns]

    # Min-max scaling of the numeric block
    scaler_minmax = MinMaxScaler()
    df_normalized = df.copy()
    df_normalized[numerical_columns] = scaler_minmax.fit_transform(numeric_block)

    # Z-score standardization of the same block, stored in a separate copy
    scaler_standard = StandardScaler()
    df_standardized = df.copy()
    df_standardized[numerical_columns] = scaler_standard.fit_transform(numeric_block)

    print("Applied Min-Max Normalization and Z-score Standardization")

    return df_normalized, df_standardized, scaler_minmax, scaler_standard

def discretize_continuous_attributes(df, continuous_columns, n_bins=5):
    """
    (c) Discretization (Binning) on continuous attributes

    For every listed column present in ``df`` two new columns are added:
    ``<column>_binned`` (equal-width bins via ``pd.cut``) and
    ``<column>_quantile`` (equal-frequency bins via ``pd.qcut``).

    Heavily repeated values can make several quantile edges coincide. Those
    duplicate edges are dropped, so ``<column>_quantile`` may end up with
    fewer than ``n_bins`` categories; labels are numbered ``_q_1`` up to the
    number of bins actually produced.

    Args:
        df: Source DataFrame; it is not modified.
        continuous_columns: Names of the columns to discretize. Names that
            are not in ``df`` are skipped.
        n_bins: Requested number of bins for both binning schemes.

    Returns:
        A copy of ``df`` with the additional binned columns.
    """
    print(f"\nDiscretizing Continuous Attributes into {n_bins} bins:")
    print("-" * 50)

    df_discretized = df.copy()

    for column in continuous_columns:
        if column not in df_discretized.columns:
            continue

        values = df_discretized[column]

        # Equal-width binning
        df_discretized[f'{column}_binned'] = pd.cut(
            values,
            bins=n_bins,
            labels=[f'{column}_bin_{i+1}' for i in range(n_bins)]
        )

        # Equal-frequency binning. First find out how many bins survive after
        # duplicate edges are dropped: passing n_bins labels unconditionally
        # makes qcut raise ValueError whenever that number is smaller.
        _, edges = pd.qcut(
            values,
            q=n_bins,
            labels=False,
            retbins=True,
            duplicates='drop'
        )
        quantile_bins = len(edges) - 1

        if quantile_bins >= 1:
            df_discretized[f'{column}_quantile'] = pd.qcut(
                values,
                q=n_bins,
                labels=[f'{column}_q_{i+1}' for i in range(quantile_bins)],
                duplicates='drop'
            )
        else:
            # All edges collapsed into one (constant column): there is no
            # interval to assign values to.
            df_discretized[f'{column}_quantile'] = np.nan

        print(f"{column} -> {column}_binned, {column}_quantile")

    return df_discretized

def binarize_categorical_attributes(df, categorical_columns):
    """
    (e) Binarization (One Hot Encoding) for categorical attributes

    Each listed column found in the frame is replaced by indicator columns
    named ``<column>_<value>``, appended after the remaining columns.
    Names not present in the frame are skipped; ``df`` is not modified.

    Args:
        df: Source DataFrame.
        categorical_columns: Names of the columns to encode.

    Returns:
        A new DataFrame with the listed columns one-hot encoded.
    """
    print("\nApplying One-Hot Encoding:")
    print("-" * 30)

    encoded = df.copy()

    for name in categorical_columns:
        if name not in encoded.columns:
            continue

        # Build the indicator columns, then swap them in for the original.
        indicators = pd.get_dummies(encoded[name], prefix=name)
        remaining = encoded.drop(columns=[name])
        encoded = pd.concat([remaining, indicators], axis=1)
        print(f"{name} -> {len(indicators.columns)} binary columns")

    return encoded

# =============================================================================
# PART III: PROXIMITY/CORRELATION ANALYSIS
# =============================================================================

def simple_matching_coefficient(x, y):
    """Return the fraction of positions at which ``x`` and ``y`` are equal.

    Inputs are converted to NumPy arrays first, so plain lists and tuples
    are compared element by element rather than as whole objects.

    Args:
        x: First sequence of values.
        y: Second sequence of values, same length as ``x``.

    Returns:
        Number of matching positions divided by the number of positions, or
        ``0.0`` for empty input (the ratio is undefined there; 0 matches the
        convention used by ``jaccard_similarity_manual``).
    """
    x = np.asarray(x)
    y = np.asarray(y)

    total = x.size
    if total == 0:
        return 0.0

    matches = np.sum(x == y)
    return matches / total

def jaccard_similarity_manual(x, y):
    """Return the Jaccard similarity of two binary vectors.

    Only positions equal to 1 count as "present". Inputs are converted to
    NumPy arrays first, so plain lists and tuples are handled element by
    element.

    Args:
        x: First binary sequence.
        y: Second binary sequence, same length as ``x``.

    Returns:
        Count of positions where both are 1 divided by the count of
        positions where at least one is 1; ``0`` when neither vector
        contains a 1.
    """
    x_present = np.asarray(x) == 1
    y_present = np.asarray(y) == 1

    intersection = np.sum(x_present & y_present)
    union = np.sum(x_present | y_present)
    return intersection / union if union != 0 else 0

def calculate_proximity_measures(df, obj1_idx=0, obj2_idx=1):
    """Print and return three proximity measures between two rows of ``df``.

    The simple matching coefficient and the Jaccard similarity are computed
    on a binarized view of the rows (a value becomes 1 when it is greater
    than 0, else 0); the cosine similarity uses the raw row values.

    Args:
        df: DataFrame whose rows are the objects to compare.
        obj1_idx: Positional index of the first row.
        obj2_idx: Positional index of the second row.

    Returns:
        Tuple ``(smc, jaccard, cosine)``.
    """
    print(f"\nCalculating Proximity Measures between objects {obj1_idx} and {obj2_idx}:")
    print("-" * 60)

    # Raw row vectors
    first_row = df.iloc[obj1_idx].values
    second_row = df.iloc[obj2_idx].values

    # Presence/absence view used by the binary measures
    first_flags = (first_row > 0).astype(int)
    second_flags = (second_row > 0).astype(int)

    smc = simple_matching_coefficient(first_flags, second_flags)
    print(f"Simple Matching Coefficient: {smc:.4f}")

    jaccard = jaccard_similarity_manual(first_flags, second_flags)
    print(f"Jaccard Similarity: {jaccard:.4f}")

    # cosine_similarity works on 2-D inputs, so each row becomes a 1 x n matrix
    similarity_matrix = cosine_similarity(
        first_row.reshape(1, -1),
        second_row.reshape(1, -1),
    )
    cosine = similarity_matrix[0][0]
    print(f"Cosine Similarity: {cosine:.4f}")

    return smc, jaccard, cosine

def _as_numeric_series(series):
    """Return ``series`` as numbers so the correlation routines accept it.

    Numeric columns are returned unchanged. For other columns the first
    number found in each value's text is used (for example ``'5-10 Miles'``
    becomes ``5``); when some value contains no number, alphabetical
    category codes are used instead.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series

    text = series.astype(str)
    leading = pd.to_numeric(
        text.str.extract(r'(\d+(?:\.\d+)?)', expand=False),
        errors='coerce',
    )
    if leading.notna().all():
        return leading

    codes = pd.Categorical(text).codes
    return pd.Series(codes, index=series.index, dtype=float)


def calculate_correlation(df, feature1='CommuteDistance', feature2='YearlyIncome'):
    """Print, plot and return the correlation between two columns of ``df``.

    Pearson (with p-value) and Spearman coefficients are computed, and a
    scatter plot with a fitted straight trend line is shown.

    Non-numeric columns are converted to numbers first (see
    ``_as_numeric_series``), because ``pearsonr`` and ``np.polyfit`` cannot
    work on text.
    NOTE(review): the default ``CommuteDistance`` is presumably a text range
    label such as ``'5-10 Miles'`` in this dataset; confirm that using the
    lower bound as an ordinal proxy is acceptable for the analysis.

    Args:
        df: DataFrame containing both columns, without missing values in them.
        feature1: Name of the column plotted on the x axis.
        feature2: Name of the column plotted on the y axis.

    Returns:
        Tuple ``(pearson_correlation, p_value, spearman_correlation)``, or
        ``(None, None, None)`` when either column is missing from ``df``.
    """
    print(f"\nCalculating Correlation between {feature1} and {feature2}:")
    print("-" * 50)

    if feature1 not in df.columns or feature2 not in df.columns:
        print("One or both features not found in dataset")
        return None, None, None

    x_values = _as_numeric_series(df[feature1])
    y_values = _as_numeric_series(df[feature2])

    # Pearson correlation
    correlation, p_value = pearsonr(x_values, y_values)
    print(f"Pearson Correlation: {correlation:.4f}")
    print(f"P-value: {p_value:.4f}")

    # Spearman correlation (rank-based)
    spearman_corr = x_values.corr(y_values, method='spearman')
    print(f"Spearman Correlation: {spearman_corr:.4f}")

    # Create correlation plot
    plt.figure(figsize=(10, 6))
    plt.scatter(x_values, y_values, alpha=0.6)
    plt.xlabel(feature1)
    plt.ylabel(feature2)
    plt.title(f'Correlation between {feature1} and {feature2}')
    plt.grid(True, alpha=0.3)

    # Add trend line (least-squares fit of degree 1)
    z = np.polyfit(x_values, y_values, 1)
    p = np.poly1d(z)
    plt.plot(x_values, p(x_values), "r--", alpha=0.8)

    plt.show()

    return correlation, p_value, spearman_corr


def main_pipeline(file_path):
    """Run the complete pipeline on the CSV file at ``file_path``.

    Steps:
      1. Load the data, select the relevant features and classify them.
      2. Impute nulls, scale the numeric columns, discretize the continuous
         ones and one-hot encode the categorical ones.
      3. Compute proximity measures between the first two rows and the
         correlation between the default pair of features.

    Args:
        file_path: Location of the CSV file to process.

    Returns:
        Dict with the intermediate and final results under the keys
        ``original_data``, ``selected_data``, ``processed_data``,
        ``normalized_data``, ``standardized_data``, ``discretized_data``,
        ``final_data``, ``data_types``, ``proximity`` (tuple
        ``(smc, jaccard, cosine)`` or ``None`` when it could not be computed)
        and ``correlation`` (tuple returned by ``calculate_correlation``).
    """
    print("=" * 80)
    print("ADVENTURE WORKS CYCLES - DATA MINING PIPELINE")
    print("=" * 80)

    # PART I: Feature Selection and Analysis
    print("\n" + "="*50)
    print("PART I: FEATURE SELECTION AND ANALYSIS")
    print("="*50)

    # Load and examine data
    df = load_and_examine_data(file_path)

    # Select relevant features
    selected_features = select_relevant_features(df)

    # Create new dataframe with selected features
    df_selected = create_selected_dataframe(df, selected_features)

    # Determine data types
    data_types = determine_data_types(df_selected)

    # PART II: Data Preprocessing and Transformation
    print("\n" + "="*50)
    print("PART II: DATA PREPROCESSING AND TRANSFORMATION")
    print("="*50)

    # Handle null values
    df_no_nulls = handle_null_values(df_selected)

    all_numeric_cols = df_no_nulls.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = df_no_nulls.select_dtypes(include=['object']).columns.tolist()

    # Identifier-like numeric columns are labels, not measurements, so they
    # are kept out of scaling and out of the proximity computation below.
    identifier_cols = [
        col for col in all_numeric_cols
        if 'key' in col.lower() or 'id' in col.lower()
    ]
    numerical_cols = [col for col in all_numeric_cols if col not in identifier_cols]

    print(f"\nNumerical columns: {numerical_cols}")
    print(f"Categorical columns: {categorical_cols}")

    # Normalization / standardization
    if numerical_cols:
        df_normalized, df_standardized, _, _ = normalize_data(df_no_nulls, numerical_cols)
    else:
        df_normalized = df_no_nulls.copy()
        df_standardized = df_no_nulls.copy()

    # Discretization of columns with more than 10 distinct values
    continuous_cols = [col for col in numerical_cols if df_no_nulls[col].nunique() > 10]
    if continuous_cols:
        df_discretized = discretize_continuous_attributes(df_normalized, continuous_cols)
    else:
        df_discretized = df_normalized.copy()

    # One-hot encoding. This if/else sits at the same level as the blocks
    # above; it must not be nested under the preceding else branch.
    if categorical_cols:
        df_final = binarize_categorical_attributes(df_standardized, categorical_cols)
    else:
        df_final = df_standardized.copy()

    # PART III: Proximity/Correlation Analysis
    print("\n" + "="*50)
    print("PART III: PROXIMITY/CORRELATION ANALYSIS")
    print("="*50)

    proximity = None
    if len(df_final) >= 2:
        # An unscaled identifier would dominate the cosine similarity and
        # carries no meaning as a feature, so compare the rows without it.
        proximity_input = df_final.drop(columns=identifier_cols)
        if proximity_input.shape[1] > 0:
            proximity = calculate_proximity_measures(proximity_input, 0, 1)

    correlation_result = calculate_correlation(df_no_nulls)

    print("\n" + "="*50)
    print("PIPELINE COMPLETED SUCCESSFULLY!")
    print("="*50)

    return {
        'original_data': df,
        'selected_data': df_selected,
        'processed_data': df_no_nulls,
        'normalized_data': df_normalized,
        'standardized_data': df_standardized,
        'discretized_data': df_discretized,
        'final_data': df_final,
        'data_types': data_types,
        'proximity': proximity,
        'correlation': correlation_result,
    }

