In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

# Configure pandas options for better display
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Function to load dataset
def load_data(file_path: str) -> pd.DataFrame:
    """
    Load dataset from a given file path.

    Args:
        file_path (str): The path to the CSV file.

    Returns:
        pd.DataFrame: The loaded data as a pandas DataFrame.
    """
    return pd.read_csv(file_path)

In [3]:
# Load dataset
df = load_data('../data/raw/Customer_Data.csv')

# Standardize column names to lowercase
df.columns = map(str.lower, df.columns)

# Drop customer ID as it's not needed for analysis
if 'cust_id' in df.columns:
    df.drop('cust_id', axis=1, inplace=True)

In [None]:
# Display basic information about the dataset
print("Dataframe shape:", df.shape)
print("First 5 rows of the dataset:\n", df.head())
print("Dataset description:\n", df.describe())
print("Number of duplicate rows:", df.duplicated().sum())
print("Random sample of data (transposed):\n", df.sample(5).T)

In [None]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values[missing_values > 0])

In [None]:
# Display random samples from key columns with missing data
print("Sample of minimum_payments:\n", df['minimum_payments'].sample(10))
print("Sample of credit_limit:\n", df['credit_limit'].sample(10))

In [None]:
# Impute missing values using the median for columns with missing data
columns_to_impute = ['minimum_payments', 'credit_limit']
imputer = SimpleImputer(strategy='median')
df[columns_to_impute] = imputer.fit_transform(df[columns_to_impute])
print("Missing values after imputation:\n", df[columns_to_impute].isnull().sum())


In [None]:
# Function to detect outliers using IQR method
def detect_outliers_iqr(df: pd.DataFrame, sample_limit: int = 3) -> pd.DataFrame:
    """
    Detects outliers in a DataFrame using the Interquartile Range (IQR) method.

    Args:
        df (pd.DataFrame): The input DataFrame.
        sample_limit (int, optional): Maximum number of outliers to sample per column. Defaults to 3.

    Returns:
        pd.DataFrame: Summary DataFrame of detected outliers for each numerical column.
    """
    outliers_summary = []
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        Q1, Q3 = df[column].quantile(0.25), df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        outliers = df[(df[column] < lower_bound) | (df[column] > upper_bound)]
        outlier_count = len(outliers)
        outlier_percentage = (outlier_count / len(df)) * 100
        # Collect statistics
        summary = {
            "Column": column,
            "Total Observations": len(df),
            "Number of Outliers": outlier_count,
            "Outlier Percentage (%)": round(outlier_percentage, 2),
            "Mean": round(df[column].mean(), 2),
            "Median": round(df[column].median(), 2),
            "Standard Deviation": round(df[column].std(), 2),
            "Skewness": round(df[column].skew(), 2),
            "Kurtosis": round(df[column].kurt(), 2),
            "Sample Outliers": outliers.head(sample_limit).to_dict(orient='records') if outlier_count > 0 else "None"
        }
        outliers_summary.append(summary)
    return pd.DataFrame(outliers_summary)

# Detect outliers in the dataset
outliers_summary_df = detect_outliers_iqr(df, sample_limit=3)

# Display outlier summary
print("Outlier Summary:")
print(outliers_summary_df)

In [None]:
# Function to treat outliers using capping method
def treat_outliers(df: pd.DataFrame, outliers_summary_df: pd.DataFrame, method: str = 'cap') -> pd.DataFrame:
    """
    Treat outliers by capping or removing them.

    Args:
        df (pd.DataFrame): The input DataFrame.
        outliers_summary_df (pd.DataFrame): Outlier summary DataFrame.
        method (str, optional): Method to treat outliers ('cap' or 'remove'). Defaults to 'cap'.

    Returns:
        pd.DataFrame: DataFrame with treated outliers.
    """
    df_cleaned = df.copy()
    for index, row in outliers_summary_df.iterrows():
        column = row['Column']
        Q1, Q3 = df[column].quantile(0.25), df[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        if method == 'cap':
            # Cap outliers
            df_cleaned[column] = df_cleaned[column].clip(lower_bound, upper_bound)
        elif method == 'remove':
            # Remove rows with outliers
            outliers = df_cleaned[(df_cleaned[column] < lower_bound) | (df_cleaned[column] > upper_bound)]
            df_cleaned.drop(outliers.index, inplace=True)
    return df_cleaned

# Treat outliers using the capping method
df_cleaned = treat_outliers(df, outliers_summary_df, method='cap')

# Display cleaned data
print("Cleaned Data (First 5 rows):\n", df_cleaned.head())

In [None]:
# Function to plot box plots before and after outlier treatment
def plot_boxplots_before_after(original_data: pd.DataFrame, cleaned_data: pd.DataFrame):
    """
    Plot box plots to compare original and cleaned datasets before and after outlier removal.
    
    Args:
        original_data (pd.DataFrame): The original DataFrame.
        cleaned_data (pd.DataFrame): The cleaned DataFrame after outlier removal.
    """
    # Get all numerical columns
    numerical_columns = original_data.select_dtypes(include=['float64', 'int64']).columns.tolist()
    
    plt.figure(figsize=(15, 10))

    # Original data box plots
    plt.subplot(1, 2, 1)
    sns.boxplot(data=original_data[numerical_columns])
    plt.title('Box Plots Before Outlier Removal')
    plt.xticks(rotation=45)

    # Cleaned data box plots
    plt.subplot(1, 2, 2)
    sns.boxplot(data=cleaned_data[numerical_columns])
    plt.title('Box Plots After Outlier Removal')
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

# Plot box plots for all features
plot_boxplots_before_after(df, df_cleaned)

In [None]:
# Function to detect skewness and kurtosis
def detect_skewness_kurtosis(df: pd.DataFrame) -> pd.DataFrame:
    """
    Detect skewness and kurtosis for numerical columns in a DataFrame.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A DataFrame summarizing skewness and kurtosis for each numerical column.
    """
    summary = []
    for column in df.select_dtypes(include=['float64', 'int64']).columns:
        summary.append({
            "Column": column,
            "Skewness": round(df[column].skew(), 2),
            "Kurtosis": round(df[column].kurt(), 2),
            "Mean": round(df[column].mean(), 2),
            "Median": round(df[column].median(), 2),
            "Standard Deviation": round(df[column].std(), 2)
        })
    return pd.DataFrame(summary)

# Detect skewness and kurtosis
skewness_kurtosis_summary_df = detect_skewness_kurtosis(df_cleaned)

# Display skewness and kurtosis summary
print("\nSkewness and Kurtosis Summary:")
print(skewness_kurtosis_summary_df)

In [12]:
# Function to fix skewness and kurtosis
def fix_skewness_kurtosis(df: pd.DataFrame, summary_df: pd.DataFrame, skew_threshold: float = 0.5, kurt_threshold: float = 3) -> pd.DataFrame:
    """
    Fix skewness and kurtosis in the dataset using transformations.

    Args:
        df (pd.DataFrame): The input DataFrame.
        summary_df (pd.DataFrame): Summary DataFrame with skewness and kurtosis info.
        skew_threshold (float, optional): Threshold to decide skewness. Defaults to 0.5.
        kurt_threshold (float, optional): Threshold to decide kurtosis. Defaults to 3.

    Returns:
        pd.DataFrame: Transformed DataFrame with fixed skewness and kurtosis.
    """
    df_transformed = df.copy()
    for _, row in summary_df.iterrows():
        column = row['Column']
        skewness, kurtosis = row['Skewness'], row['Kurtosis']
        if skewness > skew_threshold:
            # Apply log transformation for positive skewness
            df_transformed[column] = np.log1p(df_transformed[column].clip(lower=0))
        elif skewness < -skew_threshold:
            # Apply square root transformation for negative skewness
            df_transformed[column] = np.sqrt(df_transformed[column].clip(lower=0))
        # Optionally handle kurtosis if needed (skipped here)
    return df_transformed


In [None]:
# Fix skewness in the dataset
df_fixed_skewness = fix_skewness_kurtosis(df_cleaned, skewness_kurtosis_summary_df)

# Display transformed data
print("\nTransformed Data (First 5 rows):\n", df_fixed_skewness.head())

In [14]:
# Plot skewness before and after fixing
def plot_skewness_comparison(original_summary: pd.DataFrame, fixed_summary: pd.DataFrame):
    """
    Plot comparison of skewness before and after fixing.

    Args:
        original_summary (pd.DataFrame): Summary before fixing.
        fixed_summary (pd.DataFrame): Summary after fixing.
    """
    plt.figure(figsize=(12, 6))
    bar_width = 0.35
    index = np.arange(len(original_summary))  # Get the range of indices for x-axis

    # Plot the skewness before fixing
    plt.bar(index, original_summary['skewness'], bar_width, label='Before Fixing', alpha=0.7)

    # Plot the skewness after fixing
    plt.bar(index + bar_width, fixed_summary['skewness'], bar_width, label='After Fixing', alpha=0.7)

    # Labels and title
    plt.xlabel('Features', fontsize=12)
    plt.ylabel('Skewness', fontsize=12)
    plt.title('Comparison of Skewness Before and After Fixing', fontsize=15)
    plt.xticks(index + bar_width / 2, original_summary.index, rotation=90)  # Feature names on x-axis
    plt.legend()

    # Display the plot
    plt.tight_layout()
    plt.show()

In [None]:
#  before and after summary
original_summary = df.describe().T[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]
fixed_summary = df_fixed_skewness.describe().T[['mean', 'std', 'min', '25%', '50%', '75%', 'max']]

# Adding skewness to summaries
original_summary['skewness'] = df.skew()
fixed_summary['skewness'] = df_fixed_skewness.skew()

# Plot the comparison
plot_skewness_comparison(original_summary, fixed_summary)

In [None]:
import os

# Optional: Save the cleaned and transformed dataset to a new CSV file
def save_cleaned_data(df: pd.DataFrame, file_path: str):
    """
    Save the cleaned and transformed dataset to a CSV file.

    Args:
        df (pd.DataFrame): The input DataFrame.
        file_path (str): The path where the CSV will be saved.
    """
    # Ensure that the directory exists
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)
    
    # Save the DataFrame to CSV
    df.to_csv(file_path, index=False)
    print(f"Cleaned data saved to {file_path}")

# Save the final cleaned dataset
save_cleaned_data(df_fixed_skewness, '../data/processed/Cleaned_Customer_Data.csv')



In [None]:
import matplotlib.pyplot as plt

# Distribution plot to visualize data distribution
def plot_all_feature_distributions(df: pd.DataFrame):
    """
    Plot histograms for the distributions of each numerical feature in the DataFrame.
    
    Args:
        df (pd.DataFrame): The input DataFrame with cleaned data.
    """
    # Plot histograms for all numerical columns in the DataFrame
    hist_plot = df.hist(bins=50, figsize=(30, 20), grid=True, color='red', alpha=0.7)

    # Set title for the overall plot
    plt.suptitle('Distribution of Numerical Features', fontsize=20)
    plt.xlabel('Value', fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    
    plt.tight_layout(rect=[0, 0, 1, 0.96])  # Adjust layout to prevent overlap with suptitle
    plt.show()

# Call the function to plot distributions for cleaned data
plot_all_feature_distributions(df_fixed_skewness)


In [None]:
df_fixed_skewness.head()

MODELING

In [19]:
# creating new variable for modeling
df_model = df_fixed_skewness.copy()

In [None]:
import pandas as pd
import numpy as np

def summary_stats(df_model, n=4):
    """
    Generate detailed descriptive statistics for the given DataFrame.

    Parameters:
    df_model (pd.DataFrame): The DataFrame for which to calculate statistics.
    n (int): Number of decimal places to round to.

    Returns:
    pd.DataFrame: A DataFrame containing descriptive statistics for each attribute.
    """
    
    # Calculate central tendency
    mean = pd.DataFrame(df_model.apply(np.mean)).T
    median = pd.DataFrame(df_model.apply(np.median)).T
    
    # Calculate distribution statistics
    std = pd.DataFrame(df_model.apply(np.std)).T
    min_value = pd.DataFrame(df_model.apply(np.min)).T
    max_value = pd.DataFrame(df_model.apply(np.max)).T
    range_value = pd.DataFrame(df_model.apply(lambda x: x.max() - x.min())).T
    skewness = pd.DataFrame(df_model.apply(lambda x: x.skew())).T
    kurtosis = pd.DataFrame(df_model.apply(lambda x: x.kurtosis())).T
    count = pd.DataFrame(df_model.count()).T  # Count of non-null values
    
    # Concatenate all statistics into a single DataFrame
    summary_stats = pd.concat([count, min_value, max_value, range_value, mean, median, std, skewness, kurtosis]).T.reset_index()
    summary_stats.columns = ['Attributes', 'Count', 'Min', 'Max', 'Range', 'Mean', 'Median', 'Std Dev', 'Skewness', 'Kurtosis']
    
    # Format the DataFrame for better readability
    summary_stats['Min'] = summary_stats['Min'].round(n)
    summary_stats['Max'] = summary_stats['Max'].round(n)
    summary_stats['Range'] = summary_stats['Range'].round(n)
    summary_stats['Mean'] = summary_stats['Mean'].round(n)
    summary_stats['Median'] = summary_stats['Median'].round(n)
    summary_stats['Std Dev'] = summary_stats['Std Dev'].round(n)
    summary_stats['Skewness'] = summary_stats['Skewness'].round(n)
    summary_stats['Kurtosis'] = summary_stats['Kurtosis'].round(n)

    # Add a summary section title
    print("Summary Statistics:")
    
    return summary_stats

# descriptive analytic for modeling feature
summary_stats(df_model)


In [None]:
# Step 6: Correlation Matrix and Visualization
correlations = df_model.corr()
plt.figure(figsize=(25, 20))
sns.heatmap(correlations, annot=True, cmap="Blues")
plt.title('Correlation Matrix', fontsize=16)
plt.show()

In [None]:
import pandas as pd

# Assuming df is your DataFrame containing the relevant features
correlations = df_model.corr()

# 1. Set the diagonal to NaN to avoid considering self-correlation
correlation_matrix_no_diag = correlations.where(~np.eye(correlations.shape[0], dtype=bool))

# 2. Find the maximum correlation value and its corresponding feature pair
max_corr_value = correlation_matrix_no_diag.max().max()  # Find the maximum correlation value
max_corr_indices = correlation_matrix_no_diag.stack().idxmax()  # Find the corresponding feature pair

# 3. Find the minimum correlation value and its corresponding feature pair
min_corr_value = correlation_matrix_no_diag.min().min()  # Find the minimum correlation value
min_corr_indices = correlation_matrix_no_diag.stack().idxmin()  # Find the corresponding feature pair

# 4. Display the results
print(f"Maximum correlation value: {max_corr_value} between features: {max_corr_indices}")
print(f"Minimum correlation value: {min_corr_value} between features: {min_corr_indices}")


In [None]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame containing the relevant features
correlations = df_model.corr()

# 1. Set the diagonal to NaN to avoid considering self-correlation
correlation_matrix_no_diag = correlations.where(~np.eye(correlations.shape[0], dtype=bool))

# Define correlation thresholds
positive_threshold = 0.7  # Adjust this based on your needs
negative_threshold = 0.5  # Adjust this based on your needs

# 2. Find positive correlations exceeding the threshold
positive_corr_pairs = correlation_matrix_no_diag.stack()[
    correlation_matrix_no_diag.stack() > positive_threshold
].reset_index()

positive_corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
print("Positive correlation pairs (correlation > 0.5):")
print(positive_corr_pairs)

# 3. Find negative correlations below the threshold
negative_corr_pairs = correlation_matrix_no_diag.stack()[
    correlation_matrix_no_diag.stack() < negative_threshold
].reset_index()

negative_corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
print("\nNegative correlation pairs (correlation < -0.5):")
print(negative_corr_pairs)

# 4. Display the maximum and minimum correlation values and their corresponding feature pairs again for context
max_corr_value = correlation_matrix_no_diag.max().max()
max_corr_indices = correlation_matrix_no_diag.stack().idxmax()

min_corr_value = correlation_matrix_no_diag.min().min()
min_corr_indices = correlation_matrix_no_diag.stack().idxmin()

print(f"\nMaximum correlation value: {max_corr_value} between features: {max_corr_indices}")
print(f"Minimum correlation value: {min_corr_value} between features: {min_corr_indices}")


In [None]:
features_to_drop = [
    'purchases_frequency',
    'oneoff_purchases_frequency',
    'purchases_trx',
    'cash_advance_trx',
    'purchases_installments_frequency'
]

# Dropping features from df_model_features
df_model_features = df_model.drop(columns=features_to_drop)

df_model_features.head()

In [None]:
df_model_features.dtypes, df_model_features.shape

In [None]:
df_model_features.head()

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy import stats

In [28]:
# Feature Engineering

# Create New Features
df_model_features['purchase_to_payment_ratio'] = df_model_features['purchases'] / (df_model_features['payments'] + 1e-5)
df_model_features['credit_utilization'] = df_model_features['balance'] / (df_model_features['credit_limit'] + 1e-5)
df_model_features['avg_purchases_per_month'] = df_model_features['purchases'] / df_model_features['tenure']
df_model_features['avg_payments_per_month'] = df_model_features['payments'] / df_model_features['tenure']
df_model_features['purchase_payment_interaction'] = df_model_features['purchases'] * df_model_features['payments']


In [None]:
df_model_features.head()

In [None]:
df_model_features.shape

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Assuming df_model_features is your DataFrame
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df_model_features)

# Create a DataFrame from the scaled features
scaled_df = pd.DataFrame(scaled_features, columns=df_model_features.columns)

# Display the first few rows of the scaled DataFrame
scaled_df.head()


In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import scipy.cluster.hierarchy as sch

# Set style for seaborn
sns.set(style="whitegrid")


In [None]:
# Load your scaled DataFrame
# Example: scaled_df = pd.read_csv('path_to_your_scaled_data.csv')

# Display basic information about the data
print(scaled_df.info())
print(scaled_df.describe())

# Check for missing values
print(scaled_df.isnull().sum())


In [None]:
# Plotting histograms for each feature
scaled_df.hist(figsize=(15, 10), bins=30)
plt.tight_layout()
plt.show()


In [35]:
# # Create a pair plot for the scaled DataFrame
# plt.figure(figsize=(12, 10))
# sns.pairplot(scaled_df)
# plt.suptitle('Pair Plot of Scaled DataFrame', y=1.02)  # Adjust title position
# plt.show()


In [None]:
# Elbow Method to find optimal number of clusters
inertia = []
range_n_clusters = range(1, 11)

for n_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(scaled_df)
    inertia.append(kmeans.inertia_)

# Plotting the elbow curve
plt.figure(figsize=(10, 6))
plt.plot(range_n_clusters, inertia, marker='o')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.xticks(range_n_clusters)
plt.grid()
plt.show()


In [37]:
# Fit K-Means with the chosen number of clusters (e.g., from elbow method)
optimal_clusters_kmeans = 4  # Adjust based on elbow method
kmeans = KMeans(n_clusters=optimal_clusters_kmeans, random_state=42)
kmeans_labels = kmeans.fit_predict(scaled_df)

# Add cluster labels to the DataFrame
scaled_df['KMeans_Cluster'] = kmeans_labels


In [38]:
# Define linkage methods
linkage_methods = ['ward', 'average', 'complete']
agglomerative_results = {}

for method in linkage_methods:
    # Fit Agglomerative Clustering with the optimal number of clusters
    agglomerative = AgglomerativeClustering(n_clusters=optimal_clusters_kmeans, linkage=method)
    agglomerative_labels = agglomerative.fit_predict(scaled_df)

    # Store results
    agglomerative_results[method] = agglomerative_labels

    # Add cluster labels to the DataFrame
    scaled_df[f'Agglomerative_Cluster_{method}'] = agglomerative_labels


In [None]:
# Dendrogram for Ward method
plt.figure(figsize=(12, 8))
dendrogram = sch.dendrogram(sch.linkage(scaled_df, method='ward'))
plt.title('Dendrogram for Ward Linkage')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()


In [None]:
# Perform t-SNE for visualization
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(scaled_df.drop(columns=['KMeans_Cluster'] + [f'Agglomerative_Cluster_{m}' for m in linkage_methods]))

# Create a new DataFrame for visualization
tsne_df = pd.DataFrame(data=tsne_result, columns=['TSNE1', 'TSNE2'])
tsne_df['KMeans_Cluster'] = kmeans_labels

# Plotting t-SNE results for K-Means
plt.figure(figsize=(12, 6))
sns.scatterplot(x='TSNE1', y='TSNE2', hue='KMeans_Cluster', data=tsne_df, palette='deep', legend='full', alpha=0.7)
plt.title('t-SNE Visualization of K-Means Clusters')
plt.show()


In [None]:
# Calculate silhouette scores for K-Means
kmeans_silhouette = silhouette_score(scaled_df, kmeans_labels)
print(f'K-Means Silhouette Score: {kmeans_silhouette:.4f}')

# Calculate silhouette scores for Agglomerative Clustering
for method in linkage_methods:
    silhouette = silhouette_score(scaled_df, agglomerative_results[method])
    print(f'Agglomerative Clustering ({method}) Silhouette Score: {silhouette:.4f}')


In [None]:
# Calculate Davies-Bouldin scores for K-Means
kmeans_davies_bouldin = davies_bouldin_score(scaled_df, kmeans_labels)
print(f'K-Means Davies-Bouldin Score: {kmeans_davies_bouldin:.4f}')

# Calculate Davies-Bouldin scores for Agglomerative Clustering
for method in linkage_methods:
    davies_bouldin = davies_bouldin_score(scaled_df, agglomerative_results[method])
    print(f'Agglomerative Clustering ({method}) Davies-Bouldin Score: {davies_bouldin:.4f}')


In [None]:
# Calculate Davies-Bouldin scores
kmeans_davies_bouldin = davies_bouldin_score(scaled_df, kmeans_labels)
agglomerative_davies_bouldin = davies_bouldin_score(scaled_df, agglomerative_labels)

print(f'K-Means Davies-Bouldin Score: {kmeans_davies_bouldin:.4f}')
print(f'Agglomerative Davies-Bouldin Score: {agglomerative_davies_bouldin:.4f}')


In [None]:
scaled_df.head()

In [None]:
scaled_df.columns, scaled_df.shape

In [56]:
from sklearn.cluster import KMeans

# Assume k_optimal is the chosen number of clusters (you may have determined this using the elbow method)
k_optimal = 4
kmeans = KMeans(n_clusters=k_optimal, random_state=42)
clusters_kmeans = kmeans.fit_predict(scaled_df)

# Add the cluster labels to the DataFrame
scaled_df['Cluster'] = clusters_kmeans

In [None]:
scaled_df.head()

In [None]:
# Profile the clusters based on the mean of each feature
cluster_profile = scaled_df.groupby('Cluster').mean()
print(cluster_profile)

# Optional: View other statistics such as median or count
cluster_profile_median = scaled_df.groupby('Cluster').median()
cluster_profile_count = scaled_df.groupby('Cluster').size()


In [None]:
import matplotlib.pyplot as plt

# Transpose the profile data for easier plotting
cluster_profile_T = cluster_profile.T

# List of columns to exclude
columns_to_exclude = [
    'KMeans_Cluster',
    'Agglomerative_Cluster_ward',
    'Agglomerative_Cluster_average',
    'Agglomerative_Cluster_complete'
]

# Print the initial DataFrame
print("Initial DataFrame:\n", cluster_profile_T.head())
print("Initial columns:", cluster_profile_T.columns)

# Exclude specified columns
cluster_profile_T = cluster_profile_T.drop(columns=columns_to_exclude, errors='ignore')

# Print shape after exclusion
print("Shape after exclusion:", cluster_profile_T.shape)

# Plotting
cluster_profile_T.plot(kind='bar', figsize=(10, 6))
plt.title('Cluster Profiling - Feature Means')
plt.xlabel('Features')
plt.ylabel('Mean Values')
plt.xticks(rotation=45)
plt.legend(title='Cluster')
plt.show()


In [None]:
# Detailed profile with mean, median, min, max for key features in KMeans
detailed_kmeans_profile = scaled_df.groupby('KMeans_Cluster').agg({
    'balance': ['mean', 'median', 'min', 'max'],
    'balance_frequency': ['mean', 'median', 'min', 'max'],  # Added feature
    'purchases': ['mean', 'median', 'min', 'max'],
    'oneoff_purchases': ['mean', 'median', 'min', 'max'],
    'installments_purchases': ['mean', 'median', 'min', 'max'],
    'cash_advance': ['mean', 'median', 'min', 'max'],
    'cash_advance_frequency': ['mean', 'median', 'min', 'max'],  # Added feature
    'credit_limit': ['mean', 'median', 'min', 'max'],
    'payments': ['mean', 'median', 'min', 'max'],
    'minimum_payments': ['mean', 'median', 'min', 'max'],  # Added feature
    'prc_full_payment': ['mean', 'median', 'min', 'max'],  # Added feature
    'tenure': ['mean', 'median', 'min', 'max'],  # Added feature
    'purchase_to_payment_ratio': ['mean', 'median', 'min', 'max'],  # Added feature
    'credit_utilization': ['mean', 'median', 'min', 'max'],  # Added feature
    'avg_purchases_per_month': ['mean', 'median', 'min', 'max'],  # Added feature
    'avg_payments_per_month': ['mean', 'median', 'min', 'max'],  # Added feature
    'purchase_payment_interaction': ['mean', 'median', 'min', 'max'],  # Added feature
})

# View the detailed KMeans cluster profile
print("\nDetailed KMeans Cluster Profile:\n", detailed_kmeans_profile)

# Cluster Profile Summary

## Overall Summary
The clustering analysis identified four distinct customer segments based on credit card usage, revealing insights for marketing strategies and risk management.

## Cluster Profiles

### Cluster 0: High Balance and Moderate Purchases
- **Balance**: 
  - Mean: 0.74 (Effective management)
- **Purchases**: 
  - Mean: 0.53 (Regular utilization)
- **Payments**: 
  - Mean: 0.42 (Reliable)
- **Credit Limit**: 
  - Mean: 0.26 (Stable)
- **Cash Advance**: 
  - Mean: 1.07 (Liquidity needs)

**Interpretation**: Financially stable customers with active usage and reliable payments.

---

### Cluster 1: Low Balance with High Purchases
- **Balance**: 
  - Mean: -1.47 (Poor management)
- **Purchases**: 
  - Mean: 0.08 (Limited usage)
- **Payments**: 
  - Mean: -0.98 (Financial stress)
- **Credit Limit**: 
  - Mean: -0.34 (Need for borrowing)
- **Cash Advance**: 
  - Mean: -0.71 (Minimal reliance)

**Interpretation**: Active spenders facing financial challenges.

---

### Cluster 2: Moderate Balance with Variable Purchases
- **Balance**: 
  - Mean: 0.11 (Inconsistent management)
- **Purchases**: 
  - Mean: 0.69 (Reliance on credit)
- **Payments**: 
  - Mean: 0.32 (Moderate reliability)
- **Credit Limit**: 
  - Mean: 0.13 (Typical behavior)
- **Cash Advance**: 
  - Mean: -0.85 (Limited needs)

**Interpretation**: Mix of financial stability and variable spending habits.

---

### Cluster 3: Balanced Usage with Inconsistent Payments
- **Balance**: 
  - Mean: 0.56 (Stable management)
- **Purchases**: 
  - Mean: -1.65 (Limited usage)
- **Payments**: 
  - Mean: 0.04 (Possible difficulties)
- **Credit Limit**: 
  - Mean: -0.12 (Average access)
- **Cash Advance**: 
  - Mean: 0.99 (Occasional needs)

**Interpretation**: Balanced spending with financial instability affecting payments.

---

## Conclusion
These profiles facilitate tailored marketing strategies and improved risk management, enhancing customer satisfaction and retention.
