# Advanced Coding Group Project - 2024/25 
#### Students: Michele Turco, Mattia Cervelli, Lorenzo Laterza 

## 0) Import Libraries and the Dataset

Setup your python environment and download the credit-score data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
credit_card_df = pd.read_csv('train_biased.csv')

## 1) Understanding the dataset



#### 1.1) General overview of the dataset

Using the function .head(), we can extract from the dataset the first x rows (x=5 by default, but we can change this number). They are not useful for analytic purposes, but still enable us to visualize the dataset.

In [None]:
credit_card_df.head

#### 1.2) Showing the dataset shape

In [None]:
# Identify Number of Columns and Rows
n_rows = len(credit_card_df)
n_columns = len(credit_card_df.columns)
print(f"The dataset contains {n_columns} columns")
print(f"The dataset contains {n_rows} rows")

#### 1.3) Gathering information from data



In [None]:
credit_card_df.info()

In [None]:
credit_card_df.nunique()

#### 1.4) Missing values

In [None]:
missing_values = credit_card_df.isnull().sum()
missing_values

#### 1.5) Data Types Check

In [None]:
##Selecting numerical features
numerical_data = credit_card_df.select_dtypes(include='number')
#append the features of numerical_data to list
numerical_features = numerical_data.columns.tolist()

#Selecting categoricalfeatures
categorical_data = credit_card_df.select_dtypes(include= 'object')
#append the features of categorical_data to list
categorical_features = categorical_data.columns.tolist()

print(f'There are {len(numerical_features)} numerical features:', '\n')
print(numerical_features)
print('\n')
print(f'There are {len(categorical_features)} categorical features:', '\n')
print(categorical_features)

In [None]:
credit_card_df.describe().T

In [None]:
credit_card_df.describe(include="object").T

#### 1.6) Distribution of numerical variables

In [None]:
# Loop through all numerical columns and plot their distributions : it does not work yet!!!!!
numeric_cols = credit_card_df.select_dtypes(include=['number']).columns

plt.figure(figsize=(15, len(numeric_cols) * 4))
for i, col in enumerate(numeric_cols, 1):
    plt.subplot(len(numeric_cols), 1, i)
    sns.histplot(credit_card_df[col], kde=True, bins=10)
    plt.title(f'Distribution of {col}')
plt.tight_layout()
plt.show()

In [None]:
numeric_cols = credit_card_df.select_dtypes(include=['number']).columns

# Create a grid layout for better visualization
n_cols = 3  # Number of columns in the grid
n_rows = (len(numeric_cols) + n_cols - 1) // n_cols  # Calculate required number of rows

# Create subplots with a grid layout
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
axes = axes.flatten()  # Flatten the 2D array of axes for easier indexing

# Loop through all numerical columns and plot their distributions
for i, col in enumerate(numeric_cols):
    # Determine appropriate number of bins based on data characteristics
    if credit_card_df[col].nunique() < 10:
        bins = credit_card_df[col].nunique()  # Discrete data with few values
    else:
        bins = 'auto'  # Let Seaborn determine optimal bins
    
    # Create histogram with kernel density estimate
    sns.histplot(data=credit_card_df, x=col, kde=True, bins=bins, ax=axes[i])
    
    # Add mean and median lines for reference
    mean_val = credit_card_df[col].mean()
    median_val = credit_card_df[col].median()
    axes[i].axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
    axes[i].axvline(median_val, color='green', linestyle='-.', alpha=0.7, label=f'Median: {median_val:.2f}')
    
    # Add legend and title
    axes[i].legend(fontsize='small')
    axes[i].set_title(f'Distribution of {col}')
    
    # Handle axis labels for better readability
    if credit_card_df[col].max() > 1000:
        axes[i].ticklabel_format(style='scientific', axis='x', scilimits=(0,0))

# Remove any unused subplots
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np

def clean_and_convert_numeric_columns(df, potentially_numeric_columns):
    """
    Clean and convert columns that should be numeric but contain formatting issues.
    Track NaN values before and after conversion.
    
    Parameters:
      df: DataFrame containing the data
      potentially_numeric_columns: List of column names that should be numeric
    
    Returns:
      DataFrame with cleaned numeric columns and a summary dictionary
    """
    df_cleaned = df.copy()
    
    # Create a summary dictionary to track changes
    summary = {}
    
    for col in potentially_numeric_columns:
        if col in df.columns:
            # Count initial NaN values and total values
            initial_nan_count = df[col].isna().sum()
            total_values = len(df[col])
            
            # If column is already numeric, no conversion is performed.
            if pd.api.types.is_numeric_dtype(df[col]):
                summary[col] = {
                    'already_numeric': True,
                    'initial_nan_count': initial_nan_count,
                    'final_nan_count': initial_nan_count,
                    'new_nan_count': 0,
                    'total_values': total_values,
                    'conversion_rate': None  # Not applicable since no conversion occurred
                }
                continue
                
            # For columns that should be numeric but aren't:
            try:
                # Convert column to string (to ensure uniform processing)
                temp_series = df[col].astype(str)
                
                # Remove common non-numeric characters/patterns
                temp_series = temp_series.str.replace('_', '')
                temp_series = temp_series.str.replace(',', '')
                temp_series = temp_series.str.replace('$', '')
                temp_series = temp_series.str.replace('%', '')
                temp_series = temp_series.str.replace(' ', '')
                
                # Convert cleaned strings to numeric, coercing errors to NaN
                df_cleaned[col] = pd.to_numeric(temp_series, errors='coerce')
                
                # Count final NaN values and determine how many new NaNs were introduced
                final_nan_count = df_cleaned[col].isna().sum()
                new_nan_count = final_nan_count - initial_nan_count
                
                # Calculate conversion rate as the percentage of originally non-NaN values
                # that were successfully converted.
                non_na_original = total_values - initial_nan_count
                if non_na_original > 0:
                    conversion_rate = (non_na_original - new_nan_count) / non_na_original
                else:
                    conversion_rate = 0
                
                # Store summary statistics
                summary[col] = {
                    'already_numeric': False,
                    'initial_nan_count': initial_nan_count,
                    'final_nan_count': final_nan_count,
                    'new_nan_count': new_nan_count,
                    'total_values': total_values,
                    'conversion_rate': conversion_rate
                }
                
            except Exception as e:
                print(f"Error converting {col}: {str(e)}")
                summary[col] = {
                    'already_numeric': False,
                    'error': str(e),
                    'initial_nan_count': initial_nan_count
                }
    
    # Print summary table (without the NaN Rate column)
    print("\nSummary Table:")
    print("-" * 80)
    print(f"{'Column':<25} {'Initial NaNs':<15} {'Final NaNs':<15} {'New NaNs':<15} {'Conversion Rate'}")
    print("-" * 80)
    
    for col, stats in summary.items():
        if 'error' in stats:
            print(f"{col:<25} {stats['initial_nan_count']:<15} {'ERROR':<15} {'N/A':<15} {'N/A'}")
        else:
            conv_rate = stats.get('conversion_rate')
            conv_rate_str = "N/A" if conv_rate is None else f"{conv_rate:.2%}"
            print(
                f"{col:<25} "
                f"{stats['initial_nan_count']:<15} "
                f"{stats['final_nan_count']:<15} "
                f"{stats['new_nan_count']:<15} "
                f"{conv_rate_str}"
            )
    
    return df_cleaned, summary

# List of columns that should be numeric
should_be_numeric = [
    'Annual_Income', 
    'Num_of_Loan', 
    'Num_of_Delayed_Payment', 
    'Changed_Credit_Limit', 
    'Outstanding_Debt', 
    'Amount_invested_monthly'
]

# Clean and convert the data
credit_card_cleaned, cleaning_summary = clean_and_convert_numeric_columns(credit_card_df, should_be_numeric)

# Check which columns are now numeric vs categorical
numeric_cols = credit_card_cleaned.select_dtypes(include=['number']).columns.tolist()
categorical_cols = credit_card_cleaned.select_dtypes(exclude=['number']).columns.tolist()

print(f"\nAfter cleaning, there are {len(numeric_cols)} numerical features: \n{numeric_cols}")
print(f"\nAfter cleaning, there are {len(categorical_cols)} categorical features: \n{categorical_cols}")

# Now let's examine problematic values that caused NaNs
print("\nExamining problematic values that were converted to NaN:")
for col in should_be_numeric:
    if col in cleaning_summary and cleaning_summary[col].get('new_nan_count', 0) > 0:
        print(f"\nColumn {col} problematic values:")
        
        # Find the indices where values became NaN after conversion
        if not pd.api.types.is_numeric_dtype(credit_card_df[col]):
            # Get original non-NA values that became NA after conversion
            original_values = credit_card_df[col][~credit_card_df[col].isna()]
            indices_that_became_nan = original_values.index[credit_card_cleaned[col].isna()[original_values.index]]
            
            if len(indices_that_became_nan) > 0:
                # Show a sample of problematic values
                sample_size = min(10, len(indices_that_became_nan))
                sample_indices = np.random.choice(indices_that_became_nan, sample_size, replace=False)
                
                print("Sample of original values that couldn't be converted:")
                for idx in sample_indices:
                    print(f"  Index {idx}: '{credit_card_df.loc[idx, col]}'")
 

In [None]:
def clean_changed_credit_limit(df):
    """
    Special cleaning function for the Changed_Credit_Limit column.
    Handles cases with just underscores or other non-numeric entries.
    
    Parameters:
    df: DataFrame containing the Changed_Credit_Limit column
    
    Returns:
    Series with cleaned numeric values
    """
    # Make a copy to avoid modifying the original dataframe
    cleaned_series = df['Changed_Credit_Limit'].copy()
    
    # Count values before cleaning
    total_values = len(cleaned_series)
    missing_before = cleaned_series.isna().sum()
    
    # Track number of replaced values
    underscore_only_count = 0
    other_invalid_count = 0
    
    # If already numeric, just return it
    if pd.api.types.is_numeric_dtype(cleaned_series):
        print(f"Changed_Credit_Limit is already numeric. {missing_before} missing values out of {total_values}.")
        return cleaned_series
    
    # Convert to string for cleaning
    cleaned_series = cleaned_series.astype(str)
    
    # Replace values that are just underscores with NaN
    underscore_mask = cleaned_series.str.strip() == '_'
    underscore_only_count = underscore_mask.sum()
    cleaned_series.loc[underscore_mask] = np.nan
    
    # Clean remaining values by removing non-numeric characters
    valid_values_mask = ~cleaned_series.isna()
    
    if valid_values_mask.any():
        # Remove common non-numeric characters from remaining values
        cleaned_series.loc[valid_values_mask] = (
            cleaned_series
            .loc[valid_values_mask]
            .str.replace('_', '')
            .str.replace(',', '')
            .str.replace('$', '')
            .str.replace('%', '')
            .str.replace(' ', '')
        )
    
    # Convert to numeric, tracking which values couldn't be converted
    numeric_series = pd.to_numeric(cleaned_series, errors='coerce')
    
    # Count new NaNs created during the final conversion
    other_invalid_mask = ~underscore_mask & ~cleaned_series.isna() & numeric_series.isna()
    other_invalid_count = other_invalid_mask.sum()
    
    # Count values after cleaning
    missing_after = numeric_series.isna().sum()
    
    # Print summary
    print(f"\nChanged_Credit_Limit Cleaning Summary:")
    print(f"Total values: {total_values}")
    print(f"Missing values before: {missing_before} ({missing_before/total_values:.2%})")
    print(f"Values with just '_': {underscore_only_count} ({underscore_only_count/total_values:.2%})")
    print(f"Other invalid values: {other_invalid_count} ({other_invalid_count/total_values:.2%})")
    print(f"Missing values after: {missing_after} ({missing_after/total_values:.2%})")
    print(f"Successfully converted: {total_values-missing_after} ({(total_values-missing_after)/total_values:.2%})")
    
    # If many values are missing, suggest a strategy
    if missing_after/total_values > 0.3:  # Over 30% missing
        print("\nRecommendation: Consider treating this as a binary feature (changed/not changed)")
        print("or imputing missing values based on other features.")
    
    return numeric_series

# Apply the special cleaning to Changed_Credit_Limit
credit_card_cleaned['Changed_Credit_Limit'] = clean_changed_credit_limit(credit_card_df)

# Let's also check if it makes sense to convert this to a binary feature
unique_values = credit_card_cleaned['Changed_Credit_Limit'].dropna().unique()
print(f"\nUnique values in Changed_Credit_Limit after cleaning: {sorted(unique_values)}")



In [None]:
# Create a DataFrame from the cleaning_summary dictionary
summary_df = pd.DataFrame.from_dict(cleaning_summary, orient='index')

# Convert the conversion_rate to a percentage string for display.
if 'conversion_rate' in summary_df.columns:
    summary_df['conversion_rate'] = summary_df['conversion_rate'].apply(lambda x: f"{x:.2%}" if pd.notnull(x) else "N/A")

# Optionally reorder the columns
cols_order = ['already_numeric', 'initial_nan_count', 'final_nan_count', 'new_nan_count', 'conversion_rate']
summary_df = summary_df[cols_order]

print("\nDetailed Cleaning Summary Table:")
print(summary_df)

Plot the distribution of credit card score respect to a normal distribution to see i there are skewnesses

In [None]:
from scipy.stats import norm
import numpy as np

# Get the list of numerical feature names from the cleaned dataframe.
numeric_features = credit_card_cleaned.select_dtypes(include=['number']).columns
n_features = len(numeric_features)

# Define grid size
n_cols = 3  # Number of columns in the grid
n_rows = (n_features + n_cols - 1) // n_cols  # Calculate required number of rows

# Create subplots with a grid layout
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4 * n_rows))
axes = axes.flatten()  # Flatten the 2D array of axes for easier indexing

# Loop through all numerical features and plot their distributions
for i, col in enumerate(numeric_features):
    ax = axes[i]
    data = credit_card_cleaned[col].dropna()
    
    # Plot histogram with density normalization
    ax.hist(data, bins=30, density=True, alpha=0.5, color='blue')
    
    # Fit a normal distribution to the data and plot it
    mu, std = norm.fit(data)
    x = np.linspace(data.min(), data.max(), 100)
    y = norm.pdf(x, mu, std)
    ax.plot(x, y, 'r-', lw=2)
    
    # Add vertical lines for mean and median
    mean_val = data.mean()
    median_val = data.median()
    ax.axvline(mean_val, color='red', linestyle='--', alpha=0.7, label=f'Mean: {mean_val:.2f}')
    ax.axvline(median_val, color='blue', linestyle='-.', alpha=0.7, label=f'Median: {median_val:.2f}')
    
    # Add legend, grid, and style ticks
    ax.legend(fontsize='small')
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.tick_params(axis='both', which='major', labelsize=10)
    
    # Set titles and labels
    ax.set_title(f'Distribution of {col} with Normal Fit')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

# Remove any unused subplots if n_features is not a multiple of n_cols
for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

## Fix Missing Values


### Customer ID

as we have 12500 unique values of Customer_ID over 100k entries it means that many customer are repeated over the entire dataset. So we can iterate over the column Name to fix it 

In [3]:
missing_customerID = credit_card_df['Customer_ID'].isnull().sum()
missing_customerID

9893

In [3]:
# Function to fill missing Customer_ID values based on matching names
def fill_missing_customer_id(df):
    """
    Fill missing Customer_ID values using the most common ID for each name.
    
    Parameters:
        df: The input DataFrame with Name and Customer_ID columns
    
    Returns:
        DataFrame with missing Customer_ID values filled
    """
    # Count missing values before
    missing_before = df['Customer_ID'].isnull().sum()
    print(f"Missing Customer_ID values before: {missing_before}")
    
    # Check if there are any missing values to fill
    if missing_before == 0:
        print("No missing Customer_ID values to fill.")
        return df
    
    # Create a copy to avoid modifying the original during calculation
    result_df = df.copy()
    
    # Only use rows that have both a Name and a valid Customer_ID to compute the mode
    valid = df.dropna(subset=['Name', 'Customer_ID'])
    
    # Find the most common Customer_ID for each Name
    mode_mapping = valid.groupby('Name')['Customer_ID'].agg(lambda x: x.mode().iloc[0] if not x.empty else None)
    
    # Create a mask for rows with missing Customer_ID
    missing_mask = result_df['Customer_ID'].isnull()
    
    # Apply the mapping in one vectorized operation
    result_df.loc[missing_mask, 'Customer_ID'] = result_df.loc[missing_mask, 'Name'].map(mode_mapping)
    
    # Count missing values after filling
    missing_after = result_df['Customer_ID'].isnull().sum()
    print(f"Missing Customer_ID values after: {missing_after}")
    print(f"Fixed {missing_before - missing_after} values ({(missing_before - missing_after)/missing_before:.2%} of missing values)")
    
    # Check if any values couldn't be filled
    if missing_after > 0:
        print(f"Warning: Could not fill {missing_after} Customer_ID values because their Names don't match any known Customer_ID")
    
    return result_df

# Apply the function to fix missing Customer_ID values
credit_card_df = fill_missing_customer_id(credit_card_df)

Missing Customer_ID values before: 9893
Missing Customer_ID values after: 1891
Fixed 8002 values (80.89% of missing values)


In [4]:
# check if how many rows are missing both Customer_ID and Name and tell me their index
missing_rows = credit_card_df[credit_card_df['Customer_ID'].isnull() & credit_card_df['Name'].isnull()]
missing_rows_indices = missing_rows.index.tolist()
print(f"Indices of rows with missing Customer_ID and Name: {missing_rows_indices}")

Indices of rows with missing Customer_ID and Name: [108, 109, 216, 241, 311, 367, 489, 539, 587, 668, 693, 704, 747, 765, 837, 865, 875, 1127, 1229, 1320, 1344, 1421, 1433, 1518, 1530, 1556, 1753, 1786, 1817, 1837, 1842, 1873, 1882, 1907, 1967, 1977, 1984, 2007, 2022, 2030, 2209, 2247, 2434, 2445, 2500, 2526, 2537, 2551, 2678, 2727, 2834, 2886, 2917, 3048, 3101, 3148, 3157, 3279, 3284, 3382, 3555, 3571, 3629, 3664, 3717, 3769, 3784, 3816, 3823, 3913, 3980, 4151, 4200, 4220, 4223, 4231, 4317, 4398, 4399, 4790, 4820, 4833, 4854, 4937, 4946, 4949, 5046, 5069, 5122, 5131, 5154, 5176, 5186, 5203, 5283, 5426, 5462, 5534, 5554, 5585, 5613, 5672, 5687, 5785, 5811, 5856, 5957, 5983, 6038, 6074, 6087, 6156, 6287, 6310, 6358, 6388, 6578, 6622, 6750, 6877, 6883, 7098, 7101, 7104, 7203, 7209, 7227, 7318, 7338, 7510, 7569, 7761, 7784, 7804, 7849, 7853, 7945, 7958, 7960, 7969, 8035, 8072, 8175, 8283, 8306, 8488, 8588, 8630, 8715, 8731, 8737, 8755, 8870, 8988, 9012, 9023, 9074, 9098, 9150, 9172, 9301,

I manually check this rows and they have both missing values on names and customer id. I find that the SSN represents the social security number of a person and it seems to be unique, in fact we have 12500 unique values for customer IDs and 12501 for SSN( maybe check for this 1 later), so we can do the same thing we did before but this time use SSN column instead and see if we fix every customer ID 

#### Theory of subsequential dataset

We found out that each client has a monthly analysis of his credit cards on a range of months that goes from january to august. Here we want to prove that we can fix the remaining missing values for customer ID going in one of the other lines that are i range of january august and fix it. 

In [5]:
# Calculate the number of rows per customer
rows_per_customer = credit_card_df.groupby('Customer_ID').size()
print("Average entries per customer:", rows_per_customer.mean())


Average entries per customer: 7.84872


with this I proved that for each customer we have an average of 8 rows (this is pretty accurate because we have only 1891 missing customer IDs over 100k entries). As the lenght of the range of months between January to August is 8 we can proceed by fixing all the remaining missing customer IDs by fixing before the column 'Month'. Before doing this we need to prove that we have a precise number of rows for each client and this is the same for everyone and the clients are in order.

ex. 
| Customer_ID | Month    |
|-------------|----------|
| Client 1    | January  |
| Client 1    | February |
| Client 1    | March    |
| Client 1    | April    |
| Client 1    | May      |
| Client 1    | June     |
| Client 1    | July     |
| Client 1    | August   |

and not something like that

| Customer_ID | Month    |
|-------------|----------|
| Client 1    | January  |
| Client 2    | January  |
| Client 1    | February |
| Client 3    | March    |
| Client 1    | June     |
| Client 5    | April    |

In [6]:
# print me the first 8 rows of the dataset
print(credit_card_df.head(8))

       ID Customer_ID     Month           Name          City         Street  \
0     NaN   CUS_0xd40       NaN  Aaron Maashoh        Lonton  Oxford Street   
1  0x1603   CUS_0xd40  February  Aaron Maashoh        Lonton  Oxford Street   
2  0x1604   CUS_0xd40       NaN            NaN        Lonton  Oxford Street   
3  0x1605   CUS_0xd40     April  Aaron Maashoh        Lonton  Oxford Street   
4     NaN   CUS_0xd40       May  Aaron Maashoh        Lonton  Oxford Street   
5  0x1607   CUS_0xd40      June  Aaron Maashoh        Lonton  Oxford Street   
6  0x1608   CUS_0xd40      July  Aaron Maashoh        Lonton  Oxford Street   
7  0x1609   CUS_0xd40    August            NaN  Standhampton  Oxford Street   

     Age          SSN Occupation Annual_Income  ...  Num_of_Delayed_Payment  \
0    NaN  821-00-0265    Manager      19114.12  ...                       7   
1   23.0  821-00-0265    Manager      19114.12  ...                     NaN   
2 -500.0  821-00-0265    Manager      19114.12  ...

In [7]:
# print me the last 8 rows of the dataset
print(credit_card_df.tail(8))

            ID Customer_ID     Month   Name          City        Street   Age  \
99992  0x25fe6  CUS_0x942c   January  Nicks      BadShire  North Street  24.0   
99993  0x25fe7  CUS_0x942c  February  Nicks  Standhampton  North Street  25.0   
99994  0x25fe8  CUS_0x942c     March    NaN      BadShire  North Street  25.0   
99995  0x25fe9  CUS_0x942c     April  Nicks      BadShire  North Street   NaN   
99996  0x25fea  CUS_0x942c       May  Nicks      BadShire  North Street  25.0   
99997  0x25feb  CUS_0x942c      June  Nicks      BadShire  North Street  25.0   
99998  0x25fec  CUS_0x942c      July  Nicks  Standhampton  North Street  25.0   
99999  0x25fed  CUS_0x942c    August  Nicks      BadShire  North Street  25.0   

               SSN Occupation Annual_Income  ...  Num_of_Delayed_Payment  \
99992  078-73-5990    Cleaner      39628.99  ...                     NaN   
99993  078-73-5990    Cleaner     39628.99_  ...                     NaN   
99994  078-73-5990    Cleaner      39628.9

$12500 \cdot 8 = 100 000$ which is the exact number of our rows, and as we can see from the previous outputs they're ordered as each client is consequential

In [7]:
# Create a list to store any non-consecutive customer groups
non_consecutive = []

# Group by the combined key (customer_id, SSN)
for (cid, ssn), group in credit_card_df.groupby(['Customer_ID', 'SSN']):
    indices = group.index
    # Calculate the expected length if the block were contiguous
    expected_length = indices.max() - indices.min() + 1
    # Check if the actual group length matches the expected contiguous block length
    if expected_length != len(indices):
        non_consecutive.append((cid, ssn))

if non_consecutive:
    print("The following customer blocks are not contiguous in the current dataset order:")
    for entry in non_consecutive:
        print("Customer_ID:", entry[0], "SSN:", entry[1])
else:
    print("All customers appear as contiguous blocks in the current dataset.")


The following customer blocks are not contiguous in the current dataset order:
Customer_ID: CUS_0x100b SSN: 238-62-0395
Customer_ID: CUS_0x1011 SSN: 793-05-8223
Customer_ID: CUS_0x1013 SSN: 930-49-9615
Customer_ID: CUS_0x1015 SSN: 810-97-7024
Customer_ID: CUS_0x1018 SSN: 731-19-8119
Customer_ID: CUS_0x1026 SSN: 500-62-9044
Customer_ID: CUS_0x102d SSN: 692-71-7552
Customer_ID: CUS_0x1032 SSN: 620-58-8045
Customer_ID: CUS_0x1037 SSN: 230-22-9583
Customer_ID: CUS_0x1038 SSN: 355-00-7832
Customer_ID: CUS_0x103e SSN: 155-72-8070
Customer_ID: CUS_0x1044 SSN: 261-18-4430
Customer_ID: CUS_0x1048 SSN: 808-81-2470
Customer_ID: CUS_0x104a SSN: #F%$D@*&8
Customer_ID: CUS_0x104a SSN: 652-58-5852
Customer_ID: CUS_0x104e SSN: 837-93-5062
Customer_ID: CUS_0x1051 SSN: 232-33-7638
Customer_ID: CUS_0x105c SSN: 998-18-7252
Customer_ID: CUS_0x1063 SSN: 883-38-8680
Customer_ID: CUS_0x1069 SSN: 761-27-5143
Customer_ID: CUS_0x107c SSN: 712-17-7369
Customer_ID: CUS_0x107e SSN: 463-69-6790
Customer_ID: CUS_0x10

In [10]:
# Dictionary to hold non-consecutive customer indices
non_consecutive_indices = {}

# Group by the combined key (customer_id, SSN)
for (cid, ssn), group in credit_card_df.groupby(['Customer_ID', 'SSN']):
    indices = group.index.tolist()
    # Calculate the expected contiguous block length based on min and max index
    expected_length = max(indices) - min(indices) + 1
    if expected_length != len(indices):
        non_consecutive_indices[(cid, ssn)] = indices

# Print or return the dictionary of non-consecutive blocks
print("Non-consecutive customer blocks (customer_id, SSN) with their indices:")
for key, inds in non_consecutive_indices.items():
    print("Customer_ID:", key[0], "SSN:", key[1], "Indices:", inds)


Non-consecutive customer blocks (customer_id, SSN) with their indices:
Customer_ID: CUS_0x100b SSN: 238-62-0395 Indices: [1528, 1529, 1531, 1532, 1533]
Customer_ID: CUS_0x1011 SSN: 793-05-8223 Indices: [60120, 60121, 60122, 60124, 60126, 60127]
Customer_ID: CUS_0x1013 SSN: 930-49-9615 Indices: [95216, 95217, 95218, 95220, 95221, 95222]
Customer_ID: CUS_0x1015 SSN: 810-97-7024 Indices: [65208, 65210, 65211, 65212, 65213, 65214, 65215]
Customer_ID: CUS_0x1018 SSN: 731-19-8119 Indices: [82064, 82065, 82066, 82067, 82069, 82070, 82071]
Customer_ID: CUS_0x1032 SSN: 620-58-8045 Indices: [27963, 27965, 27967]
Customer_ID: CUS_0x1037 SSN: 230-22-9583 Indices: [6432, 6433, 6434, 6435, 6436, 6439]
Customer_ID: CUS_0x1038 SSN: 355-00-7832 Indices: [88664, 88665, 88666, 88667, 88669, 88670]
Customer_ID: CUS_0x103e SSN: 155-72-8070 Indices: [34480, 34481, 34482, 34484, 34485, 34486, 34487]
Customer_ID: CUS_0x103e SSN: 795-31-5311 Indices: [67132, 67135]
Customer_ID: CUS_0x1044 SSN: 261-18-4430 Indi

In [11]:
len(non_consecutive_indices)

9155

In [10]:
# print a specific row of the dataset
print(credit_card_df.iloc[60120])

ID                                                                    0x17646
Customer_ID                                                        CUS_0x1011
Month                                                                 January
Name                                                                Schneyerh
City                                                             Standhampton
Street                                                             Old Street
Age                                                                       NaN
SSN                                                               793-05-8223
Occupation                                                         Journalist
Annual_Income                                                        58918.47
Monthly_Inhand_Salary                                               5208.8725
Num_Bank_Accounts                                                         3.0
Num_Credit_Card                                                 

In [11]:
print(credit_card_df.iloc[60121])

ID                                                                    0x17647
Customer_ID                                                        CUS_0x1011
Month                                                                February
Name                                                                Schneyerh
City                                                             Standhampton
Street                                                             Old Street
Age                                                                      44.0
SSN                                                               793-05-8223
Occupation                                                         Journalist
Annual_Income                                                        58918.47
Monthly_Inhand_Salary                                               5208.8725
Num_Bank_Accounts                                                         3.0
Num_Credit_Card                                                 

after I print the first indexes that are assumed to not be consecutive we can clearly see that we're talking about the same customer as they have the same customer and they have subsequent months so we can proceed by fixing the 'Month' column

In [None]:
# Define the expected month order.
month_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August']

# Reset the index to ensure we're working on the DataFrame's natural order (if needed)
credit_card_df = credit_card_df.reset_index(drop=True)

# Create a new month column with a repeating cycle: each group of 8 rows gets January to August.
credit_card_df['Month'] = [month_order[i % 8] for i in range(len(credit_card_df))]



In [13]:
# Fix Customer ID missing values

# Define the month order and a mapping to positions (0 for January, …, 7 for August).
month_to_index = {month: i for i, month in enumerate(month_order)}

def fill_missing_customer_id(df):
    df = df.copy()
    n_rows = len(df)
    
    # Iterate over each row where Customer_ID is missing.
    for idx, row in df[df['Customer_ID'].isna()].iterrows():
        # Determine the cycle position based on the assigned month.
        # If for some reason month is missing, default to position 0.
        m = row.get('Month', None)
        m_index = month_to_index.get(m, 0)
        
        # Define the search window:
        # Look m rows upward and (7 - m_index) rows downward.
        window_start = max(idx - m_index, 0)
        window_end = min(idx + (7 - m_index), n_rows - 1)
        
        # Extract the window of rows.
        window = df.loc[window_start:window_end, 'Customer_ID']
        # Get all non-missing Customer_ID values in that window.
        non_missing_ids = window.dropna().unique()
        
        if len(non_missing_ids) == 1:
            # If exactly one unique non-missing Customer_ID is found, assign it.
            df.at[idx, 'Customer_ID'] = non_missing_ids[0]
        elif len(non_missing_ids) > 1:
            # If multiple candidate IDs are found, you could choose the first one,
            # or implement more sophisticated logic (e.g. mode, or nearest by distance).
            df.at[idx, 'Customer_ID'] = non_missing_ids[0]
        else:
            # If no candidate is found in the immediate window, you may decide to extend the search.
            # For now, we'll leave it missing.
            pass
    return df

# Example usage:
credit_card_df= fill_missing_customer_id(credit_card_df)


In [14]:
credit_card_df.isnull().sum()

ID                           9942
Customer_ID                     0
Month                           0
Name                        18887
City                         9851
Street                       9920
Age                         14488
SSN                         10001
Occupation                   9944
Annual_Income               10094
Monthly_Inhand_Salary       23595
Num_Bank_Accounts            9833
Num_Credit_Card             10062
Interest_Rate                9849
Num_of_Loan                 10191
Type_of_Loan                20312
Delay_from_due_date          9988
Num_of_Delayed_Payment      16218
Changed_Credit_Limit        10067
Num_Credit_Inquiries        11898
Credit_Mix                   9915
Outstanding_Debt             9963
Credit_Utilization_Ratio     9975
Credit_History_Age          18209
Payment_of_Min_Amount        9957
Amount_invested_monthly     14120
Credit_Score                    0
dtype: int64