In [1]:
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Step 1: Restore the LabelEncoders from the pickle file.
# The loaded object is used below as a mapping of column name -> encoder.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("label_encoders.ppkl", "rb") as encoders_file:
    label_encoders = pickle.load(encoders_file)

# Step 2: Assemble a four-row frame of raw categorical values
data = {
    'Gender': ['Male', 'Female', 'Female', 'Male'],
    'Subscription Type': ['Basic', 'Premium', 'Basic', 'Premium'],
    'Contract Length': ['Monthly', 'Annual', 'Quarterly', 'Annual'],
}
df = pd.DataFrame(data)

# Step 3: Overwrite each column with the integer codes from its own encoder
print("\nEncodage des variables catégoriques sur le test DataFrame :")
for column_name in label_encoders:
    column_encoder = label_encoders[column_name]
    df[column_name] = column_encoder.transform(df[column_name])
    print(f"Colonne {column_name} encodée.")

# Show the frame now that every categorical column holds integer codes
print("\nDataFrame après encodage :")
print(df)



Encodage des variables catégoriques sur le test DataFrame :
Colonne Gender encodée.
Colonne Subscription Type encodée.
Colonne Contract Length encodée.

DataFrame après encodage :
   Gender  Subscription Type  Contract Length
0       1                  0                1
1       0                  1                0
2       0                  0                2
3       1                  1                0


In [2]:
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler
# Step 1: Restore the StandardScaler from the pickle file.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("scaler.ppkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Columns handed to the scaler, in the order used for the transform call
numerical_columns = [
    'Age', 'Tenure', 'Usage Frequency', 'Support Calls',
    'Payment Delay', 'Total Spend', 'Last Interaction',
]

# Step 2: Assemble a four-row frame of raw numerical values
data = {
    'Age': [25, 30, 35, 40],
    'Tenure': [1, 2, 3, 4],
    'Usage Frequency': [5, 6, 7, 8],
    'Support Calls': [1, 0, 2, 3],
    'Payment Delay': [30, 60, 90, 120],
    'Total Spend': [100, 200, 300, 400],
    'Last Interaction': [5, 10, 15, 20],
}
df = pd.DataFrame(data)

# Step 3: Overwrite the numerical columns with their scaled counterparts
print("\nApplication du StandardScaler sur le test DataFrame :")
scaled_values = scaler.transform(df[numerical_columns])
df[numerical_columns] = scaled_values

# Show the frame after scaling
print("\nDataFrame après mise à l'échelle :")
print(df)



Application du StandardScaler sur le test DataFrame :

DataFrame après mise à l'échelle :
        Age    Tenure  Usage Frequency  Support Calls  Payment Delay  \
0 -1.160499 -1.760725        -1.243118      -0.904173       1.952764   
1 -0.765883 -1.702712        -1.127100      -1.223295       5.502565   
2 -0.371268 -1.644699        -1.011081      -0.585052       9.052365   
3  0.023348 -1.586686        -0.895063      -0.265930      12.602166   

   Total Spend  Last Interaction  
0    -2.119986         -1.116435  
1    -1.712353         -0.535598  
2    -1.304721          0.045238  
3    -0.897088          0.626074  


In [5]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Columns kept in the final frame, in the order fix_missing_cols enforces
training_cols = [
    'Age',
    'Gender',
    'Tenure',
    'Usage Frequency',
    'Support Calls',
    'Payment Delay',
    'Subscription Type',
    'Contract Length',
    'Total Spend',
    'Last Interaction',
]

# Restore the preprocessing objects saved as pickle files.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open("label_encoders.ppkl", "rb") as encoders_file:
    label_encoders = pickle.load(encoders_file)

with open("scaler.ppkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)

# Ensure missing columns are added with a default value
def fix_missing_cols(training_cols, new_data, encoders=None):
    """Return a copy of ``new_data`` holding exactly ``training_cols``, in order.

    Columns named in ``training_cols`` but absent from ``new_data`` are created.
    A column that has an entry in the encoder mapping is filled with that
    encoder's first class (``classes_[0]``, i.e. the raw label, not its code);
    any other missing column is filled with 0. Columns of ``new_data`` that are
    not listed in ``training_cols`` are dropped from the result.

    Args:
        training_cols: Column names, in the order the result must have.
        new_data: DataFrame to align. It is left unmodified.
        encoders: Optional mapping of column name -> object exposing
            ``classes_``. When omitted, the module-level ``label_encoders``
            is used.

    Returns:
        A new DataFrame whose columns are ``training_cols``.
    """
    missing_cols = [c for c in training_cols if c not in new_data.columns]

    # Only touch the module-level mapping when a default is actually needed.
    if missing_cols and encoders is None:
        encoders = label_encoders

    # Copy first: adding columns must not leak into the caller's frame.
    aligned = new_data.copy()
    for c in missing_cols:
        if c in encoders:  # Categorical column: default to the first known class
            aligned[c] = encoders[c].classes_[0]
        else:  # Numerical column: default to 0
            aligned[c] = 0

    # list() so a tuple of names is read as several columns, not one key.
    return aligned[list(training_cols)]  # Ensure column order matches training

# Clean and preprocess the data
def clean_data_json(df):
    """Scale the numerical columns and label-encode the categorical ones.

    The work is done on a copy, so the frame passed in keeps its raw values.
    Uses the module-level ``scaler``, ``label_encoders`` and ``training_cols``.

    Steps:
      1. Numerical columns absent from ``df`` are created with the value 0,
         then all numerical columns go through ``scaler.transform``.
      2. Each categorical column is encoded with its LabelEncoder. Values the
         encoder does not know are replaced, row by row, with the code of the
         encoder's first class; rows holding known values keep their real
         code. A categorical column absent from ``df`` is filled entirely
         with that default code.
      3. The result is reduced to ``training_cols`` (in that order) via
         ``fix_missing_cols``; other columns are dropped.

    Args:
        df: DataFrame of raw records.

    Returns:
        A new DataFrame containing the ``training_cols`` columns.
    """
    # Define column types
    numerical_cols = ['Age', 'Tenure', 'Usage Frequency', 'Support Calls',
                      'Payment Delay', 'Total Spend', 'Last Interaction']
    categorical_cols = ['Gender', 'Subscription Type', 'Contract Length']

    # Work on a copy so the caller's raw frame is not overwritten in place
    df = df.copy()

    # Fill missing numerical columns with 0
    # NOTE(review): the 0 is inserted before scaling, so it is scaled like a
    # raw value of 0 -- confirm this is the intended default.
    for col in numerical_cols:
        if col not in df.columns:
            df[col] = 0

    # Apply StandardScaler to numerical columns (skipped for an empty frame)
    if not df[numerical_cols].empty:
        df[numerical_cols] = scaler.transform(df[numerical_cols])

    # Handle categorical columns
    for col in categorical_cols:
        encoder = label_encoders[col]
        default_code = encoder.transform([encoder.classes_[0]])[0]

        if col not in df.columns:
            # If the categorical column is missing, set it to the default class
            df[col] = default_code
            continue

        try:
            df[col] = encoder.transform(df[col])
        except ValueError as e:
            print(f"Error transforming column {col}: {e}")
            # Fallback in case of unseen categories: only the offending rows
            # get the default code; rows with known labels keep their own.
            code_by_label = dict(zip(encoder.classes_,
                                     encoder.transform(encoder.classes_)))
            df[col] = (
                df[col]
                .map(code_by_label)      # unknown labels become NaN
                .fillna(default_code)
                .astype("int64")
            )

    # Ensure all training columns are present
    df = fix_missing_cols(training_cols, df)
    return df

# One example record, written as a plain (JSON-like) dictionary.
# "CustomerID" is not in training_cols, so it does not appear in the result.
data = {
    "CustomerID": 1,
    "Age": 25, "Gender": "Female", "Tenure": 25,
    "Usage Frequency": 14, "Support Calls": 4, "Payment Delay": 27,
    "Subscription Type": "Basic", "Contract Length": "Monthly",
    "Total Spend": 598.0, "Last Interaction": 9,
}

# Wrapping the record in a list makes pandas build a one-row DataFrame
df = pd.DataFrame(data=[data])

# Run the preprocessing on that single row
cleaned_data = clean_data_json(df)

# Show the preprocessed row
print(cleaned_data)


        Age  Gender    Tenure  Usage Frequency  Support Calls  Payment Delay  \
0 -1.160499       0 -0.368409        -0.198951       0.053192       1.597784   

   Subscription Type  Contract Length  Total Spend  Last Interaction  
0                  0                1    -0.089976         -0.651766  
