In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime # For time-based features
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import os
import joblib # To save scaler/encoder

print("Libraries imported.")

Libraries imported.


In [3]:
# --- Load Original Cleaned Clickstream Data ---
processed_data_path = 'E:/GUVI/Projects/customer_conversion_analysis/data_processed/train_cleaned.csv'
try:
    df_train_clicks = pd.read_csv(processed_data_path)
    print(f"Original cleaned clickstream data loaded: {df_train_clicks.shape}")
except FileNotFoundError:
    print(f"Error: File not found at {processed_data_path}")
    raise

# --- Re-run Aggregation Logic ---
print("\nAggregating data to session level...")
aggregation_functions = {
    'order': 'max',
    'page_1_main_category': ['nunique', lambda x: x.mode()[0] if not x.mode().empty else None],
    'page_2_clothing_model': 'nunique',
    'colour': ['nunique', lambda x: x.mode()[0] if not x.mode().empty else None],
    'price': ['mean', 'sum', 'max'],
    'page': ['max', 'count'],
    'month': 'first',
    'day': 'first',
    'country': 'first', # Assuming country is consistent per session
     # Keep necessary fields for feature engineering
    'model_photography': lambda x: x.mode()[0] if not x.mode().empty else None, # Most common photo type
    'price_2': lambda x: x.mode()[0] if not x.mode().empty else None # Most common price category indicator
}

df_session = df_train_clicks.groupby('session_id').agg(aggregation_functions)

# Rename columns
df_session.columns = ['_'.join(map(str, col)).strip('_') for col in df_session.columns.values] # Ensure col names are strings
df_session = df_session.rename(columns={
    'order_max': 'session_length',
    'page_1_main_category_nunique': 'unique_main_categories',
    'page_1_main_category_<lambda_0>': 'most_freq_main_category',
    'page_2_clothing_model_nunique': 'unique_models_viewed',
    'colour_nunique': 'unique_colours',
    'colour_<lambda_0>': 'most_freq_colour',
    'price_mean': 'avg_price_viewed',
    'price_sum': 'session_revenue_potential', # Regression Target
    'price_max': 'max_price_viewed',
    'page_max': 'max_page_reached',
    'page_count': 'total_pages_viewed', # Often same as session_length
    'model_photography_<lambda_0>': 'most_freq_model_photo',
    'price_2_<lambda_0>': 'most_freq_price_indicator'
    # Keep month_first, day_first, country_first for now
})

# Define Classification Target
df_session['purchase_completed'] = (df_session['max_page_reached'] == 5).astype(int)

print(f"Session data recreated: {df_session.shape}")
print(df_session.head())
df_session.info() # Check dtypes, especially for categorical features

Original cleaned clickstream data loaded: (132379, 14)

Aggregating data to session level...
Session data recreated: (22910, 17)
            session_length  unique_main_categories  most_freq_main_category  \
session_id                                                                    
1                        9                       4                        2   
2                       10                       3                        2   
3                        5                       2                        3   
4                        4                       2                        1   
5                        1                       1                        3   

            unique_models_viewed  unique_colours  most_freq_colour  \
session_id                                                           
1                              8               6                 6   
2                              8               5                 3   
3                              3     

In [5]:
print("\n--- Dropping Redundant/Unnecessary Features ---")
columns_to_drop = [
    'month_first',                  # Used to create date features
    'day_first',                    # Used to create date features
    'date',                         # Intermediate date column
    'max_page_reached',             # Used to create target, potential leakage
    'total_pages_viewed',           # Likely redundant with session_length
    # Consider dropping features with too many unique values if not handled
    # 'page_2_clothing_model_nunique' # Example - decide based on EDA/utility
]
# Drop columns only if they exist in the dataframe
existing_columns_to_drop = [col for col in columns_to_drop if col in df_session.columns]
df_session_fe = df_session.drop(columns=existing_columns_to_drop)

print(f"Columns dropped: {existing_columns_to_drop}")
print(f"Shape after dropping: {df_session_fe.shape}")
print(df_session_fe.info()) # Check remaining columns and types


--- Dropping Redundant/Unnecessary Features ---
Columns dropped: ['month_first', 'day_first', 'date', 'max_page_reached', 'total_pages_viewed']
Shape after dropping: (22910, 13)
<class 'pandas.core.frame.DataFrame'>
Index: 22910 entries, 1 to 24026
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   session_length              22910 non-null  int64  
 1   unique_main_categories      22910 non-null  int64  
 2   most_freq_main_category     22910 non-null  int64  
 3   unique_models_viewed        22910 non-null  int64  
 4   unique_colours              22910 non-null  int64  
 5   most_freq_colour            22910 non-null  int64  
 6   avg_price_viewed            22910 non-null  float64
 7   session_revenue_potential   22910 non-null  float64
 8   max_price_viewed            22910 non-null  float64
 9   country_first               22910 non-null  int64  
 10  model_photography_<lambda>  

In [6]:
print("\n--- Identifying Feature Types for Preprocessing ---")

# Separate Targets
y_class = df_session_fe['purchase_completed']
y_reg = df_session_fe['session_revenue_potential']
X = df_session_fe.drop(columns=['purchase_completed', 'session_revenue_potential'])

# Identify numerical and categorical features IN THE REMAINING X DATAFRAME
numerical_features = X.select_dtypes(include=np.number).columns.tolist()
categorical_features = X.select_dtypes(exclude=np.number).columns.tolist() # Check if any objects remain

# MANUALLY review and adjust the lists if needed.
# Features like 'country', 'most_freq_main_category', 'most_freq_colour' should be treated as categorical even if numeric
potential_cats = ['country_first', 'most_freq_main_category', 'most_freq_colour',
                 'most_freq_model_photo', 'most_freq_price_indicator',
                 'day_of_week', 'is_weekend'] # Treat day_of_week as categorical? Or keep numeric? Let's try categorical.

# Refine lists
categorical_features = [col for col in potential_cats if col in X.columns]
numerical_features = [col for col in X.columns if col not in categorical_features]


print(f"Numerical Features ({len(numerical_features)}): {numerical_features}")
print(f"Categorical Features ({len(categorical_features)}): {categorical_features}")
print(f"\nFeatures DataFrame X shape: {X.shape}")


--- Identifying Feature Types for Preprocessing ---
Numerical Features (8): ['session_length', 'unique_main_categories', 'unique_models_viewed', 'unique_colours', 'avg_price_viewed', 'max_price_viewed', 'model_photography_<lambda>', 'price_2_<lambda>']
Categorical Features (3): ['country_first', 'most_freq_main_category', 'most_freq_colour']

Features DataFrame X shape: (22910, 11)


In [7]:
print("\n--- Setting up Preprocessing Pipeline ---")

# Define preprocessing steps for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()) # Or StandardScaler()
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) # handle_unknown='ignore' is important for test set
])

# Create the column transformer
# Use remainder='passthrough' if there are columns you want to keep unchanged (unlikely here)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop' # Drop any columns not specified
)

# Apply the preprocessing pipeline to the features X
print("Fitting and transforming data with the preprocessor...")
X_processed = preprocessor.fit_transform(X)

# Get feature names after OneHotEncoding
# This is important for interpreting feature importances later
try:
    ohe_feature_names = preprocessor.named_transformers_['cat'].named_steps['onehot'].get_feature_names_out(categorical_features)
    all_feature_names = numerical_features + list(ohe_feature_names)
    print(f"\nTotal features after processing: {len(all_feature_names)}")

    # Convert the processed numpy array back to a DataFrame (optional but good for inspection)
    X_processed_df = pd.DataFrame(X_processed, columns=all_feature_names, index=X.index)

    print("\n--- Processed Data Head ---")
    print(X_processed_df.head())

    print("\n--- Processed Data Description ---")
    print(X_processed_df.describe())

except AttributeError:
    print("Warning: Could not retrieve feature names automatically. Check scikit-learn version.")
    # Handle older versions or alternative ways if needed
    X_processed_df = pd.DataFrame(X_processed, index=X.index) # Create DF without column names

print(f"\nShape of processed features X: {X_processed.shape}")
print(f"Shape of classification target y_class: {y_class.shape}")
print(f"Shape of regression target y_reg: {y_reg.shape}")

# --- Save the Preprocessor and Feature Names ---
output_dir = '../src/models' # Define directory to save artifacts
os.makedirs(output_dir, exist_ok=True) # Create directory if it doesn't exist

preprocessor_path = os.path.join(output_dir, 'preprocessor.joblib')
joblib.dump(preprocessor, preprocessor_path)
print(f"\nPreprocessor saved to: {preprocessor_path}")

# Save feature names (important for consistency)
feature_names_path = os.path.join(output_dir, 'feature_names.joblib')
joblib.dump(all_feature_names, feature_names_path)
print(f"Feature names saved to: {feature_names_path}")

# --- Save Processed Data (Optional but can be useful) ---
# You can save X_processed_df, y_class, y_reg if needed for separate modeling scripts
# processed_data_output_path = '../src/data/processed/train_final_features.csv'
# X_processed_df.to_csv(processed_data_output_path, index=False)
# y_class.to_csv('../src/data/processed/train_final_y_class.csv', index=False, header=True)
# y_reg.to_csv('../src/data/processed/train_final_y_reg.csv', index=False, header=True)
# print("Processed data frames saved.")


--- Setting up Preprocessing Pipeline ---
Fitting and transforming data with the preprocessor...

Total features after processing: 72

--- Processed Data Head ---
            session_length  unique_main_categories  unique_models_viewed  \
session_id                                                                 
1                 0.041237                1.000000              0.051852   
2                 0.046392                0.666667              0.051852   
3                 0.020619                0.333333              0.014815   
4                 0.015464                0.333333              0.022222   
5                 0.000000                0.000000              0.000000   

            unique_colours  avg_price_viewed  max_price_viewed  \
session_id                                                       
1                 0.384615          0.404297          0.609375   
2                 0.307692          0.487847          0.765625   
3                 0.153846          0.4

In [None]:
print("\n--- Engineering Time-Based Features ---")
# We know the year is 2008
# Create a date column (handle potential errors if day/month invalid - unlikely here)
try:
    # Original line likely causing issues if apply returns mixed types or errors happen silently
    # df_session['date'] = df_session.apply(lambda row: datetime.date(2008, int(row['month_first']), int(row['day_first'])), axis=1)

    # --- CORRECTED APPROACH using pd.to_datetime ---
    # Create string representation first 'YYYY-MM-DD'
    df_session['date_str'] = '2008-' + df_session['month_first'].astype(str) + '-' + df_session['day_first'].astype(str)
    # Convert string column to datetime objects
    df_session['date'] = pd.to_datetime(df_session['date_str'], format='%Y-%m-%d', errors='coerce')
    # Check for any dates that failed conversion (became NaT - Not a Time)
    if df_session['date'].isnull().any():
        print("Warning: Some date conversions failed. Review 'month_first' and 'day_first'. Rows with NaT:")
        print(df_session[df_session['date'].isnull()][['month_first', 'day_first']])
        # Decide how to handle NaT: fillna, drop rows, etc. For now, we proceed but be aware.

    # Drop the intermediate string column
    df_session = df_session.drop(columns=['date_str'])

    # Now, this part should work if conversion was successful
    df_session['day_of_week'] = df_session['date'].dt.dayofweek # Monday=0, Sunday=6
    df_session['is_weekend'] = df_session['day_of_week'].apply(lambda x: 1 if pd.notnull(x) else x).astype('Int64') # Handle potential NaT from conversion, use nullable Int
    # Convert day_of_week to nullable Int as well if you handled NaT
    df_session['day_of_week'] = df_session['day_of_week'].astype('Int64')


    print("Day of week and weekend features created.")
    print(df_session[['date', 'day_of_week', 'is_weekend']].head())
    # Check dtypes again
    print("\nChecking dtypes after date conversion:")
    print(df_session[['date', 'day_of_week', 'is_weekend']].info())

except Exception as e: # Catch more general exceptions during conversion
     print(f"Error creating date features: {e}. Check month/day values.")
     # Handle error


--- Engineering Time-Based Features ---
Day of week and weekend features created.
                 date  day_of_week  is_weekend
session_id                                    
1          2008-04-01            1           1
2          2008-04-01            1           1
3          2008-04-01            1           1
4          2008-04-01            1           1
5          2008-04-01            1           1

Checking dtypes after date conversion:
<class 'pandas.core.frame.DataFrame'>
Index: 22910 entries, 1 to 24026
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         22910 non-null  datetime64[ns]
 1   day_of_week  22910 non-null  Int64         
 2   is_weekend   22910 non-null  Int64         
dtypes: Int64(2), datetime64[ns](1)
memory usage: 760.7 KB
None


In [11]:
# --- Save the Preprocessor and Feature Names ---
output_dir = 'models' # Define directory to save artifacts
os.makedirs(output_dir, exist_ok=True) # Create directory if it doesn't exist

preprocessor_path = os.path.join(output_dir, 'preprocessor.joblib')
joblib.dump(preprocessor, preprocessor_path)
print(f"\nPreprocessor saved to: {preprocessor_path}")

# Save feature names (important for consistency)
feature_names_path = os.path.join(output_dir, 'feature_names.joblib')
joblib.dump(all_feature_names, feature_names_path)
print(f"Feature names saved to: {feature_names_path}")

# --- Save Processed Data (Optional but can be useful) ---
# This part was commented out before - let's uncomment and ensure paths are correct
processed_data_output_dir = 'data_processed' # Define directory for processed data output
os.makedirs(processed_data_output_dir, exist_ok=True) # Ensure directory exists

processed_features_path = os.path.join(processed_data_output_dir, 'train_final_features.csv')
y_class_path = os.path.join(processed_data_output_dir, 'train_final_y_class.csv')
y_reg_path = os.path.join(processed_data_output_dir, 'train_final_y_reg.csv')

# Ensure X_processed_df exists (it's created earlier in the try block)
if 'X_processed_df' in locals():
        X_processed_df.to_csv(processed_features_path, index=False)
        print(f"Processed features saved to: {processed_features_path}")
else:
        print("Warning: X_processed_df not found. Cannot save features.")

# Ensure y_class and y_reg exist (defined earlier)
if 'y_class' in locals():
        y_class.to_csv(y_class_path, index=False, header=True) # Save header for clarity
        print(f"Classification target saved to: {y_class_path}")
else:
        print("Warning: y_class not found. Cannot save target.")

if 'y_reg' in locals():
        y_reg.to_csv(y_reg_path, index=False, header=True) # Save header for clarity
        print(f"Regression target saved to: {y_reg_path}")
else:
    print("Warning: y_reg not found. Cannot save target.")


Preprocessor saved to: models\preprocessor.joblib
Feature names saved to: models\feature_names.joblib
Processed features saved to: data_processed\train_final_features.csv
Classification target saved to: data_processed\train_final_y_class.csv
Regression target saved to: data_processed\train_final_y_reg.csv
