In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw transactions CSV (path is relative to the notebook's directory).
df = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')

In [20]:
from datetime import datetime
import sys
import os
sys.path.append(os.path.abspath(".."))
from src.data_processing import create_aggregate_features, extract_time_features, build_pipeline


# Add the time-derived columns to the transaction-level frame.
df = extract_time_features(df)

# Aggregate the transactions (presumably one row per CustomerId — confirm in
# src.data_processing.create_aggregate_features).
agg_df = create_aggregate_features(df)

# Show which columns the aggregation step actually produced.
print("Generated columns from aggregation:", agg_df.columns.to_list())

# Attach customer-level attributes to the aggregates. drop_duplicates keeps
# the first row it encounters for each CustomerId, so a customer with several
# categories/channels is represented by that first transaction only.
merged = pd.merge(
    agg_df,
    df[['CustomerId', 'ProductCategory', 'ChannelId', 'PricingStrategy']].drop_duplicates(subset='CustomerId'),
    how='left',
    on='CustomerId',
)

print("All columns after merge:", merged.columns.to_list())

# Categorical inputs are fixed by name. PricingStrategy is listed here so that
# it is treated as a category even in case it is stored as a number.
categorical_features = ['ProductCategory', 'ChannelId', 'PricingStrategy']

# Derive the numerical features from `merged` itself rather than filtering a
# pre-existing `numerical_features` list: the previous self-referential
# comprehension raised NameError on a fresh kernel (Restart & Run All).
# 'CustomerId' is an identifier, not a feature, so it is excluded together
# with every column already claimed as categorical.
excluded_columns = {'CustomerId', *categorical_features}
numerical_features = [
    col
    for col in merged.select_dtypes(include='number').columns
    if col not in excluded_columns
]
print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

# NOTE(review): the recorded run of this cell ends in "ValueError: Cannot use
# median strategy with non-numeric data ... 'airtime'". That suggests a median
# imputer is receiving a string column; presumably build_pipeline expects its
# arguments in a different order than used here — confirm against its
# signature in src.data_processing.
pipeline = build_pipeline(categorical_features, numerical_features)

# Model input: categorical columns first, followed by the numerical ones.
X = merged[[*categorical_features, *numerical_features]]

# Fit the preprocessing pipeline and transform the features in one pass.
X_transformed = pipeline.fit_transform(X)

print("Transformed feature matrix shape:", X_transformed.shape)





Generated columns from aggregation: ['CustomerId', 'TotalAmount', 'MeanAmount', 'StdAmount', 'TransactionCount']
All columns after merge: ['CustomerId', 'TotalAmount', 'MeanAmount', 'StdAmount', 'TransactionCount', 'ProductCategory', 'ChannelId', 'PricingStrategy']
Numerical features: ['TotalAmount', 'MeanAmount', 'StdAmount', 'TransactionCount']
Categorical features: ['ProductCategory', 'ChannelId', 'PricingStrategy']


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'airtime'

In [21]:
# Diagnostic: show the dtype of every column listed as a numerical feature.
print(merged.dtypes[numerical_features])

TotalAmount         float64
MeanAmount          float64
StdAmount           float64
TransactionCount      int64
dtype: object


In [22]:
# Diagnostic: print the distinct values of each numerical feature, with a
# blank line after every column.
for feature in numerical_features:
    print(f"Unique values in {feature}:", merged[feature].unique(), "", sep="\n")


Unique values in TotalAmount:
[-10000.   20000.    4225.  ... 228727.2   8650.  543873. ]

Unique values in MeanAmount:
[-10000.           4000.            384.09090909 ...    961.11111111
   5384.88118812   8176.47058824]

Unique values in StdAmount:
[           nan  6558.96333272   560.49896602 ... 14800.65678427
  4433.32964765  6775.1455632 ]

Unique values in TransactionCount:
[   1    5   11    6    9    2    7   74    8  203   16   14   50    3
   30   52    4   29   12   42   46   32  152   55  784   10   23   57
   24   37   20   49   18  368   15   84  241   67   22  173   45   17
   44   72   13   26   19  273   97   33  253   86   60   28   36   21
   61  113   43  140   27   66   58  116   93   70   90   25   91   59
   89  126   68  480   96  151  524   47   63   38   35   79   56   31
   41  259  106  144  181   73  425  278   75  146   85   76  197   40
  119   34  156  111   82  127   94  147  236  148   77  162   64  484
  124   39   62  399  310  610  133  139  109  

In [23]:
# Force each numerical feature to a numeric dtype; with errors='coerce',
# entries that cannot be parsed become NaN instead of raising.
for feature in numerical_features:
    coerced = pd.to_numeric(merged[feature], errors='coerce')
    merged[feature] = coerced


In [24]:
# Keep only the listed features whose column in `merged` has a numeric dtype.
numerical_features = list(
    filter(lambda name: pd.api.types.is_numeric_dtype(merged[name]), numerical_features)
)


In [25]:
# Rebuild the model input and re-run the already-built pipeline.
# NOTE(review): the recorded output shows this retry failing with the same
# ValueError on 'airtime', so the numerical columns do not appear to be the
# cause — presumably the pipeline's column assignment is; confirm in
# build_pipeline.
X = merged.loc[:, categorical_features + numerical_features]
X_transformed = pipeline.fit_transform(X)
print("Transformed feature matrix shape:", X_transformed.shape)


ValueError: Cannot use median strategy with non-numeric data:
could not convert string to float: 'airtime'

In [19]:
# Persist `df` as CSV without the index column.
# NOTE(review): this cell carries execution count 19 although it sits after
# cell 25, so the file on disk may have been written before the time features
# were added to `df` — confirm by re-running the notebook top to bottom.
df.to_csv(path_or_buf='../data/processed/processed_data.csv', index=False)
