# YZV311E Data Mining Project Data Preprocessing and Exploration
__________

### Hasan Taha Bağcı - 150210338
### Selman Turan Toker - 150220330
____________

In [58]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils.data_preprocess import *
from utils.plots import *

import warnings
warnings.filterwarnings('ignore')


from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

## Reading the Data and Exploring Main Features

In [40]:
product_catalog = pd.read_csv('data/product_catalog.csv')
product_category_map = pd.read_csv('data/product_category_map.csv')
transactions = pd.read_csv('data/transactions.csv')
test = pd.read_csv('data/test.csv')

Shapes of the datasets are printed below.

In [41]:
print('product_catalog:', product_catalog.shape)
print('product_category_map:', product_category_map.shape)
print('transactions:', transactions.shape)


product_catalog: (32776, 8)
product_category_map: (4332, 2)
transactions: (1071538, 4)


In [None]:
final = transactions.merge(product_catalog, on='product_id', how='left')

In [43]:
final.head()

Unnamed: 0,customer_id,product_id,purchase_date,quantity,manufacturer_id,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,categories
0,38769,3477,2020-06-01,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,42535,30474,2020-06-01,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,42535,15833,2020-06-01,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,42535,20131,2020-06-01,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,42535,4325,2020-06-01,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."


In [46]:
import ast

# Create a mapping from category_id to parent_category_id
category_to_parent = product_category_map.set_index('category_id')['parent_category_id'].to_dict()

# Function to map categories to their parent categories
def get_parent_categories(categories_str):
    if pd.isna(categories_str) or categories_str == '':
        return []
    # Convert string representation of list to an actual list
    categories = ast.literal_eval(categories_str)
    # Map each category to its parent
    parent_categories = [category_to_parent.get(cat_id, None) for cat_id in categories]
    # Remove None values if any category_id doesn't have a parent in the map
    return [parent for parent in parent_categories if parent is not None]

# Apply the function to add the parent_categories column
final['parent_categories'] = final['categories'].apply(get_parent_categories)

In [53]:
def fill_missing_category(row):
    if pd.isna(row['category']):
        return row['parent_categories']
    return row['category']

In [51]:
final_nulls_removed = final.dropna()
final_nulls_removed.isnull().sum()

customer_id          0
product_id           0
purchase_date        0
quantity             0
manufacturer_id      0
attribute_1          0
attribute_2          0
attribute_3          0
attribute_4          0
attribute_5          0
categories           0
parent_categories    0
dtype: int64

In [54]:
final_nulls_filled = final.fillna(fill_missing_category, axis=1)

In [55]:
final_nulls_filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1071538 entries, 0 to 1071537
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   customer_id        1071538 non-null  object
 1   product_id         1071538 non-null  object
 2   purchase_date      1071538 non-null  object
 3   quantity           1071538 non-null  object
 4   manufacturer_id    1071538 non-null  object
 5   attribute_1        1071538 non-null  object
 6   attribute_2        1071538 non-null  object
 7   attribute_3        1071538 non-null  object
 8   attribute_4        1071538 non-null  object
 9   attribute_5        1071538 non-null  object
 10  categories         1071538 non-null  object
 11  parent_categories  1071538 non-null  object
dtypes: object(12)
memory usage: 106.3+ MB


In [61]:
X = final_nulls_filled.drop(['quantity'], axis=1).reset_index(drop=True)
y = final_nulls_filled['quantity'].reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)

In [62]:
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46], got [1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27
 28 29 30 31 32 34 35 40 48 49 50 51 60 63 65 70 80 85 98 100]

In [None]:
# Get ready for the model split the data into train and test
final_nulls_filled['purchase_date'] = pd.to_datetime(final_nulls_filled['purchase_date'])

# Get the latest purchase date
latest_date = final_nulls_filled['purchase_date'].max()

# Calculate the recency of each purchase
final_nulls_filled['recency'] = latest_date - final_nulls_filled['purchase_date']

# Convert recency to days
final_nulls_filled['recency'] = final_nulls_filled['recency'].dt.days

# Get the total number of purchases for each user
final_nulls_filled['total_purchases'] = final_nulls_filled.groupby('user_id')['user_id'].transform('count')

# Get the total amount spent by each user
final_nulls_filled['total_spent'] = final_nulls_filled.groupby('user_id')['price'].transform('sum')

# Get the average amount spent by each user
final_nulls_filled['average_spent'] = final_nulls_filled.groupby('user_id')['price'].transform('mean')

# Get the total number of unique products purchased by each user
final_nulls_filled['unique_products'] = final_nulls_filled.groupby('user_id')['product_id'].transform('nunique')