# YZV311E Data Mining Project Data Preprocessing and Exploration
__________

### Hasan Taha Bağcı - 150210338
### Selman Turan Toker - 150220330
____________

In [58]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from utils.data_preprocess import *
from utils.plots import *

import warnings
warnings.filterwarnings('ignore')


from datetime import timedelta
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

## Reading the Data and Exploring Main Features

In [40]:
product_catalog = pd.read_csv('data/product_catalog.csv')
product_category_map = pd.read_csv('data/product_category_map.csv')
transactions = pd.read_csv('data/transactions.csv')
test = pd.read_csv('data/test.csv')

Shapes of the datasets are printed below.

In [41]:
print('product_catalog:', product_catalog.shape)
print('product_category_map:', product_category_map.shape)
print('transactions:', transactions.shape)


product_catalog: (32776, 8)
product_category_map: (4332, 2)
transactions: (1071538, 4)


In [None]:
final = transactions.merge(product_catalog, on='product_id', how='left')

In [43]:
final.head()

Unnamed: 0,customer_id,product_id,purchase_date,quantity,manufacturer_id,attribute_1,attribute_2,attribute_3,attribute_4,attribute_5,categories
0,38769,3477,2020-06-01,1,186,6,0,196,0,45,"[74, 4109, 3867, 803, 4053]"
1,42535,30474,2020-06-01,1,193,10,3,229,3,132,"[3459, 3738, 679, 1628, 4072]"
2,42535,15833,2020-06-01,1,1318,4,1,455,0,108,"[2973, 2907, 2749, 3357]"
3,42535,20131,2020-06-01,1,347,4,0,291,3,44,"[30, 1515, 1760, 2932, 1287, 2615, 3727, 2450,..."
4,42535,4325,2020-06-01,1,539,6,0,303,0,45,"[3104, 1772, 2029, 1274, 3915, 888, 1118, 3882..."


In [46]:
import ast

# Create a mapping from category_id to parent_category_id
category_to_parent = product_category_map.set_index('category_id')['parent_category_id'].to_dict()

# Function to map categories to their parent categories
def get_parent_categories(categories_str):
    if pd.isna(categories_str) or categories_str == '':
        return []
    # Convert string representation of list to an actual list
    categories = ast.literal_eval(categories_str)
    # Map each category to its parent
    parent_categories = [category_to_parent.get(cat_id, None) for cat_id in categories]
    # Remove None values if any category_id doesn't have a parent in the map
    return [parent for parent in parent_categories if parent is not None]

# Apply the function to add the parent_categories column
final['parent_categories'] = final['categories'].apply(get_parent_categories)

In [53]:
def fill_missing_category(row):
    if pd.isna(row['category']):
        return row['parent_categories']
    return row['category']

In [51]:
final_nulls_removed = final.dropna()
final_nulls_removed.isnull().sum()

customer_id          0
product_id           0
purchase_date        0
quantity             0
manufacturer_id      0
attribute_1          0
attribute_2          0
attribute_3          0
attribute_4          0
attribute_5          0
categories           0
parent_categories    0
dtype: int64

In [63]:
final_nulls_removed.to_csv('data/final_nulls_removed.csv', index=False)

In [54]:
final_nulls_filled = final.fillna(fill_missing_category, axis=1)

In [55]:
final_nulls_filled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1071538 entries, 0 to 1071537
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   customer_id        1071538 non-null  object
 1   product_id         1071538 non-null  object
 2   purchase_date      1071538 non-null  object
 3   quantity           1071538 non-null  object
 4   manufacturer_id    1071538 non-null  object
 5   attribute_1        1071538 non-null  object
 6   attribute_2        1071538 non-null  object
 7   attribute_3        1071538 non-null  object
 8   attribute_4        1071538 non-null  object
 9   attribute_5        1071538 non-null  object
 10  categories         1071538 non-null  object
 11  parent_categories  1071538 non-null  object
dtypes: object(12)
memory usage: 106.3+ MB


In [61]:
X = final_nulls_filled.drop(['quantity'], axis=1).reset_index(drop=True)
y = final_nulls_filled['quantity'].reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=38)

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import ast
import joblib

# Load the training data
train_df = pd.read_csv('data/final_nulls_removed.csv')
test_df = pd.read_csv('data/test.csv')

# Preprocess training data
def parse_list_column(column):
    return column.apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

train_df['categories'] = parse_list_column(train_df['categories'])
train_df['parent_categories'] = parse_list_column(train_df['parent_categories'])

# Feature engineering for training data
def extract_first_category(categories):
    return categories[0] if categories and len(categories) > 0 else -1

train_df['first_category'] = train_df['categories'].apply(extract_first_category)
train_df['first_parent_category'] = train_df['parent_categories'].apply(extract_first_category)

# Prepare features
features = [
    'customer_id', 'product_id', 'manufacturer_id', 
    'attribute_1', 'attribute_2', 'attribute_3', 
    'attribute_4', 'attribute_5', 
    'first_category', 'first_parent_category'
]
test_features = features.copy()
# Handle categorical features for training data
label_encoders = {}
for col in ['customer_id', 'product_id', 'manufacturer_id', 
            'first_category', 'first_parent_category']:
    le = LabelEncoder()
    train_df[f'{col}_encoded'] = le.fit_transform(train_df[col])
    label_encoders[col] = le
    features[features.index(col)] = f'{col}_encoded'

# Prepare X and y for training
X_train = train_df[features].values
y_train = train_df['quantity'].values

# Preprocess test data
# Prepare features for test data


# Encode test data features using the same label encoders
for col in ['customer_id', 'product_id']:
    col_encoded = f'{col}_encoded'
    
    # Safely encode test data using existing label encoders
    def safe_encode(value):
        try:
            return label_encoders[col].transform([value])[0]
        except ValueError:
            # If the value is not in the original training set, assign a unique value
            return len(label_encoders[col].classes_)
    
    test_df[col_encoded] = test_df[col].apply(safe_encode)
    test_features[test_features.index(col)] = col_encoded

# Add placeholder values for missing features
test_df['first_category_encoded'] = -1
test_df['first_parent_category_encoded'] = -1
test_features[test_features.index('first_category')] = 'first_category_encoded'
test_features[test_features.index('first_parent_category')] = 'first_parent_category_encoded'

# Add placeholders for other numerical features
for feature in ['manufacturer_id', 'attribute_1', 'attribute_2', 
                'attribute_3', 'attribute_4', 'attribute_5']:
    if feature not in test_df.columns:
        test_df[feature] = 0

# Prepare X for test data
X_test = test_df[test_features].values

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create DMatrix for XGBoost
dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
dtest = xgb.DMatrix(X_test_scaled)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100
}

# Train the model
model = xgb.train(params, dtrain, num_boost_round=100)

# Predict quantities
test_predictions = model.predict(dtest)

# Round predictions to nearest integer (since quantity is typically whole numbers)
test_predictions_rounded = np.round(test_predictions).astype(int)

# Create submission DataFrame
submission_df = test_df[['id', 'customer_id', 'product_id']].copy()
submission_df['prediction'] = test_predictions_rounded

# Clip predictions to be non-negative
submission_df['prediction'] = submission_df['prediction'].clip(lower=0)

# Save predictions
submission_df.to_csv('quantity_predictions.csv', index=False)

print("Predictions generated and saved to quantity_predictions.csv")
print("\nPrediction Statistics:")
print(f"Total Predictions: {len(submission_df)}")
print(f"Mean Predicted Quantity: {submission_df['prediction'].mean():.2f}")
print(f"Max Predicted Quantity: {submission_df['prediction'].max()}")
print(f"Min Predicted Quantity: {submission_df['prediction'].min()}")

Predictions generated and saved to quantity_predictions.csv

Prediction Statistics:
Total Predictions: 10000
Mean Predicted Quantity: 5.04
Max Predicted Quantity: 32
Min Predicted Quantity: 0
