<a href="https://colab.research.google.com/github/i-ganza007/Multimodal-Data-Preprocessing/blob/main/Product_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from imblearn.over_sampling import SMOTE
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
customer_social_profiles = pd.read_excel('/content/Copy of customer_social_profiles.xlsx')
customer_transactions = pd.read_excel('/content/Copy of customer_transactions.xlsx')
customer_social_profiles.head()
customer_transactions.head()

Unnamed: 0,customer_id_legacy,transaction_id,purchase_amount,purchase_date,product_category,customer_rating
0,151,1001,408,2024-01-01,Sports,2.3
1,192,1002,332,2024-01-02,Electronics,4.2
2,114,1003,442,2024-01-03,Electronics,2.1
3,171,1004,256,2024-01-04,Clothing,2.8
4,160,1005,64,2024-01-05,Clothing,1.3


In [4]:
# Assuming 'customer_id_new' is already in a numeric format or can be directly converted
customer_social_profiles['customer_id_new'] = customer_social_profiles['customer_id_new'].str.replace('A', '').astype(int)
customer_social_profiles.head()

Unnamed: 0,customer_id_new,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,178,LinkedIn,74,4.9,Positive
1,190,Twitter,82,4.8,Neutral
2,150,Facebook,96,1.6,Positive
3,162,Twitter,89,2.6,Positive
4,197,Twitter,92,2.3,Neutral


In [5]:
df = customer_transactions.merge(customer_social_profiles, left_on='customer_id_legacy', right_on='customer_id_new', how='left')
df = df.drop(['transaction_id', 'customer_id_new'], axis=1) # Dimensionality removing unnecessary columns
df.head()

Unnamed: 0,customer_id_legacy,purchase_amount,purchase_date,product_category,customer_rating,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,151,408,2024-01-01,Sports,2.3,TikTok,61.0,1.3,Neutral
1,151,408,2024-01-01,Sports,2.3,Twitter,72.0,1.6,Neutral
2,151,408,2024-01-01,Sports,2.3,Twitter,82.0,3.6,Negative
3,192,332,2024-01-02,Electronics,4.2,Instagram,60.0,4.3,Positive
4,114,442,2024-01-03,Electronics,2.1,Facebook,87.0,4.8,Negative


In [6]:
df['customer_rating'].fillna(df['customer_rating'].median(), inplace=True)
# df['missing_social_profile'] = df['engagement_score'].isna().astype(int)
df.fillna({'engagement_score': df['engagement_score'].mean(),
           'purchase_interest_score': df['purchase_interest_score'].mean(),
           'review_sentiment': 'Neutral'}, inplace=True)
df.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['customer_rating'].fillna(df['customer_rating'].median(), inplace=True)


Unnamed: 0,customer_id_legacy,purchase_amount,purchase_date,product_category,customer_rating,social_media_platform,engagement_score,purchase_interest_score,review_sentiment
0,151,408,2024-01-01,Sports,2.3,TikTok,61.0,1.3,Neutral
1,151,408,2024-01-01,Sports,2.3,Twitter,72.0,1.6,Neutral
2,151,408,2024-01-01,Sports,2.3,Twitter,82.0,3.6,Negative
3,192,332,2024-01-02,Electronics,4.2,Instagram,60.0,4.3,Positive
4,114,442,2024-01-03,Electronics,2.1,Facebook,87.0,4.8,Negative


In [8]:
# Removing duplicates
df = df.drop_duplicates()
# Extract date features
df['purchase_month'] = df['purchase_date'].dt.month
df['purchase_day'] = df['purchase_date'].dt.day
df['purchase_weekday'] = df['purchase_date'].dt.weekday

df = df.astype({
    'purchase_amount': float,
    'customer_rating': float,
    'engagement_score': float,
    'purchase_interest_score': float,
    'purchase_month': float,
    'purchase_day': float,
    'purchase_weekday': float
})
df['product_category'] = df['product_category'].astype('category')
df['social_media_platform'] = df['social_media_platform'].astype('category')
df['review_sentiment'] = df['review_sentiment'].map({'Positive': 1, 'Neutral': 0, 'Negative': -1}).astype(int)
df.head()

Unnamed: 0,customer_id_legacy,purchase_amount,purchase_date,product_category,customer_rating,social_media_platform,engagement_score,purchase_interest_score,review_sentiment,purchase_month,purchase_day,purchase_weekday
0,151,408.0,2024-01-01,Sports,2.3,TikTok,61.0,1.3,0,1.0,1.0,0.0
1,151,408.0,2024-01-01,Sports,2.3,Twitter,72.0,1.6,0,1.0,1.0,0.0
2,151,408.0,2024-01-01,Sports,2.3,Twitter,82.0,3.6,-1,1.0,1.0,0.0
3,192,332.0,2024-01-02,Electronics,4.2,Instagram,60.0,4.3,1,1.0,2.0,1.0
4,114,442.0,2024-01-03,Electronics,2.1,Facebook,87.0,4.8,-1,1.0,3.0,2.0


In [11]:
df['purchase_year'] = df['purchase_date'].dt.year
df['purchase_month'] = df['purchase_date'].dt.month
df['purchase_dayofweek'] = df['purchase_date'].dt.dayofweek
df['days_since_last_purchase'] = (df['purchase_date'].max() - df['purchase_date']).dt.days
df['purchase_engagement_interaction'] = df['purchase_amount'] * df['engagement_score']
df['rating_sentiment_interaction'] = df['customer_rating'] * df['review_sentiment']

In [17]:
le = LabelEncoder()
df['product_category'] = le.fit_transform(df['product_category'])

In [18]:
X = df.drop(['product_category', 'purchase_date', 'customer_id_legacy'], axis=1)
y = df['product_category']

In [19]:
numerical_cols = ['purchase_amount', 'customer_rating', 'engagement_score', 'purchase_interest_score',
                  'days_since_last_purchase', 'purchase_engagement_interaction', 'rating_sentiment_interaction']
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

  updated_mean = (last_sum + new_sum) / updated_sample_count
  T = new_sum / new_sample_count
  new_unnormalized_variance -= correction**2 / new_sample_count


In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [21]:
model = LGBMClassifier(learning_rate=0.05, num_leaves=31, max_depth=7, n_estimators=100,
                      lambda_l1=0.1, lambda_l2=0.1, objective='multiclass', num_class=5)
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 196, number of used features: 23
[LightGBM] [Info] Start training from score -1.751754
[LightGBM] [Info] Start training from score -1.844127
[LightGBM] [Info] Start training from score -1.564543
[LightGBM] [Info] Start training from score -1.667197
[LightGBM] [Info] Start training from score -1.307823


In [28]:
y_pred = model.predict(X_test)
print("Accuracy", accuracy_score(y_test, y_pred))
print("F1-score", f1_score(y_test, y_pred,average='weighted'))
print("F1-score", f1_score(y_test, y_pred,average='macro'))
print("Precision", precision_score(y_test, y_pred, average='weighted'))
print("Recall", recall_score(y_test, y_pred, average='weighted'))

Accuracy 0.6
F1-score 0.5956190476190476
F1-score 0.6006349206349205
Precision 0.6162020202020203
Recall 0.6


In [None]:
results = pd.DataFrame({
    'customer_id': df.loc[X_test.index, 'customer_id_legacy'],
    'purchase_date': df.loc[X_test.index, 'purchase_date'],
    'predicted_category': le.inverse_transform(y_pred)
})
print("Sample predictions:", results.head())

Sample predictions:
      customer_id purchase_date predicted_category
91           189    2024-02-26          Groceries
72           150    2024-02-15              Books
8            160    2024-01-05           Clothing
158          152    2024-04-08           Clothing
226          134    2024-05-20             Sports
