In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# reading the datasets
import numpy as np
import pandas as pd

train_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DL/11.project3/what_to_buy/data/amazon_train.csv')
valid_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DL/11.project3/what_to_buy/data/amazon_valid.csv')
test_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DL/11.project3/what_to_buy/data/amazon_test.csv')

train_data

Unnamed: 0,UserID,ProductID,Rating,Timestamp
0,A3HICVLF4PFFMN,0594481813,5.0,2014-05-05
1,A2QBZA4S1ROX9Q,0594481813,3.0,2013-05-25
2,AT09WGFUM934H,0594481813,3.0,2013-08-31
3,AGAKHE014LQFU,0594481813,3.0,2013-09-18
4,A1S6B5QFWGVL5U,0594481813,4.0,2013-06-27
...,...,...,...,...
909196,ADLVFFE4VBT8,B00L3YHF6O,5.0,2014-07-05
909197,AWPODHOB4GFWL,B00L3YHF6O,5.0,2014-07-10
909198,A3A4ZAIBQWKOZS,B00L3YHF6O,5.0,2014-07-07
909199,A34BZM6S9L7QI4,B00LGQ6HL8,5.0,2014-07-17


In [4]:
# checking the distribution of rating values in the training dataset
rating_counts = train_data['Rating'].value_counts()
print(rating_counts)

Rating
5.0    544151
4.0    193195
3.0     77209
1.0     52315
2.0     42331
Name: count, dtype: int64


In [5]:
# applying random oversampling to balance the class distribution of ratings
from imblearn.over_sampling import RandomOverSampler

X = train_data[['UserID', 'ProductID', 'Timestamp']]
y = train_data['Rating']

ros = RandomOverSampler(
    sampling_strategy = {
        1.0: 100000,
        2.0: 100000,
        3.0: 150000,
        4.0: 200000,
        5.0: 544151
    }
)

X_resampled, y_resampled = ros.fit_resample(X, y)

train_data = pd.concat([X_resampled, y_resampled], axis=1)

train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094151 entries, 0 to 1094150
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   UserID     1094151 non-null  object 
 1   ProductID  1094151 non-null  object 
 2   Timestamp  1094151 non-null  object 
 3   Rating     1094151 non-null  float64
dtypes: float64(1), object(3)
memory usage: 33.4+ MB


In [6]:
print(train_data['Rating'].value_counts())

Rating
5.0    544151
4.0    200000
3.0    150000
2.0    100000
1.0    100000
Name: count, dtype: int64


In [7]:
# encoding user and product IDs into numerical labels
from sklearn.preprocessing import LabelEncoder

user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

train_data['UserEnc'] = user_encoder.fit_transform(train_data['UserID'])
train_data['ProductEnc'] = product_encoder.fit_transform(train_data['ProductID'])

test_data['UserEnc'] = user_encoder.transform(test_data['UserID'])
test_data['ProductEnc'] = product_encoder.transform(test_data['ProductID'])

valid_data['UserEnc'] = user_encoder.transform(valid_data['UserID'])
valid_data['ProductEnc'] = product_encoder.transform(valid_data['ProductID'])

In [8]:
# scaling rating values to a normalized range using MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

train_data['Rating'] = scaler.fit_transform(train_data[['Rating']])

valid_data['Rating'] = scaler.transform(valid_data[['Rating']])

In [9]:
train_data

Unnamed: 0,UserID,ProductID,Timestamp,Rating,UserEnc,ProductEnc
0,A3HICVLF4PFFMN,0594481813,2014-05-05,1.00,59277,0
1,A2QBZA4S1ROX9Q,0594481813,2013-05-25,0.50,41167,0
2,AT09WGFUM934H,0594481813,2013-08-31,0.50,85549,0
3,AGAKHE014LQFU,0594481813,2013-09-18,0.50,77047,0
4,A1S6B5QFWGVL5U,0594481813,2013-06-27,0.75,18672,0
...,...,...,...,...,...,...
1094146,AA67YKPROAME0,B003QKBVYK,2012-01-26,0.75,73029,24160
1094147,AOYIITK7V0NME,B001H9NR2Q,2009-04-06,0.75,82868,14590
1094148,AUWATCU929616,B007JWUP52,2012-10-12,0.75,86780,37490
1094149,AYRWEGG9HWTL5,B00008Z1QJ,2009-05-11,0.75,89251,1837


In [10]:
# building a neural collaborative filtering model with user and product embeddings
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dot, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision

n_users = train_data['UserEnc'].nunique()
n_products = train_data['ProductEnc'].nunique()

embedding_dim = 50

user_input = Input(shape = (1,), name = 'User')
user_embedding = Embedding(input_dim = n_users, output_dim = embedding_dim, name = 'User-Embedding')(user_input)
user_vec = Flatten(name = 'User-Flat')(user_embedding)

product_input = Input(shape = (1,), name = 'Product')
product_embedding = Embedding(input_dim = n_products, output_dim = embedding_dim, name = 'Product-Embedding')(product_input)
product_vec = Flatten(name = 'Product-Flat')(product_embedding)

dot_product = Dot(axes = 1, name = 'Dot-Product')([user_vec, product_vec])

output = Dense(1, activation = 'sigmoid')(dot_product)

model = Model(inputs = [user_input, product_input], outputs = output)

model.compile(optimizer = Adam(learning_rate = 0.001), loss = 'binary_crossentropy', metrics=[Precision()])

model.summary()

In [11]:
# preparing user, product, and rating data for training the recommendation model
user_ids = train_data['UserEnc']
product_ids = train_data['ProductEnc']

ratings = train_data['Rating']

In [12]:
# training the mdodel
model.fit([user_ids, product_ids], ratings, epochs = 2, batch_size = 64)

Epoch 1/2
[1m17097/17097[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 5ms/step - loss: 0.5725 - precision: 0.9156
Epoch 2/2
[1m17097/17097[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 4ms/step - loss: 0.3410 - precision: 0.9867


<keras.src.callbacks.history.History at 0x7849ac4a3410>

In [13]:
# evaluating training precision
from sklearn.metrics import precision_score

y_train_pred = model.predict([train_data['UserEnc'], train_data['ProductEnc']])
train_pred_labels = (y_train_pred > 0.5).astype(int)
y_train = (train_data['Rating'] >= 0.75).astype(int)

train_precision = precision_score(y_train, train_pred_labels)
print(f"Training Precision: {train_precision:.4f}")

[1m34193/34193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 2ms/step
Training Precision: 0.8977


In [15]:
# evaluating validation precision
y_valid_pred = model.predict([valid_data['UserEnc'], valid_data['ProductEnc']])
valid_pred_labels = (y_valid_pred > 0.5).astype(int)
y_valid = (valid_data['Rating'] >= 0.75).astype(int)

valid_precision = precision_score(y_valid, valid_pred_labels)
print(f"Validation Precision: {valid_precision:.4f}")

[1m2810/2810[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2ms/step
Validation Precision: 0.8120
