In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
import multiprocessing

In [31]:
# Load the preprocessed data from products_train_processed.csv
df_processed = pd.read_csv('/content/drive/MyDrive/products_train_processed.csv')

# Filter the data for the UK locale
df_uk = df_processed[df_processed['locale'] == 'UK']

# Combine 'brand' and 'title' columns to create a new 'text' column
df_uk['text'] = df_uk['brand'] + ' ' + df_uk['title']

# Remove missing values from the 'text' column
df_uk = df_uk.dropna(subset=['text'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_uk['text'] = df_uk['brand'] + ' ' + df_uk['title']


In [32]:
print(df_uk['text'].head(10))

913336    sochow sochow sherpa fleec throw blanket doubl...
913337    hippowareh hippowareh personali photo print mo...
913338    clarkson potter 500 easi recip everi machin st...
913339    tyhjoy tyhjoy mini bag sealer handheld vacuum ...
913340    lucosobi lucosobi steer wheel lock car antithe...
913341                               88 film tentacl bluray
913342    bochion knife sharpen 4 1 finger protect knife...
913343    spigen spigen rug armor design iphon 11 case 2...
913344    criacr criacr motion sensor light bar 10 led 6...
913345    coolzon coolzon bento box lunch box adult kids...
Name: text, dtype: object


In [19]:
# Tokenize the text
tokenized_text = [text.split() for text in df_uk['text']]


In [21]:
# Train the Word2Vec model
cores = multiprocessing.cpu_count()  # Number of CPU cores
model = Word2Vec(tokenized_text,
                 vector_size=100,  # Size of the word embeddings
                 window=5,  # Context window size
                 min_count=5,  # Minimum word frequency threshold
                 workers=cores,  # Number of workers for parallel processing
                 sg=0)  # Training algorithm: 0 for CBOW, 1 for skip-gram

# Optional: Train a phrase model to capture common phrases (e.g., "green_haven")
phrases = Phrases(tokenized_text, min_count=5, threshold=10)
bigram = Phraser(phrases)
tokenized_text_phrases = [bigram[doc] for doc in tokenized_text]



In [22]:
# Train the Word2Vec model on the tokenized text with phrases
model_phrases = Word2Vec(tokenized_text_phrases, vector_size=100, window=5, min_count=5, workers=cores, sg=0)



In [23]:
# Save the trained model
model.save("word2vec_uk.model")
model_phrases.save("word2vec_phrases_uk.model")

In [None]:
# Access a sample item from the dataset
sample_item = df_uk.iloc[1]  # Assuming the first item in the filtered dataframe
sample_text = sample_item['text']
product_id = sample_item['id']

# Print the word embeddings and corresponding product ID for the sample item
for word in sample_text.split():
    if word in model.wv:
        print(f"Word: {word} | Embedding: {model.wv[word]} | Product ID: {product_id}")

In [45]:

# Load the saved word2vec models
model = Word2Vec.load("word2vec_uk.model")
model_phrases = Word2Vec.load("word2vec_phrases_uk.model")

In [2]:
# Load the training data
products_train = pd.read_csv("/content/drive/MyDrive/products_train_processed.csv")
sessions_train = pd.read_csv("/content/drive/MyDrive/sessions_train.csv")

# Filter the data for the UK locale
uk_sessions_train = sessions_train[sessions_train['locale'] == 'UK']

# Merge sessions_train with products_train to get the product information
merged_data = pd.merge(uk_sessions_train, products_train, left_on='next_item', right_on='id', how='left')

# Filter the data to include only the required columns
filtered_data = merged_data[['prev_items', 'next_item']]

NameError: ignored

In [None]:
# Split the prev_items column into individual product IDs
filtered_data['prev_items'] = filtered_data['prev_items'].apply(lambda x: x.split())

# Extract the input features and target labels
input_features = filtered_data['prev_items']
target_labels = filtered_data['next_item']

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Transformer

# Define the transformer-based model
def build_recommendation_model(vocab_size, embed_dim, num_heads, feed_forward_dim):
    # Input layer
    inputs = Input(shape=(None,))

    # Embedding layer
    embedding = Embedding(vocab_size, embed_dim)(inputs)

    # Transformer layer
    transformer = Transformer(num_heads=num_heads, d_model=embed_dim,
                              ff_dim=feed_forward_dim, activation='relu')(embedding)

    # Output layer
    outputs = Dense(vocab_size, activation='softmax')(transformer)

    # Create the model
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    return model

# Set the hyperparameters
vocab_size = len(model.wv.vocab)  # Adjust based on your vocabulary size
embed_dim = 300  # Adjust based on the dimensionality of your word embeddings
num_heads = 4  # Adjust based on the desired number of attention heads
feed_forward_dim = 512  # Adjust based on the desired size of the feed-forward layer

# Build the recommendation model
recommendation_model = build_recommendation_model(vocab_size, embed_dim, num_heads, feed_forward_dim)

In [None]:
# Define the loss function and optimizer
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()

# Compile the model
recommendation_model.compile(optimizer=optimizer, loss=loss_fn)

# Convert the input features and target labels to tensors
input_features = tf.ragged.constant(input_features.tolist())
target_labels = tf.convert_to_tensor(target_labels)

# Train the model for a few epochs
num_epochs = 10
recommendation_model.fit(input_features, target_labels, epochs=num_epochs)

In [None]:
# Load the test data
sessions_test = pd.read_csv("sessions_test_task1_phase1.csv")
uk_sessions_test = sessions_test[sessions_test['locale'] == 'UK']

In [None]:
# Split the prev_items column into individual product IDs
uk_sessions_test['prev_items'] = uk_sessions_test['prev_items'].apply(lambda x: x.split())

# Convert the input features to a list
input_features = uk_sessions_test['prev_items'].tolist()

In [None]:
# Convert the input features to tensors
input_features = tf.ragged.constant(input_features)

# Generate predictions
predictions = recommendation_model.predict(input_features)

# Get the top 100 recommended product IDs for each session
recommended_product_ids = []
for prediction in predictions:
    top_product_ids = tf.argsort(prediction, direction='DESCENDING')[:100]
    recommended_product_ids.append(top_product_ids)

# Convert the recommended product IDs to a list
recommended_product_ids = [ids.numpy().tolist() for ids in recommended_product_ids]

In [None]:
# Create a DataFrame with the results
results = pd.DataFrame({
    'prev_items': uk_sessions_test['prev_items'],
    'locale': uk_sessions_test['locale'],
    'recommended_product_ids': recommended_product_ids
})

# Save the results to a CSV file
results.to_csv("/content/drive/MyDrive/RESULTS.csv", index=False)