In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so that files
# stored on the Drive can be opened through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import multiprocessing

import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser

try:
    from tensorflow.keras.layers import Input, Embedding, Dense, Transformer
except ImportError:
    # tf.keras.layers ships no `Transformer` layer, so the combined import
    # above fails. Fall back to the layers that do exist and keep the name
    # bound so the rest of the module can still be imported.
    from tensorflow.keras.layers import Input, Embedding, Dense
    Transformer = None

# Function to train a Word2Vec model
def train_word2vec_model(locale):
    # Load the preprocessed data for the specified locale
    df_processed = pd.read_csv('/content/drive/MyDrive/products_train_processed.csv')
    df_locale = df_processed[df_processed['locale'] == locale]
    df_locale['text'] = df_locale['brand'] + ' ' + df_locale['title']
    df_locale = df_locale.dropna(subset=['text'])
    tokenized_text = [text.split() for text in df_locale['text']]

    # Train the Word2Vec model
    cores = multiprocessing.cpu_count()
    model = Word2Vec(tokenized_text, vector_size=100, window=5, min_count=5, workers=cores, sg=0)

    # Optional: Train a phrase model to capture common phrases
    phrases = Phrases(tokenized_text, min_count=5, threshold=10)
    bigram = Phraser(phrases)
    tokenized_text_phrases = [bigram[doc] for doc in tokenized_text]

    # Train the Word2Vec model on the tokenized text with phrases
    model_phrases = Word2Vec(tokenized_text_phrases, vector_size=100, window=5, min_count=5, workers=cores, sg=0)

    # Save the trained models
    model.save(f"word2vec_{locale.lower()}.model")
    model_phrases.save(f"word2vec_phrases_{locale.lower()}.model")

# Function to build and train the recommendation model
def train_recommendation_model(locale):
    # Load the training data for the specified locale
    products_train = pd.read_csv("/content/drive/MyDrive/products_train_processed.csv")
    sessions_train = pd.read_csv("/content/drive/MyDrive/sessions_train.csv")
    sessions_train_locale = sessions_train[sessions_train['locale'] == locale]
    merged_data = pd.merge(sessions_train_locale, products_train, left_on='next_item', right_on='id', how='left')
    filtered_data = merged_data[['prev_items', 'next_item']]
    filtered_data['prev_items'] = filtered_data['prev_items'].apply(lambda x: x.split())
    input_features = filtered_data['prev_items']
    target_labels = filtered_data['next_item']

    # Load the Word2Vec model
    model = Word2Vec.load(f"word2vec_{locale.lower()}.model")

    # Build the recommendation model
    vocab_size = len(model.wv.vocab)
    embed_dim = 300
    num_heads = 4
    feed_forward_dim = 512

    inputs = Input(shape=(None,))
    embedding = Embedding(vocab_size, embed_dim)(inputs)
    transformer = Transformer(num_heads=num_heads, d_model=embed_dim,
                              ff_dim=feed_forward_dim, activation='relu')(embedding)
    outputs = Dense(vocab_size, activation='softmax')(transformer)
    recommendation_model = tf.keras.Model(inputs=inputs, outputs=outputs)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    optimizer = tf.keras.optimizers.Adam()
    recommendation_model.compile(optimizer=optimizer, loss=loss_fn)

    # Convert input features and target labels to tensors
    input_features = tf.ragged.constant(input_features.tolist())
    target_labels = tf.convert_to_tensor(target_labels)

    # Train the model
    num_epochs = 10
    recommendation_model.fit(input_features, target_labels, epochs=num_epochs)

    # Return the trained model
    return recommendation_model

# Function to generate recommendations for a given locale and test data
def generate_recommendations(locale, test_data):
    # Load the Word2Vec and recommendation models
    model = Word2Vec.load(f"word2vec_{locale.lower()}.model")
    recommendation_model = train_recommendation_model(locale)

    # Preprocess the test data for the specified locale
    test_data_locale = test_data[test_data['locale'] == locale]
    test_data_locale['prev_items'] = test_data_locale['prev_items'].apply(lambda x: x.split())
    input_features = test_data_locale['prev_items'].tolist()
    input_features = tf.ragged.constant(input_features)

    # Generate predictions
    predictions = recommendation_model.predict(input_features)

    # Get the top 100 recommended product IDs for each session
    recommended_product_ids = []
    for prediction in predictions:
        top_product_ids = tf.argsort(prediction, direction='DESCENDING')[:100]
        recommended_product_ids.append(top_product_ids)

    # Convert the recommended product IDs to a list
    recommended_product_ids = [ids.numpy().tolist() for ids in recommended_product_ids]

    # Return the recommended product IDs
    return recommended_product_ids

# Load the test data
sessions_test = pd.read_csv("sessions_test_task1_phase1.csv")

# Generate recommendations for each locale and save the results
locales = ['DE', 'JP', 'ES', 'FR', 'IT']
for locale in locales:
    # Train the Word2Vec model for the locale
    train_word2vec_model(locale)

    # Generate recommendations for the locale
    recommended_product_ids = generate_recommendations(locale, sessions_test)

    # Create a DataFrame with the results
    results = pd.DataFrame({
        'prev_items': sessions_test[sessions_test['locale'] == locale]['prev_items'],
        'locale': sessions_test[sessions_test['locale'] == locale]['locale'],
        'recommended_product_ids': recommended_product_ids
    })

    # Save the results to a CSV file
    results.to_csv(f"RESULTS_{locale}.csv", index=False)
