In [None]:
# Mount Google Drive: the data files read later in this notebook live under
# /content/gdrive, so a fresh kernel needs the mount before anything else runs.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab; the data directory must already be reachable.
    print("google.colab not available; skipping Drive mount")
else:
    drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install tensorflow_recommenders

In [None]:
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs


# Directory on the mounted Google Drive that holds the project CSV files.
data_path = '/content/gdrive/MyDrive/567_final_proj/data'

# Training window: only transactions dated on or after `date` are kept.
# Cutoffs noted during experimentation: 16 weeks -> 2020-06-02,
# 8 weeks -> 2020-07-28, one year -> 2019-09-22.
# NOTE(review): the active value below matches none of those - confirm it is intended.
date = '2020-05-06'

# Every column is read as a string, so values such as the zero-padded article
# ids are not parsed as numbers. `t_dat` is compared as text below, which
# presumably relies on ISO `YYYY-MM-DD` formatting - TODO confirm.
transactions_df = pd.read_csv(f"{data_path}/transactions_cut.csv", header=0, dtype=str)
transactions_df = transactions_df.loc[transactions_df["t_dat"] >= date]
customers_df = pd.read_csv(f"{data_path}/customers_clustered.csv", header=0, dtype=str)
articles_df = pd.read_csv(f"{data_path}/articles.csv", header=0, dtype=str)

# Fallback recommendations (12 article ids each), one list per customer cluster.
cluster0_popularity = [
    '0706016001', '0720125001', '0706016002', '0610776002',
    '0372860001', '0759871002', '0751471001', '0706016003',
    '0464297007', '0372860002', '0562245046', '0610776001',
]
cluster1_popularity = [
    '0706016001', '0706016002', '0720125001', '0372860001',
    '0610776002', '0759871002', '0751471001', '0706016003',
    '0464297007', '0372860002', '0562245046', '0448509014',
]
cluster2_popularity = [
    '0706016001', '0720125001', '0706016002', '0372860001',
    '0759871002', '0610776002', '0751471001', '0706016003',
    '0464297007', '0372860002', '0562245046', '0448509014',
]

# Split customers by whether they appear in the retained transactions:
# those who do get model predictions, the rest get the popularity fallback.
has_recent_purchase = customers_df["customer_id"].isin(transactions_df["customer_id"].unique())

customer_need_predict = customers_df[has_recent_purchase]
print(len(customer_need_predict))
customer_need_popularity = customers_df[~has_recent_purchase]
print(len(customer_need_popularity))


460815
911165


In [None]:
# Number of interactions per training batch.
BATCH_SIZE = 10000
# Number of articles recommended per customer.
TOP_K = 12

# One training example per transaction row: {'article_id': ..., 'customer_id': ...}.
interaction_columns = ['article_id', 'customer_id']
train_dataset = tf.data.Dataset.from_tensor_slices(
    {column: transactions_df[column] for column in interaction_columns}
)

# Candidate set: every article id in the catalogue.
article_dataset = tf.data.Dataset.from_tensor_slices(articles_df['article_id'])

class TwoTowerModel(tfrs.Model):
    """Two-tower retrieval model over customer ids and article ids.

    Each tower is a string lookup followed by an embedding table. The
    vocabularies come from the notebook-level ``customers_df`` and
    ``articles_df`` frames, and the retrieval metric ranks against the
    notebook-level ``article_dataset``.

    Args:
        embedding_dim: Width of both embedding tables.
        eval_batch_size: Batch size used when embedding the candidate
            articles for the FactorizedTopK metric.
    """

    def __init__(self, embedding_dim=32, eval_batch_size=128):
        super().__init__()
        user_vocabulary = customers_df['customer_id'].to_numpy()
        article_vocabulary = articles_df['article_id'].to_numpy()

        # Embedding tables are sized from the vocabularies (plus one extra row,
        # as in the original sizing, for ids outside the vocabulary) so they
        # cannot drift out of sync with the data.
        self.user_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=user_vocabulary),
            tf.keras.layers.Embedding(len(user_vocabulary) + 1, embedding_dim)
        ])

        # Article tower. The attribute keeps the name `movie_model` because
        # code outside the class accesses `model.movie_model`.
        self.movie_model = tf.keras.Sequential([
            tf.keras.layers.experimental.preprocessing.StringLookup(vocabulary=article_vocabulary),
            tf.keras.layers.Embedding(len(article_vocabulary) + 1, embedding_dim)
        ])

        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=article_dataset.batch(eval_batch_size).map(self.movie_model)
            )
        )

    def compute_loss(self, features, training=False):
        """Return the retrieval loss for one batch of interactions.

        Args:
            features: Mapping with 'customer_id' and 'article_id' entries.
            training: True when called from a training step.

        Returns:
            The loss produced by the retrieval task.
        """
        user_embeddings = self.user_model(features['customer_id'])
        movie_embeddings = self.movie_model(features['article_id'])
        # The top-K metric scores every query against the whole catalogue;
        # skipping it on training steps keeps fitting fast. It is still
        # computed when the model is evaluated.
        return self.task(user_embeddings, movie_embeddings, compute_metrics=not training)

# Fix TensorFlow's global seed before the model is built so embedding
# initialisation and shuffling are repeatable across runs.
SEED = 42
SHUFFLE_BUFFER = 100_000
tf.random.set_seed(SEED)

model = TwoTowerModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))

# Shuffle before batching: without it every batch is a contiguous slice of the
# transactions frame (presumably in file order - TODO confirm), and the batch
# contents would be identical in every epoch.
train_batches = (
    train_dataset
    .shuffle(SHUFFLE_BUFFER, seed=SEED, reshuffle_each_iteration=True)
    .batch(BATCH_SIZE)
)
model.fit(train_batches, verbose=True, epochs=3)

# Brute-force top-K index: customer ids are embedded by the user tower and
# scored against every article embedding, returning the TOP_K article ids.
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=TOP_K)
candidate_batches = article_dataset.batch(100)
index.index_from_dataset(
  tf.data.Dataset.zip((candidate_batches, candidate_batches.map(model.movie_model)))
)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x7f2d41fb2790>

In [None]:
print("start prediction")

# Customers are processed in chunks of this size: the index is queried once per
# chunk and one progress line is printed per chunk (i.e. every 1000 customers).
CHUNK_SIZE = 1000

# Built once: set membership is O(1), instead of recomputing `.unique()` and
# scanning the resulting array for every single customer.
model_customer_ids = set(customer_need_predict['customer_id'])

# Cluster label -> fallback list; any other label falls back to cluster 2.
fallback_by_cluster = {'0': cluster0_popularity, '1': cluster1_popularity}

customer_ids = customers_df['customer_id'].tolist()
cluster_labels = customers_df['clusters'].tolist()
predictions = []

for start in range(0, len(customer_ids), CHUNK_SIZE):
  chunk_ids = customer_ids[start:start + CHUNK_SIZE]
  # Start every row on its cluster's popularity list ...
  chunk_preds = [
    fallback_by_cluster.get(label, cluster2_popularity)
    for label in cluster_labels[start:start + CHUNK_SIZE]
  ]
  # ... then overwrite the rows of customers seen in training with model output.
  model_rows = [pos for pos, c_id in enumerate(chunk_ids) if c_id in model_customer_ids]
  if model_rows:
    _, titles = index(tf.constant([chunk_ids[pos] for pos in model_rows]))
    for pos, row in zip(model_rows, titles[:, :TOP_K].numpy()):
      chunk_preds[pos] = [article.decode("utf-8") for article in row]
  predictions.extend(chunk_preds)
  print(start)
  print(f"Recommendations for user {chunk_ids[0]}: {chunk_preds[0]}")

# One row per customer, with the recommended article ids space-separated.
results_df = pd.DataFrame({
  'customer_id': customer_ids,
  'prediction': [" ".join(articles) for articles in predictions],
})
print(results_df.head())
results_df.to_csv('submission.csv', index=False)