In [None]:
!pip install recommenders
!pip install tf_slim

In [None]:
import pandas as pd
import numpy as np
import time
from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
# from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from recommenders.utils.constants import SEED as DEFAULT_SEED

# Static list of twelve article ids; nothing in the visible code reads it.
pp = ['0909370001', '0924243001', '0918522001', '0865799006', '0751471001', '0448509014', '0762846027', '0918292001', '0923758001', '0924243002', '0915529003', '0850917001']

# Run-wide settings.
TOP_K = 12
EPOCHS = 50
BATCH_SIZE = 1024
SEED = DEFAULT_SEED

# Raw competition tables plus the externally produced candidate list
# (one whitespace-separated `prediction` string per customer).
transactions_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    header=0,
)
customers_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/customers.csv",
    header=0,
)
articles_df = pd.read_csv(
    "../input/h-and-m-personalized-fashion-recommendations/articles.csv",
    header=0,
)
rec_df = pd.read_csv(
    "../input/my-inputs/rec_item.csv",
    header=0,
)
# For a quick smoke test, truncate here: transactions_df = transactions_df[:10000]

# Give every customer / article a dense integer id equal to its row position.
customers_df['userID'] = range(len(customers_df))
articles_df['itemID'] = range(len(articles_df))

# Attach the integer ids to the transactions and to the candidate list.
# `merge` defaults to an inner join, so rows whose key is missing from the
# lookup table are dropped.
transactions_df = (
    transactions_df
    .merge(customers_df[['customer_id', 'userID']], on='customer_id')
    .merge(articles_df[['article_id', 'itemID']], on='article_id')
)
rec_df = rec_df.merge(customers_df[['customer_id', 'userID']], on='customer_id')

# Split the transactions chronologically into train/test and collapse them
# into one (userID, itemID, rating) row per pair, where `rating` is the number
# of purchases of that article by that customer.
print("start data processing")

# Alternative kept for reference (disabled): split on a fixed calendar date.
#   date = '2020-05-06'
#   train = transactions_df[transactions_df.t_dat <= date]
#   test = transactions_df[transactions_df.t_dat > date]

# The timestamp is only used by python_chrono_split to order the purchases,
# so any value that increases with the date works. Parsing the whole column
# at once replaces a per-row Python strptime/mktime call; the result is whole
# seconds since 1970-01-01 (timezone-naive) instead of local-time floats,
# which yields the same ordering.
transactions_df['timestamp'] = (
    pd.to_datetime(transactions_df['t_dat'], format='%Y-%m-%d') - pd.Timestamp('1970-01-01')
) // pd.Timedelta('1s')

# 75% / 25% chronological split using python_chrono_split's default column
# names (userID, itemID, timestamp).
train, test = python_chrono_split(transactions_df, [0.75, 0.25])
train = train.drop(['timestamp', 't_dat'], axis=1)
test = test.drop(['timestamp', 't_dat'], axis=1)

# Purchase count per (user, item) pair becomes the rating.
train = train.groupby(['userID', 'itemID']).size().reset_index(name='rating')
test = test.groupby(['userID', 'itemID']).size().reset_index(name='rating')

# Keep only test rows whose user and item both occur in the training split.
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]
# leave_one_out_test = test.groupby("userID").last().reset_index()


# Persist both splits as CSV, build the file-backed NCF dataset from them,
# then construct and fit the NeuMF model.
train_file = "./train.csv"
test_file = "./test.csv"

print("save data")
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)
print(train.head())

# NOTE(review): overwrite_test_file_full=True presumably regenerates the
# negative-sampled "full" test file on every run -- confirm against the
# recommenders NCF Dataset documentation.
data = NCFDataset(
    train_file=train_file,
    test_file=test_file,
    seed=SEED,
    overwrite_test_file_full=True,
)

# Model hyper-parameters gathered in one place; user/item counts come from
# the dataset object.
ncf_params = dict(
    n_users=data.n_users,
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16, 8, 4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=1,
    seed=SEED,
)
model = NCF(**ncf_params)

print("start model training")
model.fit(data)

# Disabled brute-force evaluation, kept as an inert string literal: it scores
# every article for every test user and counts test items found in the top 12.
# NOTE(review): as written it would not run correctly -- `correct` is never
# initialised in the visible code, and `article in per_user_pred` compares an
# itemID against (user, item, pred) tuples, so it can never match.
"""print("eval:")
for user in test['userID']:
    print("user:", user)
    per_user_pred = []
    for item in articles_df['itemID']:
        print("item:", item)
        if item not in train["itemID"].unique():
            pred = 0
        else:
            pred = model.predict(user, item)
        per_user_pred.append((user, item, pred))
    per_user_pred = sorted(per_user_pred, key=lambda pred: pred[2], reverse=True)[:12]
    for article in test[test.userID == user]['itemID']:
        if article in per_user_pred:
            correct+=1
            
print("accuracy:", float(correct)/len(test))
"""
# Re-rank each customer's pre-computed candidate articles with the trained NCF
# model and write the submission file.
#
# For every row of rec_df:
#   * customers absent from the training split keep their first TOP_K
#     candidates in the given order (the model cannot score them);
#   * otherwise each candidate is scored with model.predict, candidates whose
#     article never appeared in training get score 0, and the TOP_K highest
#     scores are kept (ties keep their original candidate order).
#
# Each row is ranked from its own `prediction` string and labelled with its
# own `customer_id`. This matches the previous per-user DataFrame look-ups
# whenever rec_df holds one row per customer, and it also fixes the case of
# an empty candidate string, where the customer id used to be taken from the
# previous iteration.
print("start prediction")

# Build the membership sets and the article -> itemID lookup once; doing the
# equivalent DataFrame scans inside the loops is quadratic in the table sizes.
train_users = set(train["userID"])
train_items = set(train["itemID"])
# drop_duplicates keeps the first occurrence of each article_id.
# An article id missing from articles_df raises KeyError here.
item_id_by_article = (
    articles_df.drop_duplicates(subset="article_id")
    .set_index("article_id")["itemID"]
    .to_dict()
)

customer_ids = []
predictions = []
counter1 = 0  # customers not seen in training (candidates passed through)
counter2 = 0  # customers re-ranked by the model
for user, customer_id, rec_string in zip(
    rec_df["userID"], rec_df["customer_id"], rec_df["prediction"]
):
    print("userID:", user)
    recs = rec_string.split()
    if user not in train_users:
        counter1 += 1
        customer_ids.append(customer_id)
        predictions.append(recs[:TOP_K])
        continue
    counter2 += 1
    scored = []
    for article_id in recs:
        # Candidates are zero-padded strings; articles_df is compared by int,
        # as in the original look-up.
        item = item_id_by_article[int(article_id)]
        pred = model.predict(user, item) if item in train_items else 0
        scored.append((article_id, pred))
    # list.sort is stable, so equal scores keep the candidate order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    customer_ids.append(customer_id)
    predictions.append([article_id for article_id, _ in scored[:TOP_K]])

print("generating results")
results_df = pd.DataFrame(list(zip(customer_ids, predictions)), columns=['customer_id', 'prediction'])
# The submission format wants the article ids as one space-separated string.
results_df["prediction"] = results_df["prediction"].apply(lambda x: " ".join(x))
print(results_df.head())
results_df.to_csv('submission.csv', index=False)
# Summary of the prediction pass: counter1 counts customers that were absent
# from the training split (their candidates were passed through unranked),
# counter2 counts customers whose candidates were re-ranked by the model.
print("the user counter that not inside", counter1)
print("inside:", counter2)

# Everything below is disabled experimentation kept as inert string literals.

# Disabled debugging variant of the prediction loop: wrapped in a Timer, it
# prints every intermediate value and stops after the first customer (`break`).
"""
with Timer() as test_time:
    for user in rec_df['userID']:
        per_user_pred = []
        recs = rec_df[rec_df.userID == user]['prediction'].iloc[0].split()
        if user not in train['userID'].unique():
            customer_ids.append(customers_df[customers_df.userID == user]["customer_id"].iloc[0])
            predictions.append(recs[:12])
            continue
        for article_id in recs:
            print("article_id", article_id)
            item = articles_df[articles_df.article_id == int(article_id)]['itemID'].iloc[0]
            print("item:", item)
            if item not in train["itemID"].unique():
                print("NOT! inside training set")
                pred = 0
            else:
                print("inside training set")
                pred = model.predict(user, item)
            customer_id = customers_df[customers_df.userID == user]["customer_id"].iloc[0]
            per_user_pred.append((customer_id, article_id, pred))
        per_user_pred = sorted(per_user_pred, key=lambda pred: pred[2], reverse=True)[:12]
        customer_ids.append(customer_id)
        predictions.append([tuple[1] for tuple in per_user_pred])
        print("final prediction:", predictions)
        break

print("Took {} seconds for prediction.".format(test_time.interval))
"""
# Disabled: score every (userID, itemID) row of `test` one at a time and dump
# the result to prediction.csv.
"""# predict the data in the test set
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
               for (_, row) in test.iterrows()]

predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
print(predictions.head())
predictions.to_csv("prediction.csv", index=False)
"""

# Disabled: score every training user against every training item in batches
# (is_list=True), drop the pairs already present in `train`, and report
# MAP / NDCG / precision / recall at TOP_K against `test`.
"""
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

eval_map = map_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
      "NDCG:\t%f" % eval_ndcg,
      "Precision@K:\t%f" % eval_precision,
      "Recall@K:\t%f" % eval_recall, sep='\n')
"""
# Disabled: hit-ratio / NDCG loop over data.test_loader(). For each batch the
# rank is the number of scores >= the first score (output[0]); a rank within
# TOP_K counts as a hit.
# NOTE(review): presumably the first element of each batch is the held-out
# positive item -- confirm against the dataset's test_loader. The gain uses
# the natural log (np.log); check whether log base 2 was intended.
"""
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)
"""
