In [None]:
from __future__ import division

import numpy as np
import pandas as pd

In [None]:
# Read train.csv from the local data folder and preview its first rows.
train_csv_path = "./data/train.csv"
data_train = pd.read_csv(train_csv_path)
data_train.head()

In [None]:
# Read challenge_data.csv from the local data folder and preview its first rows.
challenge_csv_path = "./data/challenge_data.csv"
data_challenge = pd.read_csv(challenge_csv_path)
data_challenge.head()

In [None]:
# Read test.csv from the local data folder and preview its first rows.
test_csv_path = "./data/test.csv"
data_test = pd.read_csv(test_csv_path)
data_test.head()

In [None]:
# Store user ids as strings (each value is passed through str()), then summarise the column.
train_user_ids = data_train['user_id'].map(str)
data_train['user_id'] = train_user_ids
data_train['user_id'].describe()

In [None]:
# Store user ids as strings (each value is passed through str()), then summarise the column.
test_user_ids = data_test['user_id'].map(str)
data_test['user_id'] = test_user_ids
data_test['user_id'].describe()

In [None]:
# Stack the train and test rows into one frame, ordered by user id.
# The original row labels of both inputs are kept (no index reset).
data_all = pd.concat([data_train, data_test]).sort_values(by='user_id')

data_all.info()

In [None]:
# Summary statistics of the combined user id column.
data_all.user_id.describe()

In [None]:
# Count missing values per column of the combined frame.
data_all.isna().sum()

In [None]:
# Invert the sequence number (x -> 14 - x) so it can be fed to the recommender
# as a rating.
# NOTE(review): presumably sequences run 1..13, which keeps the result inside
# the (1, 13) rating scale given to the Reader further down -- confirm.
# Caution: this cell is not idempotent; running it a second time flips the
# values back to the originals.
data_all['challenge_sequence'] = 14 - data_all['challenge_sequence']
data_all.head(20)

In [None]:
# Keep only the three columns the recommender needs, in the order it expects
# them: user id, item id, rating.
rating_columns = ['user_id', 'challenge', 'challenge_sequence']
data_all = data_all.loc[:, rating_columns]
data_all.info()
data_all.head()

In [None]:
# -  get the prediction dataset as the test dataset
# data_pred = data_all[data_all['user_id'].isin(data_test['user_id'].apply(lambda x: str(x)))]
# data_pred.info()
# data_pred.head()

In [None]:
import pandas as pd

from surprise import SVD
from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader

In [None]:
# When loading from a DataFrame the Reader only needs the rating_scale parameter.
reader = Reader(rating_scale=(1, 13))

# load_from_df expects exactly three columns: user id, item id and rating,
# in that order.
data = Dataset.load_from_df(df=data_all.iloc[:], reader=reader)

In [None]:
# Build a trainset from every available rating (no hold-out split).
trainset = data.build_full_trainset()

# Build an algorithm and train it.
# NormalPredictor is the algorithm in use; KNNBasic() or SVD() (both imported
# in the imports cell above) can be swapped in here instead.
#
# Only NormalPredictor is imported: the `evaluate` helper that used to be
# imported alongside it was never used in this notebook and has been removed
# from Surprise (1.1.0+), where importing it raises ImportError.
from surprise import NormalPredictor
algo = NormalPredictor()
algo.fit(trainset)

In [None]:
# uid = '4577'  # raw user id (as in the ratings file).
# iid = 'CI23855'  # raw item id (as in the ratings file).

# get a prediction for specific users and items, the rating is supposed to be 13.
# Distinct users that appear in the test file.
pred_user_id_list = data_test['user_id'].unique()
print(pred_user_id_list.size)

# Distinct challenges that appear anywhere in the combined train + test data.
pred_challenge_id_list = data_all['challenge'].unique()
print(pred_challenge_id_list.size)

# pred_challenge_id_list = data_challenge['challenge_ID'].unique()
# print(len(pred_challenge_id_list))


In [None]:
# Score candidate (user, challenge) pairs and keep the ones rated above a threshold.
#
# Only the first MAX_PRED_USERS users and the first MAX_PRED_CHALLENGES
# challenges are scored; raise these limits to cover more of the test set.
MAX_PRED_USERS = 3
MAX_PRED_CHALLENGES = 100
MIN_RATING = 7.0  # a pair is kept only if its estimated rating is strictly above this

# NormalPredictor draws its estimates from numpy's global RNG, so seed it here
# to make this cell produce the same predictions on every run.
np.random.seed(0)

# Collect rows in a plain list and build the DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and growing a frame row by row is
# quadratic. Building from records also gives `rating` a float dtype.
prediction_records = []
for pred_user_id in pred_user_id_list[:MAX_PRED_USERS]:
    # Challenges this user already has in the test file; computed once per
    # user and stored as a set for fast membership checks.
    seen_challenges = set(data_test.loc[data_test['user_id'] == pred_user_id, 'challenge'])
    for pred_challenge_id in pred_challenge_id_list[:MAX_PRED_CHALLENGES]:
        if pred_challenge_id in seen_challenges:
            continue
        pred = algo.predict(pred_user_id, pred_challenge_id, r_ui=None, verbose=False)
        if pred.est > MIN_RATING:
            prediction_records.append((pred_user_id, pred_challenge_id, pred.est))

predictions = pd.DataFrame(prediction_records, columns=['user', 'challenge', 'rating'])
predictions.info()
predictions.head(10)

In [None]:
# Rank each user's candidate challenges by estimated rating, highest first
# (rank 1 = best; ties share the average rank, pandas' default).
ratings_by_user = predictions.groupby('user')['rating']
predictions['rank'] = ratings_by_user.rank(ascending=False)
predictions.head()


In [None]:
# predictions = predictions[((predictions['user'].isin(data_pred['user_id'])) & \
#                           (predictions['challenge'].isin(data_pred['challenge']))) == False]
# predictions

In [None]:
# Keep the three best-ranked challenges per user, then order the rows by
# user and rank with a fresh 0..n-1 index.
predictions = (
    predictions
    .sort_values(['rank'])
    .groupby('user')
    .head(3)
    .sort_values(['user', 'rank'])
    .reset_index(drop=True)
)

predictions.head(20)

In [None]:
# Build the submission frame: one row per kept prediction, keyed by
# "<user>_<sequence>", where the sequence starts at FIRST_PRED_SEQ for every
# user and increases by one for each further row of that user.
#
# This replaces a row-by-row loop that used DataFrame.append (removed in
# pandas 2.0 and quadratic in the number of rows). cumcount() numbers each
# user's rows 0, 1, 2, ... in their current order, which matches the loop's
# numbering because `predictions` is sorted by user in the previous cell.
FIRST_PRED_SEQ = 11

seq_within_user = predictions.groupby('user').cumcount() + FIRST_PRED_SEQ
results = pd.DataFrame({
    'user_sequence': predictions['user'] + '_' + seq_within_user.astype(str),
    'challenge': predictions['challenge'],
}).reset_index(drop=True)

results.head(9)

In [None]:
# Write the submission file as UTF-8 without the index column.
# The ./submission directory must already exist.
submission_path = './submission/submission.csv'
results.to_csv(submission_path, index=False, encoding='utf-8')
