# Packages

In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd

from constants import Constant as C
from loaders import load_ratings
from models import ContentBased

df_genome_tags columns: Index(['tagId', 'tag'], dtype='object')
df_genome_scores columns: Index(['movieId', 'tagId', 'relevance'], dtype='object')
merged columns: Index(['tagId', 'tag', 'movieId', 'relevance'], dtype='object')
      movieId                                                tag
0           1  [007, 007 (series), 18th century, 1920s, 1930s...
1           2  [007, 007 (series), 18th century, 1920s, 1930s...
2           3  [007, 007 (series), 18th century, 1920s, 1930s...
3           4  [007, 007 (series), 18th century, 1920s, 1930s...
4           5  [007, 007 (series), 18th century, 1920s, 1930s...
...       ...                                                ...
8367   160954  [007, 007 (series), 18th century, 1920s, 1930s...
8368   161582  [007, 007 (series), 18th century, 1920s, 1930s...
8369   161594  [007, 007 (series), 18th century, 1920s, 1930s...
8370   163056  [007, 007 (series), 18th century, 1920s, 1930s...
8371   163949  [007, 007 (series), 18th century, 1920s, 19

# How to generate predictions for the hackathon?
To submit predictions to the hackathon: 
- make sure you are using the latest version of your ContentBased model, and use your evaluator notebook to assess its quality
- run the make_hackathon_prediction() function in this notebook with your best feature_method and regressor_method. This will generate a ratings_predictions.csv file in the current working directory
- download ratings_predictions.csv and upload it to your group's OneDrive

In [7]:
def make_hackathon_prediction(feature_method, regressor_method):
    """Generate a prediction file on the hackathon test set.

    Trains a ``ContentBased`` model on the full training set returned by
    ``load_ratings`` and writes its estimates for every row of
    ``<DATA_PATH>/evidence/ratings_test.csv`` to ``ratings_predictions.csv``
    in the current working directory.

    Args:
        feature_method: Forwarded as the first argument of ``ContentBased``.
        regressor_method: Forwarded as the second argument of ``ContentBased``.

    Raises:
        AssertionError: If ``C.DATA_PATH`` does not point to ``data/hackathon``.
    """
    # Local import so this cell does not depend on another cell's imports.
    from pathlib import Path

    # 1) load train data - make sure to redirect the DATA_PATH to 'data/hackathon'
    print(C.DATA_PATH)
    data_path = Path(C.DATA_PATH)
    expected_path = Path('data') / 'hackathon'
    # Compare Path objects rather than strings: the check then passes with
    # either path separator and avoids the invalid '\h' escape sequence.
    # Raised explicitly (not via `assert`) so the guard survives `python -O`.
    if data_path != expected_path:
        raise AssertionError(
            f"C.DATA_PATH must point to {expected_path}, got {data_path}"
        )
    sp_ratings = load_ratings(surprise_format=True)
    train_set = sp_ratings.build_full_trainset()

    # 2) train your ContentBased model on the train set
    content_knn = ContentBased(feature_method, regressor_method)
    content_knn.fit(train_set)

    # 3) make predictions on the test set
    # The test file is resolved from the validated data path instead of
    # repeating the directory as a second hard-coded string.
    test_path = data_path / 'evidence' / 'ratings_test.csv'
    df_test = pd.read_csv(test_path)[C.USER_ITEM_RATINGS]
    test_records = list(df_test.to_records(index=False))
    predictions = content_knn.test(test_records)
    # Each prediction unpacks to five fields; only the user id, the item id
    # and the estimate are written out.
    output_predictions = [
        [uid, iid, est] for uid, iid, _, est, _ in predictions
    ]
    df_predictions = pd.DataFrame(data=output_predictions, columns=df_test.columns)

    # 4) dump predictions
    df_predictions.to_csv('ratings_predictions.csv', index=False)

    
# Produce the submission file with feature_method='all' and regressor_method='linear'.
make_hackathon_prediction('all', 'linear')

data\hackathon
df_genome_tags columns: Index(['tagId', 'tag'], dtype='object')
df_genome_scores columns: Index(['movieId', 'tagId', 'relevance'], dtype='object')
merged columns: Index(['tagId', 'tag', 'movieId', 'relevance'], dtype='object')
      movieId                                                tag
0           1  [007, 007 (series), 18th century, 1920s, 1930s...
1           2  [007, 007 (series), 18th century, 1920s, 1930s...
2           3  [007, 007 (series), 18th century, 1920s, 1930s...
3           4  [007, 007 (series), 18th century, 1920s, 1930s...
4           5  [007, 007 (series), 18th century, 1920s, 1930s...
...       ...                                                ...
8367   160954  [007, 007 (series), 18th century, 1920s, 1930s...
8368   161582  [007, 007 (series), 18th century, 1920s, 1930s...
8369   161594  [007, 007 (series), 18th century, 1920s, 1930s...
8370   163056  [007, 007 (series), 18th century, 1920s, 1930s...
8371   163949  [007, 007 (series), 18th cen