# So how do we fill these missing values?

In [1]:
import numpy as np
import sys
import os
from pathlib import Path
import pandas as pd

NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = str(Path(NOTEBOOK_DIR).resolve().parent)

sys.path.insert(1, PROJECT_DIR+"/tindar-engine")
PROJECT_DIR

import tindar

%load_ext autoreload
%autoreload 2

In [2]:
# n = 10
n = 500

tindar_problem = tindar.TindarGenerator(
    n, nan_probability=0.3, generation_kind="interesting",
    attractiveness_distr="uniform", unif_low=0.3, unif_high=0.8
)
tindar_problem.create_love_matrix()
love_matrix = tindar_problem.love_matrix
love_matrix

array([[ 0.,  1., nan, ...,  1.,  1.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  1.,  1.,  0.],
       ...,
       [nan,  0., nan, ...,  0.,  0.,  0.],
       [ 0.,  0., nan, ...,  0.,  0.,  0.],
       [ 0.,  1.,  0., ...,  0., nan,  0.]])

## We predict if a person is interested in the other with recommender engines

In this notebook, we will use scikit-suprise (https://surprise.readthedocs.io/en/stable/index.html), a library for collaborative filtering algorithms (an alternative would be sklearn SimpleImputer or other kinds of Imputers).<br>

An easy way to load datasets into surprise is by tranforming your data to a long datafrom and using surprise.Dataset.load_from_df.

Links on Recommender Engines:
- blog: https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada
- book: The Adaptive Web - Methods and Strategies of Web Personalization, Chapters 9 and 10 (Brusilovsky, Kobsa, Nejdl)


In [3]:
from surprise import SVD, KNNBaseline, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate
from surprise.reader import Reader


### Load data

In [4]:
df_love = pd.DataFrame(love_matrix)
print(df_love.shape)
df_love.head()

(500, 500)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,1.0,,,,,,0.0,1.0,0.0,...,,1.0,0.0,0.0,,,0.0,1.0,1.0,0.0
1,0.0,0.0,0.0,,0.0,,,0.0,0.0,0.0,...,,0.0,,,0.0,,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,,,,,0.0,,,1.0,1.0,0.0
3,0.0,,0.0,0.0,1.0,,0.0,0.0,1.0,0.0,...,0.0,0.0,,0.0,,1.0,0.0,,1.0,0.0
4,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,,,,0.0,0.0,0.0,0.0,,0.0


In [5]:
def wide_to_long(df_wide, column_names=['Row', 'Column', 'Value']):
    df_wide_transp = df_wide.transpose()
    df_long = pd.DataFrame(df_wide_transp.copy().unstack()).reset_index()
    df_long.columns = column_names
    df_long = df_long.dropna()

    return df_long

In [6]:
df_love_long = wide_to_long(df_love)
df_love_long.head()

Unnamed: 0,Row,Column,Value
0,0,0,0.0
1,0,1,1.0
7,0,7,0.0
8,0,8,1.0
9,0,9,0.0


In [7]:
RATING_SCALE = (0, 1)
reader = Reader(rating_scale=RATING_SCALE)
surprise_dataset = Dataset.load_from_df(
    df_love_long, reader
)

### Initialize model

In [8]:
model = SVD(n_factors=3)
# model = KNNBaseline(k=5)

### Split data

#### Simple split

In [9]:
trainset, testset = train_test_split(surprise_dataset, test_size=.25)

In [10]:
model.fit(trainset)
test_predictions = model.test(testset)
print(help(trainset.build_testset))
trainset_iterable = trainset.build_testset()
train_predictions = model.test(trainset_iterable)

Help on method build_testset in module surprise.trainset:

build_testset() method of surprise.trainset.Trainset instance
    Return a list of ratings that can be used as a testset in the
    :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
    method.
    
    The ratings are all the ratings that are in the trainset, i.e. all the
    ratings returned by the :meth:`all_ratings()
    <surprise.Trainset.all_ratings>` generator. This is useful in
    cases where you want to to test your algorithm on the trainset.

None


In [13]:
len(trainset_iterable)

131317

In [19]:
def surprise_predictions_to_df(predictions_surp):
    values = [(x.uid, x.iid, x.r_ui, x.est) for x in predictions_surp]
    
    df_predictions = pd.DataFrame(
        data=values,
        columns=["Row", "Column", "y", "probabilities"]
    )
    
    return df_predictions

In [20]:
df_test_preds = surprise_predictions_to_df(test_predictions)
df_test_preds["Set"] = "test"
df_train_preds = surprise_predictions_to_df(train_predictions)
df_test_preds["Set"] = "train"

print(df_test_preds.shape)
print(df_train_preds.shape)
print(df_love_long.shape)

assert len(df_test_preds) + len(df_train_preds) == len(df_love_long), \
       f"len(df_test_preds) + len(df_train_preds) = {len(df_test_preds) + len(df_train_preds)} "\
       f"len(df_love_long) {len(df_love_long)} "\


df_train_preds.head()

(43773, 5)
(131317, 4)
(175090, 3)


Unnamed: 0,Row,Column,y,probabilities
0,213,248,0.0,0.044541
1,213,95,1.0,0.598
2,213,64,0.0,0.011094
3,213,187,0.0,0.077845
4,213,330,0.0,0.112984


### Convert floats to labels

In [26]:
def round_probas(df_predictions):
    df_predictions["y_hat"] = df_predictions["probabilities"].copy()
    
    round_up_bool = df_predictions.loc[:, "probabilities"] > 0.5
    round_down_bool = df_predictions.loc[:, "probabilities"] <= 0.5
    
    df_predictions.loc[round_up_bool, "y_hat"] = 1
    df_predictions.loc[round_down_bool, "y_hat"] = 0
    
    return df_predictions

    
df_train_preds_complete = round_probas(df_train_preds)
df_test_preds_complete = round_probas(df_test_preds)

assert (df_train_preds_complete["probabilities"] - df_train_preds_complete["y_hat"]).abs().max() <= 0.5
assert (df_test_preds_complete["probabilities"] - df_test_preds_complete["y_hat"]).abs().max() <= 0.5

df_train_preds_complete.head()

Unnamed: 0,Row,Column,y,probabilities,y_hat
0,213,248,0.0,0.044541,0.0
1,213,95,1.0,0.598,1.0
2,213,64,0.0,0.011094,0.0
3,213,187,0.0,0.077845,0.0
4,213,330,0.0,0.112984,0.0


## Evaluate model

TODO: sklearn

In [28]:
(df_train_preds_complete["y"] == df_train_preds_complete["y_hat"]).sum()/len(df_train_preds_complete)

0.9704227175460908

In [29]:
(df_test_preds_complete["y"] == df_test_preds_complete["y_hat"]).sum()/len(df_test_preds_complete)

0.9676056016265735