# So how do we fill these missing values?

In [1]:
import numpy as np
import sys
import os
from pathlib import Path
import pandas as pd

NOTEBOOK_DIR = os.getcwd()
PROJECT_DIR = str(Path(NOTEBOOK_DIR).resolve().parent)

sys.path.insert(1, PROJECT_DIR+"/tindar-engine")
PROJECT_DIR

import tindar

%load_ext autoreload
%autoreload 2

In [2]:
# n = 10
n = 500

tindar_problem = tindar.TindarGenerator(
    n, nan_probability=0.3, generation_kind="interesting",
    attractiveness_distr="uniform", unif_low=0.3, unif_high=0.8
)
tindar_problem.create_love_matrix()
love_matrix = tindar_problem.love_matrix
love_matrix

array([[ 0.,  0.,  0., ..., nan,  0., nan],
       [ 1.,  0.,  1., ...,  0.,  0.,  1.],
       [nan,  0.,  0., ...,  0.,  0.,  0.],
       ...,
       [ 1.,  0., nan, ...,  0.,  0.,  1.],
       [ 1.,  0.,  0., ...,  0.,  0.,  0.],
       [nan,  0.,  0., ...,  0., nan,  0.]])

## We predict if a person is interested in the other with recommender engines

In this notebook, we will use scikit-suprise (https://surprise.readthedocs.io/en/stable/index.html), a library for collaborative filtering algorithms (an alternative would be sklearn SimpleImputer or other kinds of Imputers).<br>


In this part of the notebook, we:

1. Load the love matrix in a format that surprise understands
2. Split the known love ratings (the 0's and 1's) into train and test set
3. Fit a model on the train set
4. Predict on both the train and test set
5. Evaluate the model

If the model is good enough, we will use it to fill up the missing values. In that case, we:

1. Retrain the selected model on the full dataset
2. Predict the missing ratings
3. Convert surprises' predictions back to the original square matrix shape

Links on Recommender Engines:
- blog: https://towardsdatascience.com/introduction-to-recommender-systems-6c66cf15ada
- book: The Adaptive Web - Methods and Strategies of Web Personalization, Chapters 9 and 10 (Brusilovsky, Kobsa, Nejdl)


In [3]:
from surprise import SVD, KNNBaseline, Dataset, accuracy
from surprise.model_selection import train_test_split, cross_validate
from surprise.reader import Reader


### Load data

An easy way to load datasets into surprise is by tranforming your data to a long datafrom and using surprise.Dataset.load_from_df.


In [4]:
df_love = pd.DataFrame(love_matrix)
print(df_love.shape)
df_love.head()

(500, 500)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,,0.0,...,,0.0,,0.0,,0.0,0.0,,0.0,
1,1.0,0.0,1.0,,1.0,0.0,1.0,0.0,1.0,,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
3,1.0,,,0.0,,1.0,,0.0,,,...,0.0,,0.0,,1.0,1.0,,0.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,...,,0.0,,0.0,,0.0,,,0.0,0.0


In [5]:
print(f"There is {np.nanmean(df_love.values.reshape(-1))} love on a scale of 0 to 1")

There is 0.24048456740764432 love on a scale of 0 to 1


In [6]:
def wide_to_long(df_wide, column_names=['Row', 'Column', 'Value'], drop_na=True):
    df_wide_transp = df_wide.transpose()
    df_long = pd.DataFrame(df_wide_transp.copy().unstack()).reset_index()
    df_long.columns = column_names
    
    if drop_na:
        df_long = df_long.dropna()

    return df_long

In [7]:
df_love_long_all = wide_to_long(df_love, drop_na=False)

df_love_long = df_love_long_all.loc[df_love_long_all["Value"].notnull(), :]
df_love_long_nan = df_love_long_all.loc[df_love_long_all["Value"].isnull(), :]

print(df_love_long.shape)
df_love_long.head()

(175084, 3)


Unnamed: 0,Row,Column,Value
0,0,0,0.0
1,0,1,0.0
2,0,2,0.0
3,0,3,0.0
5,0,5,0.0


In [8]:
RATING_SCALE = (0, 1)
reader = Reader(rating_scale=RATING_SCALE)
surprise_dataset = Dataset.load_from_df(
    df_love_long, reader
)

### Initialize model

In [9]:
model = SVD(n_factors=3)
# model = KNNBaseline(k=5)

### Split data

#### Simple split

In [10]:
trainset, testset = train_test_split(surprise_dataset, test_size=.25)

In [11]:
print(type(trainset))
print(type(testset))

<class 'surprise.trainset.Trainset'>
<class 'list'>


In [12]:
testset[0]

(278, 219, 0.0)

In [13]:
model.fit(trainset)
test_predictions = model.test(testset)
print(help(trainset.build_testset))
trainset_iterable = trainset.build_testset()
train_predictions = model.test(trainset_iterable)

Help on method build_testset in module surprise.trainset:

build_testset() method of surprise.trainset.Trainset instance
    Return a list of ratings that can be used as a testset in the
    :meth:`test() <surprise.prediction_algorithms.algo_base.AlgoBase.test>`
    method.
    
    The ratings are all the ratings that are in the trainset, i.e. all the
    ratings returned by the :meth:`all_ratings()
    <surprise.Trainset.all_ratings>` generator. This is useful in
    cases where you want to to test your algorithm on the trainset.

None


In [14]:
def surprise_predictions_to_df(predictions_surp):
    values = [(x.uid, x.iid, x.r_ui, x.est) for x in predictions_surp]
    
    df_predictions = pd.DataFrame(
        data=values,
        columns=["Row", "Column", "y", "probabilities"]
    )
    
    return df_predictions

In [15]:
df_test_preds = surprise_predictions_to_df(test_predictions)
df_test_preds["Set"] = "test"
df_train_preds = surprise_predictions_to_df(train_predictions)
df_train_preds["Set"] = "train"

print(df_test_preds.shape)
print(df_train_preds.shape)
print(df_love_long.shape)

assert len(df_test_preds) + len(df_train_preds) == len(df_love_long), \
       f"len(df_test_preds) + len(df_train_preds) = {len(df_test_preds) + len(df_train_preds)} "\
       f"len(df_love_long) {len(df_love_long)} "\


df_train_preds.head()

(43771, 5)
(131313, 5)
(175084, 3)


Unnamed: 0,Row,Column,y,probabilities,Set
0,68,438,0.0,0.0,train
1,68,296,0.0,0.031249,train
2,68,61,0.0,0.208101,train
3,68,44,1.0,0.375039,train
4,68,266,0.0,0.0,train


### Convert floats to labels

In [16]:
def round_probas(df_predictions):
    df_predictions["y_hat"] = df_predictions["probabilities"].copy()
    
    round_up_bool = df_predictions.loc[:, "probabilities"] > 0.5
    round_down_bool = df_predictions.loc[:, "probabilities"] <= 0.5
    
    df_predictions.loc[round_up_bool, "y_hat"] = 1
    df_predictions.loc[round_down_bool, "y_hat"] = 0
    
    return df_predictions

    
df_train_preds_complete = round_probas(df_train_preds)
df_test_preds_complete = round_probas(df_test_preds)

assert (df_train_preds_complete["probabilities"] - df_train_preds_complete["y_hat"]).abs().max() <= 0.5
assert (df_test_preds_complete["probabilities"] - df_test_preds_complete["y_hat"]).abs().max() <= 0.5

df_train_preds_complete.head()

Unnamed: 0,Row,Column,y,probabilities,Set,y_hat
0,68,438,0.0,0.0,train,0.0
1,68,296,0.0,0.031249,train,0.0
2,68,61,0.0,0.208101,train,0.0
3,68,44,1.0,0.375039,train,0.0
4,68,266,0.0,0.0,train,0.0


## Evaluate model

In [17]:
from sklearn.metrics import (
    accuracy_score, roc_auc_score, f1_score,
    precision_score, recall_score, confusion_matrix
)

In [18]:
classification_score_funcs_dict = {
    "accuracy_score": accuracy_score,
    "roc_auc_score": roc_auc_score,
    "f1_score": f1_score,
    "precision_score": precision_score,
    "recall_score": recall_score,
    "confusion_matrix": confusion_matrix
}

In [19]:
def compute_classification_scores(df_predictions, classification_score_funcs_dict):
    classification_scores = {
        k: v(
            df_predictions["y"],
            df_predictions["y_hat"],
        )
        for k, v in classification_score_funcs_dict.items()
    }
    
    return classification_scores
    
classification_scores_train = compute_classification_scores(df_train_preds_complete, classification_score_funcs_dict)
classification_scores_test = compute_classification_scores(df_test_preds_complete, classification_score_funcs_dict)

print("---------------")
print("MODEL PERFORMANCE ON TRAIN SET:")
print(classification_scores_train)
print("")
print(classification_scores_train["confusion_matrix"])

print("---------------")
print("MODEL PERFORMANCE ON TEST SET:")
print(classification_scores_test)
print("")
print(classification_scores_test["confusion_matrix"])


---------------
MODEL PERFORMANCE ON TRAIN SET:
{'accuracy_score': 0.9698125851972006, 'roc_auc_score': 0.9485333236612454, 'f1_score': 0.9353723751141254, 'precision_score': 0.9650137926394402, 'recall_score': 0.9074976273331224, 'confusion_matrix': array([[98663,  1040],
       [ 2924, 28686]], dtype=int64)}

[[98663  1040]
 [ 2924 28686]]
---------------
MODEL PERFORMANCE ON TEST SET:
{'accuracy_score': 0.9663018893788125, 'roc_auc_score': 0.9430357251156125, 'f1_score': 0.9274506910629088, 'precision_score': 0.9585197234648231, 'recall_score': 0.8983325393044307, 'confusion_matrix': array([[32868,   408],
       [ 1067,  9428]], dtype=int64)}

[[32868   408]
 [ 1067  9428]]


### This model works pretty well, so we will use it to predict the missing ratings

In [20]:
# Fit on all data
surprise_full_trainset = surprise_dataset.build_full_trainset()
model.fit(surprise_full_trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x137e386deb8>

In [21]:
df_love_long_nan.head()

Unnamed: 0,Row,Column,Value
4,0,4,
8,0,8,
11,0,11,
17,0,17,
24,0,24,


In [22]:
# missing_indices = 
missing_indices = zip(
    df_love_long_nan["Row"].values,
    df_love_long_nan["Column"].values,
    df_love_long_nan["Value"].values
)
# missing_indices = [(*x, None) for x in missing_indices]
# missing_indices[:3]

In [23]:
# dir(df_love_long_nan)
# help(model.predict)
missing_predictions = model.test(missing_indices)
missing_predictions[:3]

[Prediction(uid=0, iid=4, r_ui=nan, est=0, details={'was_impossible': False}),
 Prediction(uid=0, iid=8, r_ui=nan, est=0, details={'was_impossible': False}),
 Prediction(uid=0, iid=11, r_ui=nan, est=0, details={'was_impossible': False})]

In [24]:
df_missing_predictions = surprise_predictions_to_df(missing_predictions)
df_missing_predictions_complete = round_probas(df_missing_predictions)
df_missing_predictions_complete.head()

Unnamed: 0,Row,Column,y,probabilities,y_hat
0,0,4,,0.0,0.0
1,0,8,,0.0,0.0
2,0,11,,0.0,0.0
3,0,17,,0.0,0.0
4,0,24,,0.0,0.0


## Finally, we have to merge the predictions on the missing set back to the original love_matrix

We take the original values from the long love matrix without nans and append the missing set predictions

In [25]:
print(df_love_long["Value"].isnull().sum())
df_love_long.loc[:, ["Row", "Column", "Value"]].head()

0


Unnamed: 0,Row,Column,Value
0,0,0,0.0
1,0,1,0.0
2,0,2,0.0
3,0,3,0.0
5,0,5,0.0


In [26]:
print(df_missing_predictions_complete["y_hat"].isnull().sum())
df_missing_predictions_complete.loc[:, ["Row", "Column", "y_hat"]].head()

0


Unnamed: 0,Row,Column,y_hat
0,0,4,0.0
1,0,8,0.0
2,0,11,0.0
3,0,17,0.0
4,0,24,0.0


In [27]:
love_matrix_filled_long_np = np.concatenate([
    df_love_long.loc[:, ["Row", "Column", "Value"]].values,
    df_missing_predictions_complete.loc[:, ["Row", "Column", "y_hat"]].values
])

print(love_matrix_filled_long_np.shape)
print(np.isnan(love_matrix_filled_long_np).sum())

df_love_matrix_filled_long = pd.DataFrame(
    love_matrix_filled_long_np, columns=["Row", "Column", "Value"]
)

print(df_love_matrix_filled_long.isnull().sum().sum())
df_love_matrix_filled_long.head()

(250000, 3)
0
0


Unnamed: 0,Row,Column,Value
0,0.0,0.0,0.0
1,0.0,1.0,0.0
2,0.0,2.0,0.0
3,0.0,3.0,0.0
4,0.0,5.0,0.0


In [28]:
df_love_matrix_filled = df_love_matrix_filled_long.pivot(index="Row", columns="Column", values="Value")
assert df_love_matrix_filled.isnull().sum().sum() == 0
assert ((df_love == df_love_matrix_filled) | df_love.isnull()).all().all()
df_love_matrix_filled.head()

Column,0.0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,490.0,491.0,492.0,493.0,494.0,495.0,496.0,497.0,498.0,499.0
Row,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
love_matrix_filled = df_love_matrix_filled.values
assert love_matrix_filled.shape == love_matrix.shape
love_matrix_filled

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])