In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path
import pandas as pd
import numpy as np
import tarfile
import io
import re
import subprocess as sp

from typing import Dict, List, Tuple, Union, Optional

from loguru import logger

import plotly.express as px
import plotly.graph_objects as go


In [None]:
import pandas as pd
import numpy as np

import waac
import waac.config as config
from sklearn.decomposition import NMF

In [None]:
RANDOM_SEED = 6545

In [None]:
default_rng = np.random.default_rng(seed=RANDOM_SEED)

In [None]:
DATA_DIR = config.DATA_DIR

In [None]:
fp = DATA_DIR / "intermediate" / "netflix_to_imdb.csv"

id_mapping = pd.read_csv(fp, sep=";").dropna(subset="tconst")

In [None]:
id_mapping.info()

In [None]:
id_mapping

In [None]:
fp = DATA_DIR / "intermediate" / "rating_counts.csv"
rating_counts_import = pd.read_csv(fp)

In [None]:
rating_counts_import.info()
rating_counts_import["n_reviews"].describe()

In [None]:
px.histogram(rating_counts_import["n_reviews"])

In [None]:
px.histogram(rating_counts_import["n_reviews"], cumulative=True, histnorm="percent")

Merging with the ID mapping (inner join) causes around 7,000 movies to be dropped.

In [None]:
# Order titles by review count (most reviewed first), then keep only the
# titles that also appear in id_mapping.
rating_counts_mapped = (
    rating_counts_import
    .sort_values(by="n_reviews", ascending=False)
    .merge(id_mapping, on="movie_ID", how="inner")
)

In [None]:
# Set up netflix df
# Set up netflix df
# DOWNLOAD_DIR is (re)defined here so this cell runs on a fresh kernel without
# depending on a cell further down the notebook.
DOWNLOAD_DIR = DATA_DIR / "download"
fp = DOWNLOAD_DIR / "movie_titles.txt"
movie_titles_df = waac.txt_to_df(
    fp,
    config.raw_data_column_names[fp.name],
    encoding="latin-1",
)
movie_titles_df = movie_titles_df.astype(object)
movie_titles_df["movie_ID"] = movie_titles_df["movie_ID"].astype(int)
# Turn the literal "NULL" marker into a real missing value so those rows can be dropped
movie_titles_df["year_of_release"] = movie_titles_df["year_of_release"].replace(
    "NULL", None
)
# Reassign instead of mutating in place so re-running the cell is predictable
movie_titles_df = (
    movie_titles_df
    .dropna(subset=["year_of_release"])
    .sort_values("year_of_release", ascending=False)
)

In [None]:
x = rating_counts_mapped.merge(
    movie_titles_df,
    on=["movie_ID","title"],
    how="inner"
)

In [None]:
x.year_of_release.max()

In [None]:
prop = 0.1
rating_counts_subset = rating_counts_mapped.iloc[:int(rating_counts_mapped.shape[0]*prop)]
print(f"rating_counts_mapped: {rating_counts_mapped.shape}")
print(f"rating_counts_subset: {rating_counts_subset.shape}")
display(rating_counts_subset.head())

In [None]:
# rating_counts_subset.title.tolist()

In [None]:
import torch  # needed here: the notebook's torch import cell only runs further down

# Sanity check: torch's infinity compares equal to Python's float infinity
-torch.inf == float('-inf')

In [None]:
DOWNLOAD_DIR = DATA_DIR / "download"
tar_fp = DOWNLOAD_DIR / "training_set.tar"
tar_fp.parent.mkdir(exist_ok=True, parents=True)

n = rating_counts_subset.shape[0]
# One slot per requested file; a slot stays None when that file is skipped
results = [None for _ in range(n)]

with tarfile.open(tar_fp, "r") as t:
    for i, name in enumerate(rating_counts_subset.filename):
        # Progress output every 100 files
        if i % 100 == 0:
            print(f"{i+1}/{n}: {name=}")
        data_stream = t.extractfile(member=name)
        if data_stream is None:
            # extractfile() returns None for members that are not regular files
            logger.warning(f"Skipping non-regular archive member: {name}")
            continue
        # First line is the "<movie id>:" header; an empty member yields "" and is skipped below
        file_header = next(data_stream, b"").decode("utf-8")
        match_file_header = re.search(r"[0-9]+(?=:\n)", file_header)
        if not match_file_header:
            logger.warning(f"Skipping file based on file header match: {file_header}")
            continue
        # The remaining lines are the rating rows for this movie
        df_temp = pd.read_csv(
            data_stream, encoding="utf-8", header=None,
            names=config.raw_data_column_names["training_set"]["movie_ID"]
            )
        # Store the id as int so it matches the integer movie_ID used in the other frames
        df_temp.insert(0, "movie_ID", int(match_file_header[0]))
        results[i] = df_temp
       


In [None]:
any(x is None for x in results)

In [None]:
df = pd.concat(results)

In [None]:
print(df.shape)
display(df)

Check whether any customer rated the same movie more than once, for example on two different dates.

It is not the case but if it were, we might take the most recent review, or the average.

In [None]:
print(df.duplicated(subset=["movie_ID","customer_ID"]).any())

In [None]:
df_pivoted = df.pivot(columns="movie_ID", index="customer_ID", values="rating")

In [None]:
df.rating.describe()

---
## Model Training

---
## Sandbox

In [None]:
import torch
from torch import nn

In [None]:
def scale(df, min_rating: int = 1, max_rating: int = 5):
    """Min-max scale ratings so that ``min_rating`` maps to 0 and ``max_rating`` to 1.

    Args:
        df: Value(s) to scale; anything supporting ``-`` and ``/`` with a number.
        min_rating: Rating that should map to 0.
        max_rating: Rating that should map to 1.

    Returns:
        ``(df - min_rating) / (max_rating - min_rating)``.
    """
    rating_span = max_rating - min_rating
    shifted = df - min_rating
    return shifted / rating_span

def unscale(df, min_rating: int = 1, max_rating: int = 5):
    """Map 0-1 scaled values back onto the ``min_rating``..``max_rating`` range.

    Args:
        df: Scaled value(s); anything supporting ``*`` and ``+`` with a number.
        min_rating: Rating that 0 maps back to.
        max_rating: Rating that 1 maps back to.

    Returns:
        ``df * (max_rating - min_rating) + min_rating``.
    """
    rating_span = max_rating - min_rating
    stretched = df * rating_span
    return stretched + min_rating

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:
n_users, n_movies = df_pivoted.shape
# Scaling ratings to between 0 and 1, this helps our model by constraining predictions

rating_matrix = scale(df=df_pivoted)

# Share of user/movie cells that actually hold a rating (density = 1 - sparsity)
density = rating_matrix.notna().sum().sum() / (n_users * n_movies)
print(f'Density: {density:0.2%} (sparsity: {1 - density:0.2%})')

# Replacing missing ratings with -1 so we can filter them out later
rating_matrix = rating_matrix.fillna(-1)
rating_matrix = torch.from_numpy(rating_matrix.values).to(device)
# True where a rating exists, False for the -1 placeholders
non_zero_mask = (rating_matrix != -1)

In [None]:
df_pivoted.notna().mean(axis=None)

In [None]:
r = np.arange(15).reshape(3,5)
r = torch.Tensor(r)
display(r)

In [None]:
print(torch.norm(r, dim=1))


In [None]:
# Manual implementation of the row-wise L2 (Euclidean) norm:
# square every element, sum along dim=1, then take the square root.
# The intermediate result is displayed after each step.
display(r)
x = r**2
display(x)
x = x.sum(dim=1)
display(x)
x = x**0.5
display(x)


In [None]:
from typing import List

class MatrixFactorization(nn.Module):
    """Matrix-factorisation model: ``sigmoid(U @ V.T)``.

    The feature tensors are stored as plain attributes (not ``nn.Parameter``),
    so they are not returned by ``parameters()``; whoever optimises them must
    hand the tensors to the optimizer directly.
    """

    def __init__(
            self,
            u_features: torch.Tensor,
            v_features: torch.Tensor,
            ):
        """Store the two factor matrices.

        Args:
            u_features: First factor; its rows index the rows of the prediction.
            v_features: Second factor; its rows index the columns of the prediction.
                Must have the same number of columns as ``u_features``.
        """
        super().__init__()
        self.u_features = u_features
        self.v_features = v_features

    def forward(
            self,
            ):
        """Return the full prediction matrix, squashed into (0, 1) by a sigmoid."""
        return torch.sigmoid(
            torch.matmul(self.u_features, self.v_features.t())
        )
    
class Loss(nn.Module):
    """Masked squared error plus weighted sums of the factor rows' L2 norms."""

    def __init__(self, lam_u: float = 0.3, lam_v: float = 0.3):
        """Keep the penalty weights for the ``u`` and ``v`` feature matrices."""
        super().__init__()
        self.lam_u, self.lam_v = lam_u, lam_v

    def forward(
            self,
            matrix: torch.Tensor,
            non_zero_mask: torch.Tensor,
            predicted: torch.Tensor,
            u_features: torch.Tensor,
            v_features: torch.Tensor,
            ):
        """Compute the scalar loss.

        Args:
            matrix: Target values.
            non_zero_mask: Multiplied element-wise into the squared error, so
                entries where it is False/0 do not contribute.
            predicted: Predictions, broadcastable against ``matrix``.
            u_features: First factor matrix; penalised by ``lam_u``.
            v_features: Second factor matrix; penalised by ``lam_v``.

        Returns:
            Sum of the masked squared errors plus both penalty terms.
        """
        squared_error = (matrix - predicted).pow(2)
        data_term = (squared_error * non_zero_mask).sum()

        # Each penalty is the weight times the sum of per-row L2 norms
        user_penalty = self.lam_u * u_features.norm(dim=1).sum()
        movie_penalty = self.lam_v * v_features.norm(dim=1).sum()

        return data_term + user_penalty + movie_penalty
    


class ModelTrainer(nn.Module):
    """Fit and query a matrix-factorisation model on a rating matrix.

    The trainer owns the user/movie feature tensors, builds the model and the
    optimizer from the supplied class types and runs a full-batch training loop.
    Missing ratings are marked with ``MISSING_VALUE`` in the rating matrix.

    NOTE: ``train`` below shadows ``nn.Module.train(mode)``, so ``.eval()`` and
    ``.train(True)`` do not have their usual PyTorch meaning on this class.
    """

    # Sentinel stored in the rating matrix where no rating exists
    MISSING_VALUE = -1

    def __init__(
            self,
            n_features: int,
            model_class_type, # a class type
            loss_class, # a class instance
            optimizer_class_type, # a subclass of torch.optim.Optimizer, not a class instance
    ):
        """Store the configuration; nothing is built until ``train`` is called.

        Args:
            n_features: Number of latent features per user / per movie.
            model_class_type: Called as ``model_class_type(u_features, v_features)``;
                the resulting model is called without arguments.
            loss_class: Callable invoked with the keyword arguments ``matrix``,
                ``non_zero_mask``, ``predicted``, ``u_features``, ``v_features``.
            optimizer_class_type: Called as
                ``optimizer_class_type([u_features, v_features], lr=lr)``.
        """
        super().__init__()
        self.n_features = n_features
        self.loss_class = loss_class
        self._model_class_type = model_class_type
        self._optimizer_class_type = optimizer_class_type
        # Populated by train(); initialised so predict() can fail with a clear error
        self.matrix = None
        self.non_zero_mask = None
        self.min_matrix_value = None
        self.max_matrix_value = None

    def train(self, matrix: torch.Tensor, n_epochs: int,
              u_features: torch.Tensor = None, v_features: torch.Tensor = None,
              non_zero_mask: torch.Tensor = None, lr: float = 0.01
              ):
        """Fit the feature tensors to ``matrix``.

        Args:
            matrix: ``(n_users, n_movies)`` ratings with ``MISSING_VALUE`` where
                no rating exists (unless ``non_zero_mask`` is given).
            n_epochs: Number of full-batch optimisation steps.
            u_features: Optional initial ``(n_users, n_features)`` tensor;
                random when omitted.
            v_features: Optional initial ``(n_movies, n_features)`` tensor;
                random when omitted.
            non_zero_mask: Optional mask, truthy where a rating exists. Derived
                from the sentinel when omitted.
            lr: Learning rate handed to the optimizer.

        Raises:
            ValueError: If there are no observed ratings, or all observed
                ratings are identical while scaling is required.
        """
        # Work in floating point so scaling and the loss are well defined
        if not matrix.is_floating_point():
            matrix = matrix.float()

        # Resolve the mask BEFORE scaling: scaling would move the sentinel value.
        # `is not None` is required; truth-testing a multi-element tensor raises.
        if non_zero_mask is not None:
            assert matrix.shape == non_zero_mask.shape
            non_zero_mask = non_zero_mask.to(dtype=torch.bool)
        else:
            non_zero_mask = (matrix != self.MISSING_VALUE)

        # Scale the data if necessary, looking only at observed ratings so the
        # sentinel does not distort the range. Save min/max for unscaling.
        observed = matrix[non_zero_mask]
        if observed.numel() == 0:
            raise ValueError("matrix has no observed ratings to train on.")
        min_value = observed.min().item()
        max_value = observed.max().item()
        if (min_value != 0) or (max_value != 1):
            # Data is not 0-1 scaled
            if min_value == max_value:
                raise ValueError(
                    "Cannot scale ratings: all observed values are identical."
                )
            matrix = scale(matrix, min_rating=min_value, max_rating=max_value)
            self.min_matrix_value = min_value
            self.max_matrix_value = max_value
        else:
            self.min_matrix_value = None
            self.max_matrix_value = None

        # Put the sentinel back on every missing entry so later lookups can find it
        matrix = torch.where(
            non_zero_mask, matrix, torch.full_like(matrix, self.MISSING_VALUE)
        )
        self.matrix = matrix
        self.non_zero_mask = non_zero_mask

        # Set feature vectors (created on the same device as the data)
        n_users, n_movies = matrix.shape
        if u_features is None:
            u_features = torch.randn(
                n_users, self.n_features, requires_grad=True, device=matrix.device
                )
        else:
            assert u_features.shape == (n_users, self.n_features)

        if v_features is None:
            v_features = torch.randn(
                n_movies, self.n_features, requires_grad=True, device=matrix.device
                )
        else:
            assert v_features.shape == (n_movies, self.n_features)

        self.u_features = u_features
        self.v_features = v_features

        # Set model and optimiser
        self.model = self._model_class_type(self.u_features, self.v_features)
        self.optimizer = self._optimizer_class_type(
            [self.u_features, self.v_features], lr=lr)

        for i in range(n_epochs):
            self._train()
            if i % 10 == 0:
                print(f"Epoch: {i}/{n_epochs}")
                self._validate()
        return None

    def _train(self):
        """Run one full-batch optimisation step."""
        self.optimizer.zero_grad()

        # The model holds the feature tensors itself, so it is called without input
        predicted = self.model()
        loss = self.loss_class(
            matrix=self.matrix,
            non_zero_mask=self.non_zero_mask,
            predicted=predicted,
            u_features=self.u_features,
            v_features=self.v_features,
        )
        loss.backward()
        self.optimizer.step()

    def _validate(self):
        """Print the RMSE between predicted and actual ratings on the training data."""
        # There is no validation dataset here so we score the whole training set
        predicted_ratings, actual_ratings = self.predict()
        score = torch.sqrt(torch.mean((predicted_ratings - actual_ratings) ** 2)).item()
        print(f"Current score: {score}")

    def predict(self, user_idx: List[int] = None, matrix: torch.Tensor = None):
        """Predict ratings for the observed entries of the selected users.

        Args:
            user_idx: Row index or indices into the trained user features;
                all rows of ``matrix`` when omitted.
            matrix: Optional rating matrix whose rows line up with the trained
                users; the training matrix when omitted.

        Returns:
            ``(predicted_ratings, actual_ratings)``: two 1-D tensors covering the
            observed entries, unscaled to the original range when scaling
            information was recorded by ``train``.

        Raises:
            RuntimeError: If no matrix is given and ``train`` has not been run.
            ValueError: If ``matrix`` needs scaling but no scaling info exists.
        """
        # Scale if necessary
        scaling_info_available = (
            (self.min_matrix_value is not None) and
            (self.max_matrix_value is not None)
        )
        if matrix is not None:
            if not matrix.is_floating_point():
                matrix = matrix.float()
            # Judge the range on observed entries only; the sentinel is outside 0-1
            observed_mask = matrix != self.MISSING_VALUE
            observed = matrix[observed_mask]
            if observed.numel() > 0 and not (
                (observed.min() >= 0) and (observed.max() <= 1)
            ):
                # data is not scaled. We need to scale the data.
                if not scaling_info_available:
                    raise ValueError(
                        "Data need to be scaled but scaling properties haven't be set."
                    )
                scaled = scale(matrix,
                               min_rating=self.min_matrix_value,
                               max_rating=self.max_matrix_value)
                # Keep the sentinel on missing entries after scaling
                matrix = torch.where(
                    observed_mask, scaled,
                    torch.full_like(scaled, self.MISSING_VALUE)
                )
        else:
            if self.matrix is None:
                raise RuntimeError("No matrix available: call train() first or pass `matrix`.")
            # self.matrix was already scaled in self.train()
            matrix = self.matrix

        # Set user_idx
        if user_idx is None:
            user_idx = torch.arange(matrix.shape[0], device=matrix.device)
        elif isinstance(user_idx, int):
            user_idx = [user_idx]

        # Get predictions
        predicted_ratings, actual_ratings = self._predict(matrix, user_idx)

        # Unscale if necessary
        if scaling_info_available:
            predicted_ratings = unscale(
                predicted_ratings,
                min_rating=self.min_matrix_value, max_rating=self.max_matrix_value
                )
            actual_ratings = unscale(
                actual_ratings,
                min_rating=self.min_matrix_value, max_rating=self.max_matrix_value
                )
        return predicted_ratings, actual_ratings

    def _predict(self, matrix: torch.Tensor, user_idx: List[int]):
        """Return (predicted, actual) for the observed entries of ``user_idx`` rows."""
        user_ratings = matrix[user_idx, :]
        non_zero_mask = user_ratings != self.MISSING_VALUE

        with torch.no_grad():
            predictions = torch.sigmoid(
                torch.mm(
                    self.u_features[user_idx, :].view(-1, self.n_features),
                    self.v_features.t())
                )

        # No squeeze(): predictions must keep the same 2-D shape as the mask,
        # also when a single user is requested
        predicted_ratings = predictions[non_zero_mask]
        actual_ratings = user_ratings[non_zero_mask]
        # NOTE: These values are still in the 0-1 scaled space; predict() unscales them
        return predicted_ratings, actual_ratings


        



In [None]:
r

In [None]:
r[slice(2),2]

In [None]:
n = 10
downsample_id_mapping = id_mapping.sample(n, random_state=RANDOM_SEED)

In [None]:
s = downsample_id_mapping.movie_ID.iloc[0]
str(s).zfill(7)

In [None]:
for movie_id in downsample_id_mapping.movie_ID:
    training_fp = DATA_DIR / "download" / "training_set" / f"mv_{str(movie_id).zfill(7)}.txt"
    print(training_fp)
    print(training_fp.exists())
    

In [None]:
training_tar_fp = DATA_DIR / "download" / "training_set.tar"

with tarfile.open(training_tar_fp) as tf:
    names = tf.getnames()
    # "training_set/mv_1234567.txt" ...
print(names)

In [None]:
training_tar_fp = DATA_DIR / "download" / "training_set.tar"

with tarfile.open(training_tar_fp) as tf:
    for movie_id in downsample_id_mapping.movie_ID:
        member = f"training_set/mv_{str(movie_id).zfill(7)}.txt"
        print(f"{member=}")
        x = tf.extractfile(member)
        print(x)
        print(type(x))
        # read header row
        y0 = next(x)
        # read the rest
        d = pd.read_csv(x, encoding="utf-8", header=None, names=config.raw_data_column_names["training_set"]["movie_ID"])
        # y = [_ for _ in x]
        # contents = x.read().decode("utf-8")
        break

    

In [None]:
d

# TODO:
# Add a column called movieID, then spread the df so that customer is the index, movieID the columns and rating the values (date is removed).
# We may have to drop duplicates on [customer_ID, movieID] first, in case someone rated the same movie twice.

# This is the input for NMF.

In [None]:
# Toy 3x3 rating matrix with missing entries (NaN).
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.
a = np.array([4, 5, 5, np.nan, 5, 5, np.nan, np.nan, np.nan]).reshape(3, 3)
a

In [None]:
# NOTE(review): the only visible assignment to `y` is in a commented-out line of an
# earlier cell, so this lookup presumably fails on a fresh kernel — confirm or remove.
y[:4]

In [None]:
y0

In [None]:
# NOTE(review): the only visible assignment to `contents` is in a commented-out line of
# an earlier cell, so this lookup presumably fails on a fresh kernel — confirm or remove.
contents

In [None]:
# NOTE(review): no assignment to `_bytes` is visible in this notebook, so this
# presumably fails on a fresh kernel — confirm or remove.
_bytes.decode("utf-8")

In [None]:
list((DATA_DIR / "download" / "training_set").glob("*.txt"))