In [2]:
import os
from google.colab import drive

Mounted at /content/drive
Collecting ipdb
  Downloading ipdb-0.13.13-py3-none-any.whl (12 kB)
Collecting jedi>=0.16 (from ipython>=7.31.1->ipdb)
  Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jedi, ipdb
Successfully installed ipdb-0.13.13 jedi-0.19.1


# **Guidelines**

## Process Overview
The ranking step includes 2 sub-processes:
1. Scoring: In this step, you will train a scoring model to predict 3 types of score; probability of like, probability of dislike, and engagement time.
2. Ranking: In this step, you will create a ranking policy to rank the items based on the output from previous step.


## Your Task

### 1. Modify `DataCollectorExample()`
also rename it to `DataCollector{Team}()` juts like previous assignment. All of the functions in `DataCollectorExample()` need to be modified (They're the ones with `raise NotImplementedError("you need to implement this")` in `DataCollector()`). The explaination of each function includes in their own docstring with example code.

Some important points worth noting are:
1. for `.feature_generation_user()` and `.feature_generation_content()`, you DO NOT need to apply one-hot encoding or scaler since this process will be done by our pipeline using via `.postprocess_feature()`, which will also save the `Postprocessor()` class as a pickle file to be used for testing and inference step. You will need to output nsames of numberical features and categorical features along with the feature dataframe.
2. When engineering target variables with `.get_Ys()`, you'll need to output 3 columns of target variables (`'like'`, `'dislike'`, `'engage_time'`). Be careful of how you create each target variable. What happen if a user like an item then change their mind and dislike it? What if a user see the same content twice, what would be the engagement time?
3. Be creative of how you would create your ranking policy in `.rank()`. You have 3 scores that your ranking can be based on. Which score(s) would you optimise your ranking for? How would you trade-off one with others? Can you rank base on all 3 of them, if so, how would you combine them? You can also join the score dataframe with `self.generated_content_metadata` to use the original content features as part of ranking.

### 2. Train Model
Use the cell `Training: Create your own training` to train your model. Feel free to select any model you like. In the example, I use train 1 model for each target variable, resulting in 3 models. If you can find a model that can produce 3 output values (e.g. Neural Network), feel free to do so.

Some important points worth noting are:
1. Make sure you save the model to **ONLY 1 file**. Even if you make 3 models, put them into a list or dictionary and save the whole object to 1 file.
2. Depending on your modelling approach, you'll need your own way to load the model to make predictions. Thus, modify the `.load_model()` in `DataCollectorExample()` reflect this.
3. Once you train your model, you can use the example evaluation cells to evaluate your model. There's no need to change anything about the evaluation except the thresholds variable. These thresholds decide at what probability we'll consider the prediction as like and dislike. **NOTE** that this only test your scoring model, not your ranking policy. Testing for ranking policy is much more complicated since we're not only optimizing the number of likes and thus cannot be done offline.


# Test Your Model
Once you have the scoring model and ranking policy, you can use the `Inference Example` cell to run your pipeline and see the output of recommended items from your model.

# Submit your work

1. Put `postprocessor.pkl` and your model file into `Columbia-E4579/services/backend/src/recommendation_system/recommendation_flow/model_prediction` folder on your branch. Make sure to rename them as `{team}_postprocessor.pkl` and `{team}_model.pkl` (E.g. `alpha_postprocessor.pkl` and `alpha_model.pkl`). Both `postprocessor.pkl` and your model file will be saved in `sampled_data` in your COlab work space. Since this workspace is cleared everytime you restart the Colab, please also save the postprocessor and model files on your local machine if you want to keep them.

2. Download your Colab as `.ipynb` file, rename it as `{team}_ranking.ipynb` (e.g. `alpha_ranking.ipynb`), and also place it in `Columbia-E4579/services/backend/src/recommendation_system/recommendation_flow/model_prediction` folder on your branch.

You'll then merge your branch with these files into Professor's branch.


# Imports

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# DataCollector - Do Not Modify

In [4]:
from sqlalchemy.sql.schema import ScalarElementColumnDefault
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, Normalizer
import numpy as np
from typing import Tuple, List, Optional
import pickle


class Postprocessor:

    def __init__(self,
                 numberical_features: List[str],
                 categorical_features: List[str]):

        self.numberical_features = numberical_features
        self.categorical_features = categorical_features

        self.scaler = StandardScaler()
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encode_cols = []

    def fit(self, features_df: pd.DataFrame):

        self.scaler.fit(features_df[self.numberical_features])

        if len(self.categorical_features) > 0:
            self.encoder.fit(features_df[self.categorical_features])
            self.encode_cols = list(self.encoder.get_feature_names_out())

    def transform(self, features_df: pd.DataFrame) -> pd.DataFrame:

        features_df[self.numberical_features] = self.scaler.transform(features_df[self.numberical_features])

        if len(self.categorical_features) > 0:
            features_df[self.encode_cols] = self.encoder.transform(features_df[self.categorical_features])

        return features_df

    def fit_transform(self, features_df: pd.DataFrame) -> pd.DataFrame:

        self.fit(features_df)
        features_df = self.transform(features_df)

        return features_df


class DataCollector:

    def __init__(self,
                 engagement_path=None,
                 content_meta_path=None):


        self.engagement_path = engagement_path
        self.content_meta_path = content_meta_path

        self.objects_dir = 'sample_data'  #TODO change this
        self.numerical_features = []
        self.categorical_features = []

        self.postprocessor = None
        self.model = None

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """
        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """
        raise NotImplementedError("you need to implement this")

    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """
        Returns
          pd.DataFrame: Content feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """
        raise NotImplementedError("you need to implement this")

    def get_Ys(self) -> pd.DataFrame:
        """Engineers taget variable.
        Args
            data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """

        raise NotImplementedError("you need to implement this")

    def feature_generation(self, is_train=False) -> pd.DataFrame:
        """Generate features. If is_train, will generate features for user-content pairs
        exist in self.engagement_data. Else, will generate features for
        all possible user-content pairs.

        Args:
            is_train (bool): Whether in training mode.

        Returns:
            pd.DataFrame: Feature dataframe.

        """
        content_feature_df, content_num_feats, content_cat_feats = self.feature_generation_content()
        user_feature_df, user_num_feats, user_cat_feats = self.feature_generation_user()
        self.user_feature_df = user_feature_df
        self.content_feature_df = content_feature_df

        self.numerical_features = user_num_feats + content_num_feats
        self.categorical_features = user_cat_feats + content_cat_feats

        if is_train:
            interaction_pairs = self.engagement_data[
                ['user_id', 'content_id']].drop_duplicates()

        else:
            all_users = self.engagement_data['user_id'].drop_duplicates().tolist()
            all_contents = self.generated_content_metadata['content_id'].drop_duplicates().tolist()

            interaction_pairs = [(u, c) for u in all_users for c in all_contents]
            interaction_pairs = pd.DataFrame(interaction_pairs, columns=['user_id', 'content_id'])

        features_df = pd.merge(interaction_pairs,
                               user_feature_df, on='user_id', how='left')

        features_df = pd.merge(features_df,
                               content_feature_df, on='content_id', how='left')

        return features_df


    def get_engagement_data(self, user_id=None, content_ids=None):

        if self.engagement_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.engagement_path, sep="\t")

        if content_ids is not None:
            df = df[df['content_id'].isin(content_ids)]

        if user_id is not None:
            df = df[df['user_id'] == user_id]

        return df

    def get_generated_content_metadata(self, content_ids=None):

        if self.content_meta_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.content_meta_path, sep="\t")

        if content_ids is not None:
            df = df[df['content_id'].isin(content_ids)]

        return df

    def get_user_data(self, user_id=None):

        if self.engagement_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.engagement_path, sep="\t")

        if user_id is not None:
            df = df[df['user_id'] == user_id]

        return df

    def gather_data(self, user_id, content_ids):
        self.engagement_data = self.get_engagement_data(user_id, content_ids)
        self.generated_content_metadata = self.get_generated_content_metadata(content_ids)
        self.user_data = self.get_user_data(user_id)

        if len(self.engagement_data) == 0:
            raise Exception("either user_id or content_ids leads to empty engagement_data")

        if len(self.generated_content_metadata) == 0:
            raise Exception("content_ids leads to empty generated_content_metadata")

        if len(self.user_data) == 0:
            raise Exception("user_id leads to empty user_data")

    def postprocess_feature(self, features_df: pd.DataFrame, is_train=False) -> pd.DataFrame:
        """Applied postprocessings (one-hot encoding & scaler) to the feature dataframe.

        Args:
            features_df (pd.DataFrame): Input feature dataframe.
            is_train (bool): Whether in training mode. If True, will fit the
                Postprocessor() and save to a pickle file. Else, will load the
                saved Postprocessor() and use it.

        Returns:
            pd.DataFrame: Output feature dataframe.
        """

        if is_train:
            self.postprocessor = Postprocessor(self.numerical_features, self.categorical_features)
            features_df = self.postprocessor.fit_transform(features_df)
            self.save_postprocessor()

        else:
            self.postprocessor = self.load_postprocessor()
            features_df = self.postprocessor.transform(features_df)

        self.all_numeric_features = self.numerical_features + self.postprocessor.encode_cols


        return features_df


    def gen_model_input(self,
                        user_id: Optional[int] = None,
                        content_ids: Optional[list] = None,
                        is_train: bool = False) -> pd.DataFrame:
        """Generates input data (X) for model.

        Args:
            user_id (Optional[int]): User ID to generate features for.
                If None, will generate features for all available users in self.engagement_data.
            content_ids (Optional[list]): List of content ID to generate features for.
                If None, will generate features for all available contents in self.engagement_data.
            is_train (bool): Whether in training mode. If True, will generate
                features for user-content pairs exist in self.engagement_data.
                Else, will generate features for all possible user-content pairs.

        Returns:
            pd.DataFrame: Dataframe of features with 2-level index of ('user_id', 'content_id').
        """

        self.gather_data(user_id, content_ids)
        features_df = self.feature_generation(is_train)
        features_df = self.postprocess_feature(features_df, is_train)

        X = features_df.set_index(['user_id', 'content_id'])
        X = X[self.all_numeric_features]
        X = X.fillna(0)

        return X


    def gen_target_vars(self,
                        engagement_data: Optional[pd.DataFrame] = None
                        ) -> pd.DataFrame:
        """Wrapper to generate target variables.

        Args:
            engagement_data (Optional[pd.DataFrame]): Engagement data. If None,
                will use self.engagement_data which is loaded for training.
                For testing, parse in the engagement_data for testing.

        Returns:
            pd.DataFrame: Dataframe of 3 columns; 'like', 'dislike', 'engage_time'
                and 2-level index of ('user_id', 'content_id').
        """

        if engagement_data is None:
            engagement_data = self.engagement_data

        target_df = self.get_Ys(engagement_data)

        return target_df.set_index(['user_id', 'content_id'])


    def save_postprocessor(self):

        with open(f'{self.objects_dir}/postprocessor.pkl', 'wb') as f:
            pickle.dump(self.postprocessor, f)

    def load_postprocessor(self):

        with open(f'{self.objects_dir}/postprocessor.pkl', 'rb') as f:
            return pickle.load(f)

    def load_model(self):
        raise NotImplementedError("you need to implement this")

    def predict(self, X) -> Tuple[list, list, list]:
        raise NotImplementedError("you need to implement this")

    def rank(self, pred_score):
        raise NotImplementedError("you need to implement this")

    def score(self,
              user_id: Optional[int] = None,
              content_ids: Optional[list] = None) -> pd.DataFrame:
        """Predict the scores.

        Args:
            user_id (Optional[int]): User ID to generate features for.
                If None, will generate features for all available users in self.engagement_data.
            content_ids (Optional[list]): List of content ID to generate features for.
                If None, will generate features for all available contents in self.engagement_data.

        Returns:
            pd.DataFrame: Predicted score dataframe with 2-level index of (user_id, content_id).
                The dataframe also comes with the original content metadata which also
                can be used for ranking.
        """

        X = self.gen_model_input(user_id, content_ids, is_train=False)

        pred_like, pred_dislike, pred_engtime = self.predict(X)

        pred_df = pd.DataFrame(np.array([pred_like, pred_dislike, pred_engtime]).T,
                               index=X.index,
                               columns=['like', 'dislike', 'engage_time']).reset_index()

        pred_df = pd.merge(self.generated_content_metadata,
                           pred_df,
                           how='right',
                           on='content_id')

        return pred_df.set_index(['user_id', 'content_id'])

    def recommend(self, user_id, content_ids=None, top_k=20):

        score_df = self.score(user_id, content_ids).reset_index()

        rank = self.rank(score_df, user_id, content_ids)

        return rank[:top_k]




In [5]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_squared_error


def evaluate(true_df: pd.DataFrame,
             pred_df: pd.DataFrame,
             thres_like: float = 0.5,
             thres_dislike: float = 0.5
             ) -> dict:

    """Compute evaluation metrics.

    Args:
        true_df (pd.DataFrame): Dataframe of true target variables.
        pred_df (pd.DataFrame): Dataframe of predicted target variables.
        thres_like (float): Probability threshold to consider a prediction as like.
        thres_dislike (float): Probability threshold to consider as a prediction dislike.

    Returns:
        dict: Dictionary of metrics.
    """

    true_df = true_df.reset_index()
    pred_df = pred_df[['like', 'dislike', 'engage_time']].reset_index()

    pred_df['like'] = (pred_df['like'] > thres_like).astype(int)
    pred_df['dislike'] = (pred_df['dislike'] > thres_dislike).astype(int)

    actual_user_content = true_df[['user_id', 'content_id']]
    pred_user_content = pred_df[['user_id', 'content_id']]

    common_user_content = pd.merge(actual_user_content,
                                   pred_user_content,
                                   how='inner',
                                   on=['user_id', 'content_id'])

    true_df = pd.merge(common_user_content,
                         true_df,
                         how='left',
                         on=['user_id', 'content_id'])


    pred_df = pd.merge(common_user_content,
                       pred_df,
                       how='left',
                       on=['user_id', 'content_id'])


    metrics = {}
    for col in ['like', 'dislike', 'engage_time']:
        metrics[col] = {}

        if col == 'engage_time':
            metrics[col]['rmse'] = np.sqrt(mean_squared_error(true_df[col], pred_df[col]))
        else:
            metrics[col]['precision'] = precision_score(true_df[col], pred_df[col])
            metrics[col]['recall'] = recall_score(true_df[col], pred_df[col])

    return metrics

# Your Implementation - Example Here, Must Modify

In [15]:
from collections import defaultdict, deque
TOP_ARTIST_STYLES = 30
TOP_SOURCES = 30
TOP_CONTENT = 100

class DataCollectorExample(DataCollector):

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        df = self.user_data.drop_duplicates()
        top_n_content = [81576, 112990, 115079, 29667, 116285, 119163, 121340, 98660, 112723, 112718, 100600, 123413, 84509, 124241, 85954, 91035, 96308, 107234, 124197, 129220, 97383, 32858, 101707, 83115, 82523, 91267, 112943, 115042, 85369, 86154, 96136, 113087, 125842, 118824, 115641, 124704, 87170, 124089, 55132, 95154, 92391, 94627, 95448, 96061, 118478, 95420, 90800, 85672, 76810, 79978, 73723, 85261, 112433, 85495, 102788, 121219, 124002, 37896, 31280, 89381, 118564, 132295, 31278, 101773, 130301, 111201, 97499, 97627, 108116, 87998, 62821, 86786, 86593, 118721, 87250, 33369, 91588, 95370, 121439, 124954, 85949, 123170, 124374, 129087, 58001, 79847, 90687, 101576, 90213, 104604, 106588, 115754, 117136, 117157, 127128, 104392, 86289, 91755, 116519, 85093]
        user_vector_dict = defaultdict(lambda: {
            'millisecond_engaged_vector': np.zeros(len(top_n_content)),
            'like_vector': np.zeros(len(top_n_content)),
            'dislike_vector': np.zeros(len(top_n_content))
        })

        def aggregate_engagement(group):
            millisecond_engagement_sum = group.loc[group['engagement_type'] != 'Like', 'engagement_value'].sum()
            likes_count = group.loc[(group['engagement_type'] == 'Like') & (group['engagement_value'] == 1)].shape[0]
            dislikes_count = group.loc[(group['engagement_type'] == 'Like') & (group['engagement_value'] == -1)].shape[0]

            return pd.Series({
                'millisecond_engagement_sum': millisecond_engagement_sum,
                'likes_count': likes_count,
                'dislikes_count': dislikes_count
            })
        for user_id in df['user_id'].unique():
          user_vector_dict[user_id]['millisecond_engaged_vector']=np.zeros(len(top_n_content))
          user_vector_dict[user_id]['like_vector']=np.zeros(len(top_n_content))
          user_vector_dict[user_id]['dislike_vector']=np.zeros(len(top_n_content))
        if len(df[df['content_id'].isin(top_n_content)]) != 0:
          engagement_aggregate = df[df['content_id'].isin(top_n_content)].groupby(['user_id', 'content_id']).apply(aggregate_engagement).reset_index()
          for _, row in engagement_aggregate.iterrows():
              user_id = row['user_id']
              content_id = row['content_id']
              idx = top_n_content.index(content_id)

              user_vector_dict[user_id]['millisecond_engaged_vector'][idx] = row['millisecond_engagement_sum']
              user_vector_dict[user_id]['like_vector'][idx] = row['likes_count']
              user_vector_dict[user_id]['dislike_vector'][idx] = row['dislikes_count']

        user_vector_df = pd.DataFrame.from_dict(user_vector_dict, orient='index')
        millisecond_columns = [f"ms_engaged_{i}" for i in range(TOP_CONTENT)]
        like_columns = [f"like_vector_{i}" for i in range(TOP_CONTENT)]
        dislike_columns = [f"dislike_vector_{i}" for i in range(TOP_CONTENT)]

        user_vector_df1 = pd.DataFrame(user_vector_df['millisecond_engaged_vector'].tolist(), index=user_vector_df.index, columns=millisecond_columns)
        user_vector_df2 = pd.DataFrame(user_vector_df['like_vector'].tolist(), index=user_vector_df.index, columns=like_columns)
        user_vector_df3 = pd.DataFrame(user_vector_df['dislike_vector'].tolist(), index=user_vector_df.index, columns=dislike_columns)

        feature_df = pd.concat([user_vector_df1, user_vector_df2, user_vector_df3], axis=1)
        feature_df = feature_df.reset_index().rename(columns={'index': 'user_id'})

        return feature_df, millisecond_columns + like_columns + dislike_columns, []

    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates content features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.generated_content_metadata.drop_duplicates(subset='content_id', keep='first')

        top_artist_styles = ['medieval', 'anime', 'studio', 'shepard_fairey', 'oil_on_canvas', 'unreal_engine', 'edward_hopper', 'keith_haring', 'detailed_portrait', 'van_gogh', 'jackson_pollock', 'juan_gris', 'scifi', 'ibrahim_el_salahi', 'ma_jir_bo', 'franck_slama', 'jean-michel_basquiat', 'kerry_james_marshall', 'marta_minujín', 'face_and_lighting', 'salvador_dali', 'leonardo_da_vinci', 'takashi_murakami', 'gta_v', 'laura_wheeler_waring', 'louise bourgeois', 'movie: Dances-with-Wolves', 'movie: Interstellar', 'movie: Indiana-Jones-IV', 'movie: Gravity']
        top_sources = ['human_prompts', 'r/EarthPorn', 'r/Showerthoughts', 'r/scifi', 'r/pics', 'r/Damnthatsinteresting', 'r/educationalgifs', 'r/MadeMeSmile', 'r/SimplePrompts', 'r/RetroFuturism', 'r/AccidentalArt', 'r/Cyberpunk', 'Dances-with-Wolves', 'Buddha', 'Interstellar', 'Napoleon Hill', 'Abraham Lincoln', 'r/oddlysatisfying', 'Indiana-Jones-IV', 'Gravity', 'Winston Churchill', 'Ralph Waldo Emerson', 'r/whoahdude', 'Confucius', 'Batman', 'Johann Wolfgang von Goethe', 'Tombstone', 'Godfather', 'Legend-of-Darkness', 'Ni-vu-ni-connu']

        feature_df['artist_style'] = feature_df['artist_style'].apply(lambda x: x if x in top_artist_styles else 'other')
        feature_df['source'] = feature_df['source'].apply(lambda x: x if x in top_sources else 'other')

        return feature_df, ['guidance_scale', 'num_inference_steps'], ['artist_style','source']

    def get_Ys(self, engagement_data) -> pd.DataFrame:
        """Engineers taget variable that you are predicting.
        Args
            engagement_data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """

        target_df = engagement_data[['user_id', 'content_id']].drop_duplicates().copy()
        target_df = target_df.sort_values(by = ['user_id', 'content_id'], ascending=[True, True])

        engage_time_df = engagement_data[engagement_data['engagement_type']!='Like']
        engage_time_df = engage_time_df.sort_values(by=['user_id', 'content_id', 'engagement_value'], ascending=[True, True, False])
        engage_time_df = engage_time_df.drop_duplicates(subset=['user_id','content_id'], keep='first')[['user_id','content_id','engagement_value']]
        engage_time_df = engage_time_df.rename(columns = {'engagement_value':'engage_time'})

        like_dislike = engagement_data[engagement_data['engagement_type']=='Like']
        like_dislike_sorted = like_dislike.sort_values(by=['user_id', 'content_id', 'created_date'], ascending=[True, True, False])
        like_dislike_recent = like_dislike_sorted.drop_duplicates(subset=['user_id','content_id'], keep='first')[['user_id','content_id','engagement_value']]
        like_dislike_recent['like'] = (like_dislike_recent['engagement_value'] > 0).astype(int)
        like_dislike_recent['dislike'] = (like_dislike_recent['engagement_value'] < 0).astype(int)
        like_dislike_recent.drop(['engagement_value'],axis = 1,inplace = True)

        target_df = pd.merge(target_df,engage_time_df,on = ['user_id','content_id'],how='left')
        target_df = pd.merge(target_df,like_dislike_recent,on = ['user_id','content_id'],how='left')
        target_df.fillna(0,inplace = True)

        return target_df[['user_id','content_id','like','dislike','engage_time']]

    def predict(self, X: pd.DataFrame) -> Tuple[list, list, list]:
        """Predicts the 3 target variables by using the model that you trained.
        Make sure you load the model properly.

        Args:
            X (pd.DataFrame): Feature dataframe with 2-level index of (user_id, content_id)

        Returns:
            (list, list, list): (predicted prbability of like,
                                 predicted probability of dislike,
                                 predicted engagement time)
        """

        model = self.load_model()

        def sigmoid(x):
          return torch.exp(x)/(1+torch.exp(x))
        model.eval()
        with torch.no_grad():
            preds = model(torch.Tensor(X.to_numpy()))

        return sigmoid(preds[:,0]).flatten().numpy(), sigmoid(preds[:,1]).flatten().numpy(), torch.exp(preds[:,2]).flatten().numpy()

    def rank(self,
             score_df: pd.DataFrame,
             user_id: int,
             content_ids: Optional[list] = None) -> list:

        """Ranks the items for a given user based on your own criteria.

        Args:
            score_df (pd.DataFrame): Predicted-score Dataframe of columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time', and
                also columns for content metadata.
            user_id (int): User ID to rank the items for.
            content_ids (Optional[list]): List of content ids to be considered for ranking.
        """

        user_df = score_df[score_df['user_id'] == user_id]

        # Standardize the engagement time (z-score normalization)
        mean_engage_time = user_df['engage_time'].mean()
        std_engage_time = user_df['engage_time'].std()
        user_df['standardized_engage_time'] = (user_df['engage_time'] - mean_engage_time) / std_engage_time

        # Cap the standardized values at -2 and 2, then normalize to range 0 to 0.5
        user_df['standardized_engage_time'] = user_df['standardized_engage_time'].clip(lower=-2, upper=2)
        user_df['standardized_engage_time'] = (user_df['standardized_engage_time'] + 2) / 8

        # Calculate a composite score considering likes, dislikes, and capped standardized engagement time
        user_df['composite_score'] = user_df['like'] - user_df['dislike'] - user_df['standardized_engage_time']

        # Sort by composite score in descending order
        ranked_pred = user_df.sort_values('composite_score', ascending=False)
        duplicate_ids = deque()
        pred_content_ids = ranked_pred['content_id'].tolist()

        result_ids = [pred_content_ids[0]]

        for i in range(1,len(pred_content_ids)):

          while duplicate_ids:
            current_style = score_df.loc[score_df['content_id'] == duplicate_ids[0], "artist_style"].iloc[0]
            prev_style = score_df.loc[score_df['content_id'] == result_ids[-1], "artist_style"].iloc[0]
            if current_style != prev_style:
              id = duplicate_ids.popleft()
              result_ids.append(id)
            else:
              break

          current_id = pred_content_ids[i]
          current_style = score_df.loc[score_df['content_id'] == current_id, "artist_style"].iloc[0]
          prev_style = score_df.loc[score_df['content_id'] == result_ids[-1], "artist_style"].iloc[0]

          if current_style == prev_style:
            duplicate_ids.append(current_id)
            continue
          result_ids.append(current_id)
        return result_ids

    def load_model(self) -> object:
        """Loads your model. Since different ML frameworks requires different
        ways to load the model. Change this to reflect your choice of framework.

        Returns:
            object: Model object
        """

        with open(f'{self.objects_dir}/model.pkl', 'rb') as f:
            return pickle.load(f)

# Training

## Train Test Split

In [7]:
engagement_data = pd.read_csv('sample_data/engagement.csv', sep="\t")
content_meta = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")

interactions = engagement_data[
    ['user_id', 'content_id']].drop_duplicates()

interactions_train, interactions_test = train_test_split(interactions, test_size=0.2, random_state=42)

engagement_train = pd.merge(interactions_train, engagement_data, how='left', on=['user_id', 'content_id'])
engagement_test = pd.merge(interactions_test, engagement_data, how='left', on=['user_id', 'content_id'])

engagement_train.to_csv('sample_data/engagement_train.csv', sep="\t")
engagement_test.to_csv('sample_data/engagement_test.csv', sep="\t")

In [16]:
#@title get training data
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement_train.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

## Training: Create your own training
Make sure you save the model somewhere so you can send the model file to the professor later.

In [9]:
import torch
import torch.nn as nn
class RankingModel(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(RankingModel, self).__init__()
        self.output_layer = nn.Sequential(
            nn.Linear(input_dim, hidden_dim*2),
            nn.ReLU(),
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 3)
        )

    def forward(self, x):
        prediction = self.output_layer(x.float())

        return prediction

user_dim = TOP_CONTENT*3
content_dim = 4+TOP_ARTIST_STYLES+TOP_SOURCES
hidden_dim = 256
model = RankingModel(user_dim + content_dim, hidden_dim)

In [10]:
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.optim.lr_scheduler import StepLR
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def loss_function(outputs, targets):
    if outputs.shape != targets.shape:
        raise ValueError("Shape of outputs and targets must be the same")

    time_loss = nn.BCEWithLogitsLoss()(outputs[:,-1:], targets[:,-1:])
    like_dislike_loss = nn.BCEWithLogitsLoss()(outputs[:,:-1], targets[:,:-1])

    return time_loss + like_dislike_loss


train_data, val_data, train_labels, val_labels = train_test_split(
    X_train, y_train[['like','dislike','engage_time']], test_size=0.2, random_state=28
)

train_dataset = TensorDataset(torch.from_numpy(train_data.to_numpy()), torch.from_numpy(train_labels.to_numpy()))
val_dataset = TensorDataset(torch.from_numpy(val_data.to_numpy()), torch.from_numpy(val_labels.to_numpy()))

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0, amsgrad=False)
scheduler = StepLR(optimizer, step_size=5, gamma=0.8)

for epoch in range(10):
    model.train()
    for batch in tqdm(train_loader, desc=f'Training {epoch + 1}: '):
        features_batch, targets_batch = batch
        targets_batch = targets_batch.float()
        targets_batch[:,-1:] = targets_batch[:,-1:] / (1+targets_batch[:,-1:]) # transform the target
        optimizer.zero_grad()
        outputs = model(features_batch)
        loss = loss_function(outputs, targets_batch)
        loss.backward()
        optimizer.step()
    print(f'Epoch {epoch+1}, Last Batch Single Loss: {loss.item()}')

    # Validation
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f'Validation {epoch + 1}: '):
            features_batch, targets_batch = batch
            targets_batch = targets_batch.float()
            targets_batch[:,-1] = targets_batch[:,-1] / (1+targets_batch[:,-1])
            outputs = model(features_batch)
            val_loss = loss_function(outputs, targets_batch)
            val_loss += loss.item()


        val_loss /= len(val_loader)
        print(f'Validation - Epoch {epoch+1}, Loss: {val_loss}')
    print(f'Current Learning Rate: {optimizer.param_groups[0]["lr"]}')
    scheduler.step()

with open('sample_data/model.pkl','wb') as f:
    pickle.dump(model,f)

Training 1: 100%|██████████| 459/459 [00:09<00:00, 47.52it/s]


Epoch 1, Last Batch Single Loss: 0.6099881529808044


Validation 1: 100%|██████████| 115/115 [00:00<00:00, 134.56it/s]


Validation - Epoch 1, Loss: 0.011358970776200294
Current Learning Rate: 0.001


Training 2: 100%|██████████| 459/459 [00:08<00:00, 56.93it/s]


Epoch 2, Last Batch Single Loss: 0.6168396472930908


Validation 2: 100%|██████████| 115/115 [00:00<00:00, 144.01it/s]


Validation - Epoch 2, Loss: 0.011394057422876358
Current Learning Rate: 0.001


Training 3: 100%|██████████| 459/459 [00:09<00:00, 46.83it/s]


Epoch 3, Last Batch Single Loss: 0.7433809041976929


Validation 3: 100%|██████████| 115/115 [00:00<00:00, 134.02it/s]


Validation - Epoch 3, Loss: 0.012377525679767132
Current Learning Rate: 0.001


Training 4: 100%|██████████| 459/459 [00:08<00:00, 55.71it/s]


Epoch 4, Last Batch Single Loss: 0.6205496788024902


Validation 4: 100%|██████████| 115/115 [00:01<00:00, 89.83it/s]


Validation - Epoch 4, Loss: 0.011322316713631153
Current Learning Rate: 0.001


Training 5: 100%|██████████| 459/459 [00:11<00:00, 41.55it/s]


Epoch 5, Last Batch Single Loss: 0.6873550415039062


Validation 5: 100%|██████████| 115/115 [00:00<00:00, 141.79it/s]


Validation - Epoch 5, Loss: 0.011904319748282433
Current Learning Rate: 0.001


Training 6: 100%|██████████| 459/459 [00:09<00:00, 46.15it/s]


Epoch 6, Last Batch Single Loss: 0.621544599533081


Validation 6: 100%|██████████| 115/115 [00:00<00:00, 139.82it/s]


Validation - Epoch 6, Loss: 0.011197648011147976
Current Learning Rate: 0.0008


Training 7: 100%|██████████| 459/459 [00:08<00:00, 55.59it/s]


Epoch 7, Last Batch Single Loss: 0.6065956354141235


Validation 7: 100%|██████████| 115/115 [00:01<00:00, 90.18it/s]


Validation - Epoch 7, Loss: 0.011237352155148983
Current Learning Rate: 0.0008


Training 8: 100%|██████████| 459/459 [00:09<00:00, 47.08it/s]


Epoch 8, Last Batch Single Loss: 0.6002206802368164


Validation 8: 100%|██████████| 115/115 [00:01<00:00, 106.67it/s]


Validation - Epoch 8, Loss: 0.01106829009950161
Current Learning Rate: 0.0008


Training 9: 100%|██████████| 459/459 [00:12<00:00, 38.13it/s]


Epoch 9, Last Batch Single Loss: 0.6787991523742676


Validation 9: 100%|██████████| 115/115 [00:00<00:00, 149.01it/s]


Validation - Epoch 9, Loss: 0.011730130761861801
Current Learning Rate: 0.0008


Training 10: 100%|██████████| 459/459 [00:07<00:00, 57.98it/s]


Epoch 10, Last Batch Single Loss: 0.5917190313339233


Validation 10: 100%|██████████| 115/115 [00:01<00:00, 98.49it/s]


Validation - Epoch 10, Loss: 0.01099639106541872
Current Learning Rate: 0.0008


# Evaluation

In [11]:
# Simulates contents filtered from previous stage.
# Feel free to change this to reflect your previous stage.

sample_contents = content_meta['content_id'].sample(frac=0.01)

In [12]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

In [13]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.6595744680851063, 'recall': 0.34444444444444444},
 'dislike': {'precision': 0.6666666666666666, 'recall': 0.3595505617977528},
 'engage_time': {'rmse': 204389.4167832767}}

# Inference Example

In [17]:
sample_contents = content_meta['content_id'].sample(frac=0.01)  # simulated contents filtered from previous stage


data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement_train.csv',  # will be None in real production
    content_meta_path='sample_data/generated_content_metadata.csv'  # will be None in real production
    )

recs = data_collector.recommend(user_id=8, content_ids=sample_contents, top_k=20)

In [18]:
recs

[111018,
 81834,
 120209,
 81773,
 124867,
 112453,
 123110,
 83543,
 79740,
 83568,
 114831,
 124788,
 128384,
 81627,
 126306,
 121133,
 127912,
 111929,
 128043,
 66015]

In [19]:
artist_styles = []
for i in range(len(recs)):
    artist_styles.append(content_meta[content_meta['content_id'] == recs[i]]["artist_style"])
print(artist_styles)

[82420    van_gogh
Name: artist_style, dtype: object, 53236    franck_slama
Name: artist_style, dtype: object, 91611    takashi_murakami
Name: artist_style, dtype: object, 53175    franck_slama
Name: artist_style, dtype: object, 96269    takashi_murakami
Name: artist_style, dtype: object, 83855    leonardo_da_vinci
Name: artist_style, dtype: object, 94512    jean-michel_basquiat
Name: artist_style, dtype: object, 54945    keith_haring
Name: artist_style, dtype: object, 51142    jackson_pollock
Name: artist_style, dtype: object, 54970    keith_haring
Name: artist_style, dtype: object, 86233    leonardo_da_vinci
Name: artist_style, dtype: object, 96190    jean-michel_basquiat
Name: artist_style, dtype: object, 99786    van_gogh
Name: artist_style, dtype: object, 53029    franck_slama
Name: artist_style, dtype: object, 97708    van_gogh
Name: artist_style, dtype: object, 92535    jean-michel_basquiat
Name: artist_style, dtype: object, 99314    van_gogh
Name: artist_style, dtype: object, 8