# Imports

In [None]:
import pandas as pd
import numpy as np
#from google.colab import drive
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# DataCollector - Do Not Modify

In [None]:
#from sqlalchemy.sql.schema import ScalarElementColumnDefault
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, Normalizer
import numpy as np
from typing import Tuple, List, Optional
import pickle


class Postprocessor:

    def __init__(self,
                 numberical_features: List[str],
                 categorical_features: List[str]):

        self.numberical_features = numberical_features
        self.categorical_features = categorical_features

        self.scaler = StandardScaler()
        self.encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        self.encode_cols = []

    def fit(self, features_df: pd.DataFrame):

        self.scaler.fit(features_df[self.numberical_features])

        if len(self.categorical_features) > 0:
            self.encoder.fit(features_df[self.categorical_features])
            self.encode_cols = list(self.encoder.get_feature_names_out())

    def transform(self, features_df: pd.DataFrame) -> pd.DataFrame:

        features_df[self.numberical_features] = self.scaler.transform(features_df[self.numberical_features])

        if len(self.categorical_features) > 0:
            features_df[self.encode_cols] = self.encoder.transform(features_df[self.categorical_features])

        return features_df

    def fit_transform(self, features_df: pd.DataFrame) -> pd.DataFrame:

        self.fit(features_df)
        features_df = self.transform(features_df)

        return features_df


class DataCollector:

    def __init__(self,
                 engagement_path=None,
                 content_meta_path=None):


        self.engagement_path = engagement_path
        self.content_meta_path = content_meta_path

        self.objects_dir = 'sample_data'  #TODO change this
        self.numerical_features = []
        self.categorical_features = []

        self.postprocessor = None
        self.model = None

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """
        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """
        raise NotImplementedError("you need to implement this")

    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """
        Returns
          pd.DataFrame: Content feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """
        raise NotImplementedError("you need to implement this")

    def get_Ys(self) -> pd.DataFrame:
        """Engineers taget variable.
        Args
            data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """

        raise NotImplementedError("you need to implement this")

    def feature_generation(self, is_train=False) -> pd.DataFrame:
        """Generate features. If is_train, will generate features for user-content pairs
        exist in self.engagement_data. Else, will generate features for
        all possible user-content pairs.

        Args:
            is_train (bool): Whether in training mode.

        Returns:
            pd.DataFrame: Feature dataframe.

        """

        user_feature_df, user_num_feats, user_cat_feats = self.feature_generation_user()
        content_feature_df, content_num_feats, content_cat_feats = self.feature_generation_content()
        self.user_feature_df = user_feature_df
        self.content_feature_df = content_feature_df

        self.numerical_features = user_num_feats + content_num_feats
        self.categorical_features = user_cat_feats + content_cat_feats

        if is_train:
            interaction_pairs = self.engagement_data[
                ['user_id', 'content_id']].drop_duplicates()

        else:
            all_users = self.engagement_data['user_id'].drop_duplicates().tolist()
            all_contents = self.generated_content_metadata['content_id'].drop_duplicates().tolist()

            interaction_pairs = [(u, c) for u in all_users for c in all_contents]
            interaction_pairs = pd.DataFrame(interaction_pairs, columns=['user_id', 'content_id'])

        features_df = pd.merge(interaction_pairs,
                               user_feature_df, on='user_id', how='left')

        features_df = pd.merge(features_df,
                               content_feature_df, on='content_id', how='left')

        return features_df


    def get_engagement_data(self, user_id=None, content_ids=None):

        if self.engagement_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.engagement_path, sep="\t")

        if content_ids is not None:
            df = df[df['content_id'].isin(content_ids)]

        if user_id is not None:
            df = df[df['user_id'] == user_id]

        return df

    def get_generated_content_metadata(self, content_ids=None):

        if self.content_meta_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.content_meta_path, sep="\t")

        if content_ids is not None:
            df = df[df['content_id'].isin(content_ids)]

        return df

    def get_user_data(self, user_id=None):

        if self.engagement_path is None:
            #TODO: read from database
            pass
        else:
            df = pd.read_csv(self.engagement_path, sep="\t")

        if user_id is not None:
            df = df[df['user_id'] == user_id]

        return df

    def gather_data(self, user_id, content_ids):
        self.engagement_data = self.get_engagement_data(user_id, content_ids)
        self.generated_content_metadata = self.get_generated_content_metadata(content_ids)
        self.user_data = self.get_user_data(user_id)

        if len(self.engagement_data) == 0:
            raise Exception("either user_id or content_ids leads to empty engagement_data")

        if len(self.generated_content_metadata) == 0:
            raise Exception("content_ids leads to empty generated_content_metadata")

        if len(self.user_data) == 0:
            raise Exception("user_id leads to empty user_data")

    def postprocess_feature(self, features_df: pd.DataFrame, is_train=False) -> pd.DataFrame:
        """Applied postprocessings (one-hot encoding & scaler) to the feature dataframe.

        Args:
            features_df (pd.DataFrame): Input feature dataframe.
            is_train (bool): Whether in training mode. If True, will fit the
                Postprocessor() and save to a pickle file. Else, will load the
                saved Postprocessor() and use it.

        Returns:
            pd.DataFrame: Output feature dataframe.
        """

        if is_train:
            self.postprocessor = Postprocessor(self.numerical_features, self.categorical_features)
            features_df = self.postprocessor.fit_transform(features_df)
            self.save_postprocessor()

        else:
            self.postprocessor = self.load_postprocessor()
            features_df = self.postprocessor.transform(features_df)

        self.all_numeric_features = self.numerical_features + self.postprocessor.encode_cols


        return features_df


    def gen_model_input(self,
                        user_id: Optional[int] = None,
                        content_ids: Optional[list] = None,
                        is_train: bool = False) -> pd.DataFrame:
        """Generates input data (X) for model.

        Args:
            user_id (Optional[int]): User ID to generate features for.
                If None, will generate features for all available users in self.engagement_data.
            content_ids (Optional[list]): List of content ID to generate features for.
                If None, will generate features for all available contents in self.engagement_data.
            is_train (bool): Whether in training mode. If True, will generate
                features for user-content pairs exist in self.engagement_data.
                Else, will generate features for all possible user-content pairs.

        Returns:
            pd.DataFrame: Dataframe of features with 2-level index of ('user_id', 'content_id').
        """

        self.gather_data(user_id, content_ids)
        features_df = self.feature_generation(is_train)
        features_df = self.postprocess_feature(features_df, is_train)

        X = features_df.set_index(['user_id', 'content_id'])
        X = X[self.all_numeric_features]
        X = X.fillna(0)

        return X


    def gen_target_vars(self,
                        engagement_data: Optional[pd.DataFrame] = None
                        ) -> pd.DataFrame:
        """Wrapper to generate target variables.

        Args:
            engagement_data (Optional[pd.DataFrame]): Engagement data. If None,
                will use self.engagement_data which is loaded for training.
                For testing, parse in the engagement_data for testing.

        Returns:
            pd.DataFrame: Dataframe of 3 columns; 'like', 'dislike', 'engage_time'
                and 2-level index of ('user_id', 'content_id').
        """

        if engagement_data is None:
            engagement_data = self.engagement_data

        target_df = self.get_Ys(engagement_data)

        return target_df.set_index(['user_id', 'content_id'])


    def save_postprocessor(self):

        with open(f'{self.objects_dir}/postprocessor.pkl', 'wb') as f:
            pickle.dump(self.postprocessor, f)

    def load_postprocessor(self):

        with open(f'{self.objects_dir}/postprocessor.pkl', 'rb') as f:
            return pickle.load(f)

    def load_model(self):
        raise NotImplementedError("you need to implement this")

    def predict(self, X) -> Tuple[list, list, list]:
        raise NotImplementedError("you need to implement this")

    def rank(self, pred_score):
        raise NotImplementedError("you need to implement this")

    def score(self,
              user_id: Optional[int] = None,
              content_ids: Optional[list] = None) -> pd.DataFrame:
        """Predict the scores.

        Args:
            user_id (Optional[int]): User ID to generate features for.
                If None, will generate features for all available users in self.engagement_data.
            content_ids (Optional[list]): List of content ID to generate features for.
                If None, will generate features for all available contents in self.engagement_data.

        Returns:
            pd.DataFrame: Predicted score dataframe with 2-level index of (user_id, content_id).
                The dataframe also comes with the original content metadata which also
                can be used for ranking.
        """

        X = self.gen_model_input(user_id, content_ids, is_train=False)

        pred_like, pred_dislike, pred_engtime = self.predict(X)

        pred_df = pd.DataFrame(np.array([pred_like, pred_dislike, pred_engtime]).T,
                               index=X.index,
                               columns=['like', 'dislike', 'engage_time']).reset_index()

        pred_df = pd.merge(self.generated_content_metadata,
                           pred_df,
                           how='right',
                           on='content_id')

        return pred_df.set_index(['user_id', 'content_id'])

    def recommend(self, user_id, content_ids=None, top_k=20):

        score_df = self.score(user_id, content_ids).reset_index()

        rank = self.rank(score_df, user_id, content_ids)

        return rank[:top_k]




In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import mean_squared_error


def evaluate(true_df: pd.DataFrame,
             pred_df: pd.DataFrame,
             thres_like: float = 0.5,
             thres_dislike: float = 0.5
             ) -> dict:

    """Compute evaluation metrics.

    Args:
        true_df (pd.DataFrame): Dataframe of true target variables.
        pred_df (pd.DataFrame): Dataframe of predicted target variables.
        thres_like (float): Probability threshold to consider a prediction as like.
        thres_dislike (float): Probability threshold to consider as a prediction dislike.

    Returns:
        dict: Dictionary of metrics.
    """

    true_df = true_df.reset_index()
    pred_df = pred_df[['like', 'dislike', 'engage_time']].reset_index()

    pred_df['like'] = (pred_df['like'] > thres_like).astype(int)
    pred_df['dislike'] = (pred_df['dislike'] > thres_dislike).astype(int)

    actual_user_content = true_df[['user_id', 'content_id']]
    pred_user_content = pred_df[['user_id', 'content_id']]

    common_user_content = pd.merge(actual_user_content,
                                   pred_user_content,
                                   how='inner',
                                   on=['user_id', 'content_id'])

    true_df = pd.merge(common_user_content,
                         true_df,
                         how='left',
                         on=['user_id', 'content_id'])


    pred_df = pd.merge(common_user_content,
                       pred_df,
                       how='left',
                       on=['user_id', 'content_id'])


    metrics = {}
    for col in ['like', 'dislike', 'engage_time']:
        metrics[col] = {}

        if col == 'engage_time':
            metrics[col]['rmse'] = np.sqrt(mean_squared_error(true_df[col], pred_df[col]))
        else:
            metrics[col]['precision'] = precision_score(true_df[col], pred_df[col])
            metrics[col]['recall'] = recall_score(true_df[col], pred_df[col])

    return metrics

# Your Implementation

1. Proprocessor



1) Experimented with different strategies for handling duplicate (user_id, content_id) rows in feature generation and get_Y

2) Experimented with different user and content features

In [None]:
class DataCollectorExample(DataCollector):

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.user_data[['user_id']].drop_duplicates().copy()

        np.random.seed(42)
        feature_df[['user_feat_0', 'user_feat_1']] = np.random.rand(len(feature_df), 2)

        return feature_df, ['user_feat_0', 'user_feat_1'], []

    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates content features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.generated_content_metadata[['content_id']].drop_duplicates().copy()

        np.random.seed(1234)
        feature_df[['content_feat_0', 'content_feat_1']] = np.random.rand(len(feature_df), 2)

        feature_df['content_feat_2'] = np.random.choice(['a', 'b', 'c'], len(feature_df))


        return feature_df, ['content_feat_0', 'content_feat_1'], ['content_feat_2']

    def get_Ys(self, engagement_data) -> pd.DataFrame:
        """Engineers taget variable that you are predicting.
        Args
            engagement_data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """

        # Filter the data for the "like" engagement type
        like_data = engagement_data[engagement_data['engagement_type'] == 'Like'].copy()

        # Sort the "MillisecondsEngagedWith" data by user, content, and timestamp in descending order
        engage_data = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith'].copy()

        # Keep only the latest row for each unique combination of user and content
        like_data_sorted = like_data.sort_values(by=['user_id', 'content_id', 'created_date'], ascending=[True, True, False])
        like_data= like_data_sorted.drop_duplicates(subset=['user_id', 'content_id'], keep='first')

        # Calculate the average engagement value for "MillisecondsEngagedWith" for the same user and content
        engage_data['engagement_value'] = engage_data['engagement_value'].fillna(0)
        engage_data['engagement_value'] = engage_data.groupby(['user_id', 'content_id'])['engagement_value'].transform('mean')
        engage_data.reset_index(drop=True, inplace=True)
        #engage_data = engage_data.drop_duplicates(subset=['user_id', 'content_id'], keep='first')

        # Merge the "like" data and the averaged "MillisecondsEngagedWith" data
        target_df= pd.concat([like_data, engage_data], ignore_index=True, sort=False)
        target_df = target_df.drop_duplicates(subset=['user_id', 'content_id']).copy()

        # Set "like", "dislike", and "engage_time" based on engagement_value or the averaged value
        target_df['like'] = np.where((target_df['engagement_type'] == 'Like') & (target_df['engagement_value'] == 1), 1, 0)
        target_df['dislike'] = np.where((target_df['engagement_type'] == 'Like') & (target_df['engagement_value'] == -1), 1, 0)

        # Set "engage_time" based on engagement_type and engagement_value
        target_df['engage_time'] = np.where(target_df['engagement_type'] == 'MillisecondsEngagedWith', target_df['engagement_value'],0)


        # Select and rename the required columns
        target_df = target_df[['user_id', 'content_id', 'like', 'dislike', 'engage_time']].copy()

        return target_df


    def predict(self, X: pd.DataFrame) -> Tuple[list, list, list]:
        """Predicts the 3 target variables by using the model that you trained.
        Make sure you load the model properly.

        Args:
            X (pd.DataFrame): Feature dataframe with 2-level index of (user_id, content_id)

        Returns:
            (list, list, list): (predicted prbability of like,
                                 predicted probability of dislike,
                                 predicted engagement time)
        """

        model = self.load_model()

        pred_like = model['like'].predict(X)
        pred_dislike = model['dislike'].predict(X)
        pred_engtime = model['engage_time'].predict(X)

        return pred_like, pred_dislike, pred_engtime

    def rank(self,
             score_df: pd.DataFrame,
             user_id: int,
             content_ids: Optional[list] = None) -> list:

        """Ranks the items for a given user based on your own criteria.

        Args:
            score_df (pd.DataFrame): Predicted-score Dataframe of columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time', and
                also columns for content metadata.
            user_id (int): User ID to rank the items for.
            content_ids (Optional[list]): List of content ids to be considered for ranking.
        """

        score_df = score_df[score_df['user_id'] == user_id]
        ranked_pred = score_df.sort_values('like', ascending=False)

        return ranked_pred['content_id'].tolist()

    def load_model(self) -> object:
        """Loads your model. Since different ML frameworks requires different
        ways to load the model. Change this to reflect your choice of framework.

        Returns:
            object: Model object
        """

        with open(f'{self.objects_dir}/model.pkl', 'rb') as f:
            return pickle.load(f)

In [None]:
# Filter rows where engagement_type is 'MillisecondsEngagedWith'
milliseconds_engaged_with = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith']

# Get unique combinations of user_id and content_id for 'MillisecondsEngagedWith'
unique_with_milliseconds = milliseconds_engaged_with[['user_id', 'content_id']].drop_duplicates()

# Get all unique combinations of user_id and content_id
all_combinations = engagement_data[['user_id', 'content_id']].drop_duplicates()

# Find combinations with no 'MillisecondsEngagedWith'
combinations_with_no_milliseconds = all_combinations.merge(unique_with_milliseconds,
                                                           on=['user_id', 'content_id'],
                                                           how='left',
                                                           indicator=True)
combinations_with_no_milliseconds = combinations_with_no_milliseconds[
    combinations_with_no_milliseconds['_merge'] == 'left_only'
]


In [None]:
combinations_with_no_milliseconds

Unnamed: 0,user_id,content_id,_merge
1,30,77269,left_only
20,13,95201,left_only
27,99,85774,left_only
90,112,119813,left_only
93,45,57253,left_only
...,...,...,...
146627,48,96933,left_only
146640,7,86593,left_only
146648,13,91628,left_only
146652,15,112843,left_only


In [None]:
class DataCollectorExample(DataCollector):

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        # Filtering like/dislike engagements
        like_data = self.user_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_like_data = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Getting total likes for each user
        like_engagements = latest_like_data[(latest_like_data['engagement_value']==1)].copy()
        like_feature_df = like_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        like_feature_df.rename(columns={'engagement_value': 'user_likes'}, inplace=True)
        # Fill NaN values with 0 (users with no "like" engagements)
        like_feature_df['user_likes'].fillna(0, inplace=True)


        # Getting total dislikes for each user
        dislike_engagements = latest_like_data[(latest_like_data['engagement_value']==-1)].copy()
        dislike_feature_df = dislike_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        dislike_feature_df.rename(columns={'engagement_value': 'user_dislikes'}, inplace=True)
        # Fill NaN values with 0 (users with no "dislike" engagements)
        dislike_feature_df['user_dislikes'].fillna(0, inplace=True)

        # Getting average engage time for each user
        time_engagements = self.user_data[self.user_data['engagement_type'] == 'MillisecondsEngagedWith'].copy()
        engage_feature_df = time_engagements.groupby('user_id')['engagement_value'].mean().reset_index()
        engage_feature_df.rename(columns={'engagement_value': 'user_engagetime'}, inplace=True)
        # Fill NaN values with 0 (users with no engagment time data)
        engage_feature_df['user_engagetime'].fillna(0, inplace=True)

        feature_df = pd.merge(like_feature_df, dislike_feature_df , on='user_id', how='left')
        feature_df = pd.merge(feature_df, engage_feature_df , on='user_id', how='left')

        return feature_df, ['user_likes', 'user_dislikes', 'user_engagetime'], []


    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates content features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.generated_content_metadata.copy()

        # numerical feature 1: (average) guidance scale
        mean_engage = feature_df["guidance_scale"].mean()
        guide_df = feature_df.groupby('content_id')['guidance_scale'].mean().reset_index()
        guide_df = guide_df.rename(columns={'guidance_scale': 'content_guidance_scale'})
        feature_df = pd.merge(feature_df, guide_df, on='content_id', how='left')
        feature_df['content_guidance_scale'].fillna(mean_engage, inplace=True)

        # numerical feature 2: num inference steps
        mean_inf = feature_df["num_inference_steps"].mean()
        inf_df = feature_df.groupby('content_id')['num_inference_steps'].mean().reset_index()
        inf_df = inf_df.rename(columns={'num_inference_steps': 'content_inference_steps'})
        feature_df = pd.merge(feature_df, inf_df, on='content_id', how='left')
        feature_df['content_inference_steps'].fillna(mean_inf, inplace=True)


        # categorical feature 1: source
        feature_df['content_source'] = 'other'
        feature_df.loc[feature_df['source'] == 'human_prompts', 'content_source'] = 'human_prompts'
        feature_df.loc[feature_df['source'] == 'r/Showerthoughts', 'content_source'] = 'r/Showerthoughts'


        # categorical feature 2: artist style
        style_list = [
            'studio',
            'medieval',
            'anime',
            'kerry_james_marshall',
            'gta_v',
            'scifi',
            'van_gogh',
            'salvador_dali',
            'jean-michel_basquiat',
            'face_and_lighting'
        ]
        style_list = ['movie', 'empty']
        feature_df['content_style'] = feature_df['artist_style']
        feature_df['content_style'].fillna("empty", inplace=True)
        feature_df.loc[feature_df['content_style'].str.startswith('movie:'), 'content_style'] = 'movie'
        feature_df.loc[~feature_df['content_style'].isin(style_list), 'content_style'] = 'other'


        return feature_df, ['content_inference_steps'], ['content_source', 'content_style']


    def get_Ys(self, engagement_data) -> pd.DataFrame:
        """Engineers taget variable that you are predicting.
        Args
            engagement_data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """
        # Filtering Like-type engagements
        like_data = engagement_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_engagements = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Creating the target DataFrame with unique pairs of user_id and content_id
        target_df = engagement_data[['user_id', 'content_id']].drop_duplicates()

        # Merging latest engagements to update 'like' and 'dislike' columns
        target_df = pd.merge(target_df, latest_engagements[['user_id', 'content_id', 'engagement_value']],
                            on=['user_id', 'content_id'], how='left')

        # Updating 'like' and 'dislike' columns based on the latest engagement values
        target_df['like'] = (target_df['engagement_value'] == 1).astype(int)
        target_df['dislike'] = (target_df['engagement_value'] == -1).astype(int)

        # Filling NaN values with 0 for pairs without like/dislike
        target_df.fillna(0, inplace=True)


        # Set "engage_time" based on engagement_type and engagement_value
        # assign existing engagement time if doesn't have that data, assign zero
        engage_times = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith']

        engage_times = engage_times.groupby(['user_id', 'content_id'])['engagement_value'].mean().reset_index()
        engage_times.rename(columns={'engagement_value': 'engage_time'}, inplace=True)

        target_df = pd.merge(target_df, engage_times[['user_id', 'content_id', 'engage_time']],
                            on=['user_id', 'content_id'], how='left')

        """
        # Grouping by 'user_id' and 'content_id' to get the earliest 'created_date' for each pair
        engage_times = engage_times.sort_values('created_date').groupby(['user_id', 'content_id']).head(1)
        """
        # Filling NaN values with 0 for pairs without engage_time
        target_df['engage_time'].fillna(0, inplace=True)

        # Select and rename the required columns
        target_df = target_df[['user_id', 'content_id', 'like', 'dislike', 'engage_time']].copy()

        return target_df


    def predict(self, X: pd.DataFrame) -> Tuple[list, list, list]:
        """Predicts the 3 target variables by using the model that you trained.
        Make sure you load the model properly.

        Args:
            X (pd.DataFrame): Feature dataframe with 2-level index of (user_id, content_id)

        Returns:
            (list, list, list): (predicted prbability of like,
                                 predicted probability of dislike,
                                 predicted engagement time)
        """

        model = self.load_model()

        pred_like = model['like'].predict(X)
        pred_dislike = model['dislike'].predict(X)
        pred_engtime = model['engage_time'].predict(X)

        return pred_like, pred_dislike, pred_engtime

    def rank(self,
             score_df: pd.DataFrame,
             user_id: int,
             content_ids: Optional[list] = None) -> list:

        """Ranks the items for a given user based on your own criteria.

        Args:
            score_df (pd.DataFrame): Predicted-score Dataframe of columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time', and
                also columns for content metadata.
            user_id (int): User ID to rank the items for.
            content_ids (Optional[list]): List of content ids to be considered for ranking.
        """

        score_df = score_df[score_df['user_id'] == user_id]
        ranked_pred = score_df.sort_values('like', ascending=False)

        return ranked_pred['content_id'].tolist()

    def load_model(self) -> object:
        """Loads your model. Since different ML frameworks requires different
        ways to load the model. Change this to reflect your choice of framework.

        Returns:
            object: Model object
        """

        with open(f'{self.objects_dir}/model.pkl', 'rb') as f:
            return pickle.load(f)

In [None]:
content_df = pd.read_csv('sample_data/generated_content_metadata.csv', sep="\t")
content_df

Unnamed: 0,content_id,guidance_scale,num_inference_steps,artist_style,source
0,28598,8,75,movie: Prestige-The,Prestige-The
1,28599,7,75,movie: Prestige-The,Prestige-The
2,28600,9,75,movie: Prestige-The,Prestige-The
3,28601,8,75,movie: Prestige-The,Prestige-The
4,28602,8,75,movie: Prestige-The,Prestige-The
...,...,...,...,...,...
107700,136300,4,20,,A Dream by Edgar Allan Poe
107701,136301,4,20,,Romance by Edgar Allan Poe
107702,136302,4,20,,The Lake by Edgar Allan Poe
107703,136303,4,20,,Hymn To Aristogeiton And Harmodius by Edgar Al...


In [None]:
content_df['artist_style'].isna().sum()

31093

In [None]:
content_df['source'].isna().sum()

0

In [None]:
len(content_df['content_id'].unique())

107705

In [None]:
content_df['guidance_scale'].unique()

array([ 8,  7,  9,  4, 10, 17,  6, 20, 15, 12, 16, 11, 14, 19, 13, 18, 40,
        1, 22,  5, 24, 28, 25,  3, 21, 30, 45,  0, 23, 34, 44, 50, 29, 35,
       27, 42, 60, 38,  2, 36, 33, 41, 55, 39])

In [None]:
content_df['num_inference_steps'].unique()

array([ 75,  50, 100,  20])

In [None]:
X_train = data_practice.gen_model_input(is_train=True)

In [None]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,user_likes,user_dislikes,user_engagetime,content_guidance_scale,content_inference_steps,content_source_human_prompts,content_source_other,content_source_r/Showerthoughts,content_style_anime,content_style_face_and_lighting,content_style_gta_v,content_style_jean-michel_basquiat,content_style_kerry_james_marshall,content_style_medieval,content_style_other,content_style_salvador_dali,content_style_scifi,content_style_studio,content_style_van_gogh
user_id,content_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,38318,-1.191375,0.991836,-0.355709,0.667778,0.406514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,74487,-1.191375,0.991836,-0.355709,-1.502674,0.406514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,78133,-1.191375,0.991836,-0.355709,-1.502674,0.406514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,80916,-1.191375,0.991836,-0.355709,-1.502674,0.406514,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,89403,-1.191375,0.991836,-0.355709,-0.417448,0.406514,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,31775,-0.346505,0.271210,1.182456,-0.055706,0.406514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
101,31616,-0.437372,0.340645,0.484672,-1.502674,0.406514,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
101,90990,-0.437372,0.340645,0.484672,-0.417448,0.406514,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
101,91758,-0.437372,0.340645,0.484672,-0.417448,0.406514,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
data_practice.feature_generation_content()

(        content_id  guidance_scale  num_inference_steps         artist_style  \
 0            28598               8                   75  movie: Prestige-The   
 1            28599               7                   75  movie: Prestige-The   
 2            28600               9                   75  movie: Prestige-The   
 3            28601               8                   75  movie: Prestige-The   
 4            28602               8                   75  movie: Prestige-The   
 ...            ...             ...                  ...                  ...   
 107700      136300               4                   20                  NaN   
 107701      136301               4                   20                  NaN   
 107702      136302               4                   20                  NaN   
 107703      136303               4                   20                  NaN   
 107704      136304               4                   20                  NaN   
 
                          

In [None]:
data_practice.feature_generation_user()

(     user_id  user_likes  user_dislikes  user_engagetime
 0          1          12           -5.0      8252.100228
 1          4         788         -400.0      8994.178166
 2          5         123          -68.0    803651.618492
 3          6         247         -332.0     58446.510204
 4          7         116         -123.0     28379.503856
 ..       ...         ...            ...              ...
 99       110         194         -211.0      2971.729535
 100      111         154         -105.0     58186.126283
 101      112         604         -866.0      4550.324905
 102      113          13          -10.0      3050.506173
 103      115          10           -4.0       726.035714
 
 [104 rows x 4 columns],
 ['user_likes', 'user_dislikes', 'user_engagetime'],
 [])

In [None]:
# Set "engage_time" based on engagement_type and engagement_value
# assign existing engagement time if doesn't have that data, assign zero
engage_times = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith']
engage_times = engage_times.groupby(['user_id', 'content_id'])['engagement_value'].mean().reset_index()
engage_times.rename(columns={'engagement_value': 'engage_time'}, inplace=True)

target_df = pd.merge(target_df, engage_times[['user_id', 'content_id', 'engage_time']],
                    on=['user_id', 'content_id'], how='right')

# Filling NaN values with 0 for pairs without engage_time
target_df['engage_time'].fillna(0, inplace=True)

# Select and rename the required columns
target_df = target_df[['user_id', 'content_id', 'like', 'dislike', 'engage_time']].copy()

In [None]:
target_df['engage_time'].describe()

count    1.689180e+05
mean     3.808310e+04
std      2.449900e+06
min      2.510000e+02
25%      8.160000e+02
50%      1.483000e+03
75%      2.610000e+03
max      5.136718e+08
Name: engage_time, dtype: float64

In [None]:
target_df['dislike'].describe()

count    168918.000000
mean          0.205242
std           0.403879
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: dislike, dtype: float64

In [None]:
target_df['like'].describe()

count    168918.000000
mean          0.252057
std           0.434195
min           0.000000
25%           0.000000
50%           0.000000
75%           1.000000
max           1.000000
Name: like, dtype: float64

In [None]:
target_df['like'].unique()

array([0, 1])

### 2. Predict

1. Experimented with various models (Random Forest, KNN, Neural Network, Linear Regression, XGBoost, Decision Trees, SVR, and Gradient Boosting)
2. Tuned Neural Network and Random Forest Regressor and Classifier (learning rate, batch size, number of training epochs, weight initialization, model architecture)
3. Implemented and tuned oversampling the minority class for neural network and random forests

In [None]:
class DataCollectorExample(DataCollector):

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        # Filtering like/dislike engagements
        like_data = self.user_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_like_data = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Getting total likes for each user
        like_engagements = latest_like_data[(latest_like_data['engagement_value']==1)].copy()
        like_feature_df = like_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        like_feature_df.rename(columns={'engagement_value': 'user_likes'}, inplace=True)
        # Fill NaN values with 0 (users with no "like" engagements)
        like_feature_df['user_likes'].fillna(0, inplace=True)


        # Getting total dislikes for each user
        dislike_engagements = latest_like_data[(latest_like_data['engagement_value']==-1)].copy()
        dislike_feature_df = dislike_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        dislike_feature_df.rename(columns={'engagement_value': 'user_dislikes'}, inplace=True)
        # Fill NaN values with 0 (users with no "dislike" engagements)
        dislike_feature_df['user_dislikes'].fillna(0, inplace=True)

        # Getting average engage time for each user
        time_engagements = self.user_data[self.user_data['engagement_type'] == 'MillisecondsEngagedWith'].copy()
        engage_feature_df = time_engagements.groupby('user_id')['engagement_value'].mean().reset_index()
        engage_feature_df.rename(columns={'engagement_value': 'user_engagetime'}, inplace=True)
        # Fill NaN values with 0 (users with no engagment time data)
        engage_feature_df['user_engagetime'].fillna(0, inplace=True)

        feature_df = pd.merge(like_feature_df, dislike_feature_df , on='user_id', how='left')
        feature_df = pd.merge(feature_df, engage_feature_df , on='user_id', how='left')

        return feature_df, ['user_likes', 'user_dislikes', 'user_engagetime'], []


    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates content features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.generated_content_metadata.copy()

        # numerical feature 1: (average) guidance scale
        mean_engage = feature_df["guidance_scale"].mean()
        guide_df = feature_df.groupby('content_id')['guidance_scale'].mean().reset_index()
        guide_df = guide_df.rename(columns={'guidance_scale': 'content_guidance_scale'})
        feature_df = pd.merge(feature_df, guide_df, on='content_id', how='left')
        feature_df['content_guidance_scale'].fillna(mean_engage, inplace=True)

        # numerical feature 2: num inference steps
        mean_inf = feature_df["num_inference_steps"].mean()
        inf_df = feature_df.groupby('content_id')['num_inference_steps'].mean().reset_index()
        inf_df = inf_df.rename(columns={'num_inference_steps': 'content_inference_steps'})
        feature_df = pd.merge(feature_df, inf_df, on='content_id', how='left')
        feature_df['content_inference_steps'].fillna(mean_inf, inplace=True)


        # categorical feature 1: source
        feature_df['content_source'] = 'other'
        feature_df.loc[feature_df['source'] == 'human_prompts', 'content_source'] = 'human_prompts'
        feature_df.loc[feature_df['source'] == 'r/Showerthoughts', 'content_source'] = 'r/Showerthoughts'


        # categorical feature 2: artist style
        style_list = [
            'studio',
            'medieval',
            'anime',
            'kerry_james_marshall',
            'gta_v',
            'scifi',
            'van_gogh',
            'salvador_dali',
            'jean-michel_basquiat',
            'face_and_lighting'
        ]
        #style_list = ['movie', 'empty']
        feature_df['content_style'] = feature_df['artist_style']
        feature_df['content_style'].fillna("empty", inplace=True)
        feature_df.loc[feature_df['content_style'].str.startswith('movie:'), 'content_style'] = 'movie'
        feature_df.loc[~feature_df['content_style'].isin(style_list), 'content_style'] = 'other'


        return feature_df, ['content_inference_steps'], ['content_source', 'content_style']


    def get_Ys(self, engagement_data) -> pd.DataFrame:
        """Engineers taget variable that you are predicting.
        Args
            engagement_data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """
        # Filtering Like-type engagements
        like_data = engagement_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_engagements = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Creating the target DataFrame with unique pairs of user_id and content_id
        target_df = engagement_data[['user_id', 'content_id']].drop_duplicates()

        # Merging latest engagements to update 'like' and 'dislike' columns
        target_df = pd.merge(target_df, latest_engagements[['user_id', 'content_id', 'engagement_value']],
                            on=['user_id', 'content_id'], how='left')

        # Updating 'like' and 'dislike' columns based on the latest engagement values
        target_df['like'] = (target_df['engagement_value'] == 1).astype(int)
        target_df['dislike'] = (target_df['engagement_value'] == -1).astype(int)

        # Filling NaN values with 0 for pairs without like/dislike
        target_df.fillna(0, inplace=True)


        # Set "engage_time" based on engagement_type and engagement_value
        # assign existing engagement time if doesn't have that data, assign zero
        engage_times = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith']

        #engage_times = engage_times.groupby(['user_id', 'content_id'])['engagement_value'].mean().reset_index()
        engage_times = engage_times.groupby(['user_id', 'content_id'])['engagement_value'].max().reset_index()

        engage_times.rename(columns={'engagement_value': 'engage_time'}, inplace=True)

        target_df = pd.merge(target_df, engage_times[['user_id', 'content_id', 'engage_time']],
                            on=['user_id', 'content_id'], how='left')

        """
        # Grouping by 'user_id' and 'content_id' to get the earliest 'created_date' for each pair
        engage_times = engage_times.sort_values('created_date').groupby(['user_id', 'content_id']).head(1)
        """
        # Filling NaN values with 0 for pairs without engage_time
        target_df['engage_time'].fillna(0, inplace=True)

        # Select and rename the required columns
        target_df = target_df[['user_id', 'content_id', 'like', 'dislike', 'engage_time']].copy()

        return target_df


    def predict(self, X: pd.DataFrame) -> Tuple[list, list, list]:
        """Predicts the 3 target variables by using the model that you trained.
        Make sure you load the model properly.

        Args:
            X (pd.DataFrame): Feature dataframe with 2-level index of (user_id, content_id)

        Returns:
            (list, list, list): (predicted prbability of like,
                                 predicted probability of dislike,
                                 predicted engagement time)
        """

        model = self.load_model()

        pred_like = model['like'].predict(X).flatten()
        pred_dislike = model['dislike'].predict(X).flatten()
        pred_engtime = model['engage_time'].predict(X).flatten()

        return pred_like, pred_dislike, pred_engtime

    def rank(self,
             score_df: pd.DataFrame,
             user_id: int,
             content_ids: Optional[list] = None) -> list:

        """Ranks the items for a given user based on your own criteria.

        Args:
            score_df (pd.DataFrame): Predicted-score Dataframe of columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time', and
                also columns for content metadata.
            user_id (int): User ID to rank the items for.
            content_ids (Optional[list]): List of content ids to be considered for ranking.
        """

        user_df = score_df[score_df['user_id'] == user_id]

        def select_artist_style(style):
            if pd.isna(style) or str(style).startswith('movie:'):
                return 'other'
            else:
                return style

        user_df['selected_artiststyle'] = user_df['artist_style'].apply(select_artist_style)

        user_df['value'] = user_df['like'] - user_df['dislike'] + np.minimum(user_df['engage_time'], 10000)
        # engage_time no longer than 10s
        user_df_sorted = user_df.sort_values(by='value', ascending=False)

        sorted_content_ids = []
        last_artist_style = 1
        keep_styles = {'other', 'gta_v', 'medieval', 'detailed_portrait', 'van_gogh', 'unreal_engine', 'face_and_lighting', 'scifi', 'oil_on_canvas', 'anime', 'studio'}

        while not user_df_sorted.empty:
            selected_rows = user_df_sorted.loc[(user_df_sorted['selected_artiststyle'] != last_artist_style) | (user_df_sorted['selected_artiststyle'].isin(keep_styles))]
            if selected_rows.empty:
                break
            selected_row = selected_rows.iloc[0]
            sorted_content_ids.append(selected_row['content_id'])
            last_artist_style = selected_row['selected_artiststyle']
            user_df_sorted = user_df_sorted.drop(selected_row.name)

        return sorted_content_ids

    def load_model(self) -> object:
        """Loads your model. Since different ML frameworks requires different
        ways to load the model. Change this to reflect your choice of framework.

        Returns:
            object: Model object
        """

        with open(f'{self.objects_dir}/model.pkl', 'rb') as f:
            return pickle.load(f)

###Neural Networks + Oversampling + Tuning

In [None]:
#@title get training data
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement_train.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']


In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


# Model for 'like'
# Model for 'dislike' (assuming binary classification)

# Creating a Sequential model
model_like = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_like.fit(X_train, y_train['like'], epochs=20)
like_probs = model_like.predict(X_test)
like_predictions = np.array([0 if prob < 0.5 else 1 for prob in like_probs])
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")


# Model for 'dislike' (assuming binary classification)

# Creating a Sequential model
model_dislike = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_dislike.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_dislike.fit(X_train, y_train['dislike'], epochs=20)
dislike_probs = model_dislike.predict(X_test)
dislike_predictions = np.array([0 if prob < 0.5 else 1 for prob in dislike_probs])
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")


# Creating a Sequential model
model_engtime = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=20)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Root Mean Squared Error for 'engage_time': {engtime_mse ** 0.5}")


# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'like': 0.7668996079768196
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'dislike': 0.802863473666269
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Root Mean Squared Error for 'engage_time': 3898109.7364842338


In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']




In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5945945945945946, 'recall': 0.27848101265822783},
 'dislike': {'precision': 0.72, 'recall': 0.2571428571428571},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
thres_like = 0.4
thres_dislike = 0.4
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5538461538461539, 'recall': 0.45569620253164556},
 'dislike': {'precision': 0.625, 'recall': 0.42857142857142855},
 'engage_time': {'rmse': 91198.75894471485}}

oversampling minority class to increase recall (like = 1) ==> precision 40%, recall=60%, train accuracy=70%

oversampling minority class to 40% of majority to increase recall (like = 1) ==> {'like': {'precision': 0.5909090909090909, 'recall': 0.3291139240506329}, train accuracy=77%

oversampling minority class to 66% of majority to increase recall (like = 1) ==> {'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519}, train accuracy=76%


normally:

train accuracies: like:76%, dislike:80%

'like': {'precision': 0.5945945945945946, 'recall': 0.27848101265822783},

 'dislike': {'precision': 0.72, 'recall': 0.2571428571428571},

 'engage_time': {'rmse': 91198.75894471485}}

oversampling both like and dislike to 66% of majority:


'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519}, train acc: 76

'dislike': {'precision': 0.5512820512820513, 'recall': 0.6142857142857143},
train acc:77

'engage_time': {'rmse': 91198.75894471485}




oversampling dislike to 56% of majority:

{'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519},

 'dislike': {'precision': 0.5737704918032787, 'recall': 0.5},
train acc: 79
 'engage_time': {'rmse': 91198.75894471485}}


In [None]:
train_data = pd.concat([pd.DataFrame(X_train), pd.Series(y_train['like'], name='target_variable')], axis=1)
# Separate the majority and minority classes
majority_class = train_data[train_data['target_variable'] == 0]
minority_class = train_data[train_data['target_variable'] == 1]

In [None]:
from imblearn.over_sampling import SMOTE

# Assuming X_train and y_train are your feature and target variables
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train['like'])

In [None]:
from imblearn.over_sampling import SMOTE

# Assuming X_train and y_train are your feature and target variables
smote = SMOTE(random_state=42, sampling_strategy=0.4)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train['like'])

In [None]:
from imblearn.over_sampling import SMOTE

# Assuming X_train and y_train are your feature and target variables
smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train['like'])

In [None]:
# Creating a Sequential model
model_like_resampled = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like_resampled.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_like_resampled.fit(X_resampled, y_resampled, epochs=20)
like_probs = model_like_resampled.predict(X_test)
like_predictions = np.array([0 if prob < 0.5 else 1 for prob in like_probs])
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'like': 0.6962331685699676


In [None]:
# Creating a Sequential model
model_like_resampled = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like_resampled.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_like_resampled.fit(X_resampled, y_resampled, epochs=20)
like_probs = model_like_resampled.predict(X_test)
like_predictions = np.array([0 if prob < 0.5 else 1 for prob in like_probs])
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'like': 0.7672745866712118


In [None]:
# Creating a Sequential model
model_like_resampled = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like_resampled.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_like_resampled.fit(X_resampled, y_resampled, epochs=20)
like_probs = model_like_resampled.predict(X_test)
like_predictions = np.array([0 if prob < 0.5 else 1 for prob in like_probs])
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'like': 0.7552411794784387


In [None]:
y_train['dislike'].sum() / (len(y_train['dislike']) - y_train['dislike'].sum())

0.2923579843335595

In [None]:
y_train['dislike'].sum() / (len(y_train['dislike']))

0.22622058893714503

In [None]:
len(y_train['dislike'])

146671

In [None]:
from imblearn.over_sampling import SMOTE

# Assuming X_train and y_train are your feature and target variables
smote = SMOTE(random_state=42, sampling_strategy=0.56)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train['dislike'])

In [None]:
# dislike: Creating a Sequential model with minority 56% of majority
model_dislike = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_dislike.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_dislike.fit(X_resampled, y_resampled, epochs=20)
dislike_probs = model_dislike.predict(X_test)
dislike_predictions = np.array([0 if prob < 0.5 else 1 for prob in dislike_probs])
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'dislike': 0.791955002556673


In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train['dislike'])

In [None]:
# dislike: Creating a Sequential model with minority 66%
model_dislike = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_dislike.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_dislike.fit(X_resampled, y_resampled, epochs=20)
dislike_probs = model_dislike.predict(X_test)
dislike_predictions = np.array([0 if prob < 0.5 else 1 for prob in dislike_probs])
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for 'dislike': 0.7673768535878643


In [None]:
# Save the models to a file
model = {
    'like': model_like_resampled,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']




In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519},
 'dislike': {'precision': 0.5737704918032787, 'recall': 0.5},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
1/ (1/0.5737704918032787 + 1/0.5)  * 2

0.5343511450381679

In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519},
 'dislike': {'precision': 0.5512820512820513, 'recall': 0.6142857142857143},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
1/ (1/0.5512820512820513 + 1/0.6142857142857143) * 2

0.5810810810810811

In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5616438356164384, 'recall': 0.5189873417721519},
 'dislike': {'precision': 0.72, 'recall': 0.2571428571428571},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.44715447154471544, 'recall': 0.6962025316455697},
 'dislike': {'precision': 0.72, 'recall': 0.2571428571428571},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5909090909090909, 'recall': 0.3291139240506329},
 'dislike': {'precision': 0.72, 'recall': 0.2571428571428571},
 'engage_time': {'rmse': 91198.75894471485}}

In [None]:
y_resampled.sum() / (len(y_resampled) - y_resampled.sum())

0.39999063363461806

In [None]:
y_resampled.sum() / (len(y_resampled))

0.28570950692446645

In [None]:
y_train['like'].sum() / (len(y_train) - y_train['like'].sum())

0.3737648105652602

In [None]:
y_train['like'].sum() / (len(y_train))

0.27207336196904613

In [None]:
y_resampled.sum()

85412

In [None]:
y_resampled

0         0
1         0
2         1
3         0
4         0
         ..
170819    1
170820    1
170821    1
170822    1
170823    1
Name: like, Length: 170824, dtype: int64

In [None]:
recs = data_collector.recommend(user_id=8, content_ids=sample_contents, top_k=20)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']




In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE



# Model for 'like'

# Oversample minority class to 66% of majority class
smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_like, y_like = smote.fit_resample(X_train, y_train['like'])

# Creating a Sequential model
model_like = Sequential([
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_like.fit(X_like, y_like, epochs=50)


# Model for 'dislike'

# Oversample minority class to 66% of majority class
smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_dislike, y_dislike = smote.fit_resample(X_train, y_train['dislike'])

# Creating a Sequential model
model_dislike = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])
model_dislike.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_dislike.fit(X_dislike, y_dislike, epochs=50)


# Creating a Sequential model
model_engtime = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)
])
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=50)



# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/5

In [None]:
# Simulates contents filtered from previous stage.
# Feel free to change this to reflect your previous stage.

sample_contents = content_meta['content_id'].sample(frac=0.01)

In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']




In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5353535353535354, 'recall': 0.5196078431372549},
 'dislike': {'precision': 0.43137254901960786, 'recall': 0.5},
 'engage_time': {'rmse': 102165.67322659433}}

In [None]:
dislike_probs = model['dislike'].predict(X_train)
dislike_preds = np.where(dislike_probs > 0.5, 1, 0)
dislike_accuracy = accuracy_score(y_train['dislike'], dislike_preds)



In [None]:
dislike_accuracy

0.7782247342692148

In [None]:
like_probs = model['like'].predict(X_train)
like_preds = np.where(dislike_probs > 0.5, 1, 0)
like_accuracy = accuracy_score(y_train['like'], dislike_preds)
like_accuracy



0.6240088361025697

In [None]:
engtime_predictions = model_engtime.predict(X_train)
engtime_mse = mean_squared_error(y_train['engage_time'], engtime_predictions)
engtime_mse ** 0.5



2657244.520720369

###random forest regressor and classifer + oversampling

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

from imblearn.over_sampling import SMOTE

# Model for 'like'
smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_like, y_like = smote.fit_resample(X_train, y_train['like'])

model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_like, y_like)


# Model for 'dislike'
smote = SMOTE(random_state=42, sampling_strategy=0.66)
X_dislike, y_dislike = smote.fit_resample(X_train, y_train['dislike'])

model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_dislike, y_dislike)


# Model for 'engage_time'
model_engtime = RandomForestRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])


# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']


In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5662650602409639, 'recall': 0.5949367088607594},
 'dislike': {'precision': 0.5111111111111111, 'recall': 0.6571428571428571},
 'engage_time': {'rmse': 151103.34039714394}}

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error

In [None]:
dislike_probs = model['like'].predict(X_train)
dislike_preds = np.where(dislike_probs > 0.5, 1, 0)
dislike_accuracy = accuracy_score(y_train['dislike'], dislike_preds)

In [None]:
dislike_accuracy

0.6219225341069469

In [None]:
like_probs = model['dislike'].predict(X_train)
like_preds = np.where(like_probs > 0.5, 1, 0)
like_accuracy = accuracy_score(y_train['like'], like_preds)
like_accuracy

0.619188523975428

In [None]:
engtime_predictions = model_engtime.predict(X_train)
engtime_mse = mean_squared_error(y_train['engage_time'], engtime_predictions)
engtime_mse

7006509147213.162

In [None]:
#with max for engagetime

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time'
model_engtime = RandomForestRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Root Mean Squared Error for 'engage_time': {engtime_mse ** 0.5}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Accuracy for 'like': 0.7707877137953483
Accuracy for 'dislike': 0.8017399538379594
Root Mean Squared Error for 'engage_time': 2984430.57930276


In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']


In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5333333333333333, 'recall': 0.3037974683544304},
 'dislike': {'precision': 0.625, 'recall': 0.2857142857142857},
 'engage_time': {'rmse': 156037.20978833942}}

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time'
model_engtime = RandomForestRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Root Mean Squared Error for 'engage_time': {engtime_mse ** 0.5}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Accuracy for 'like': 0.7703233748402064
Accuracy for 'dislike': 0.7972633871502296
Root Mean Squared Error for 'engage_time': 3741611.1504308474


In [None]:
sample_contents = content_meta['content_id'].sample(frac=0.01)

In [None]:
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']


In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5869565217391305, 'recall': 0.34177215189873417},
 'dislike': {'precision': 0.6363636363636364, 'recall': 0.3},
 'engage_time': {'rmse': 156490.66564865594}}

###KNN Classifier + KNN Regressor

In [None]:
from sklearn.utils import resample

train_data = pd.concat([pd.DataFrame(X_train), pd.Series(y_train['like'], name='target_variable')], axis=1)
# Separate the majority and minority classes
majority_class = train_data[train_data['target_variable'] == 0]
minority_class = train_data[train_data['target_variable'] == 1]

# Downsample the majority class
downsampled_majority = resample(majority_class, replace=False, n_samples=len(minority_class), random_state=42)

# Combine the downsampled majority class with the original minority class
downsampled_data = pd.concat([downsampled_majority, minority_class])

# Split the downsampled data into features and labels
X_train_downsampled = downsampled_data.drop('target_variable', axis=1)
y_train_downsampled = downsampled_data['target_variable']

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=10)
knn_model.fit(X_train_downsampled, y_train_downsampled)

In [None]:
y_proba = knn_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
#k=10
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.72
Confusion Matrix:
[[10121   470]
 [ 3577   500]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.96      0.83     10591
           1       0.52      0.12      0.20      4077

    accuracy                           0.72     14668
   macro avg       0.63      0.54      0.52     14668
weighted avg       0.68      0.72      0.66     14668

ROC AUC: 0.54


In [None]:
#k=3
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.72
Confusion Matrix:
[[10197   394]
 [ 3687   390]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.96      0.83     10591
           1       0.50      0.10      0.16      4077

    accuracy                           0.72     14668
   macro avg       0.62      0.53      0.50     14668
weighted avg       0.67      0.72      0.65     14668

ROC AUC: 0.54


In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=3)
knn_model.fit(X_train, y_train['like'])

In [None]:
y_proba = knn_model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
#k=3
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.71
Confusion Matrix:
[[8671 1920]
 [2359 1718]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80     10591
           1       0.47      0.42      0.45      4077

    accuracy                           0.71     14668
   macro avg       0.63      0.62      0.62     14668
weighted avg       0.70      0.71      0.70     14668

ROC AUC: 0.68


In [None]:
#k=20
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.76
Confusion Matrix:
[[9651  940]
 [2642 1435]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.91      0.84     10591
           1       0.60      0.35      0.44      4077

    accuracy                           0.76     14668
   macro avg       0.69      0.63      0.64     14668
weighted avg       0.73      0.76      0.73     14668

ROC AUC: 0.75


In [None]:
#k=5
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.73
Confusion Matrix:
[[9022 1569]
 [2381 1696]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82     10591
           1       0.52      0.42      0.46      4077

    accuracy                           0.73     14668
   macro avg       0.66      0.63      0.64     14668
weighted avg       0.72      0.73      0.72     14668

ROC AUC: 0.70


###XGBoost Regressor + XGBoost Classifier

In [None]:
#@title get training data
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement_train.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

  like_data = self.user_data[engagement_data['engagement_type'] == 'Like']


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [None]:
from xgboost import XGBRegressor

model = XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.01,
                max_depth = 10, alpha = 10, n_estimators = 10)

In [None]:
model.fit(X_train, y_train['engage_time'])

In [None]:
y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test['engage_time'], y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 1783167.3329244216


In [None]:
from xgboost import XGBClassifier

model = XGBClassifier(
    learning_rate=0.1,  # Adjust as needed
    n_estimators=300,   # Adjust as needed
    max_depth=20,        # Adjust as needed
    subsample=0.5,      # Adjust as needed
    colsample_bytree=1.0,  # Adjust as needed
    random_state=42
)

In [None]:
model.fit(X_train, y_train['like'])
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.77
Confusion Matrix:
[[9758  833]
 [2580 1497]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.92      0.85     10591
           1       0.64      0.37      0.47      4077

    accuracy                           0.77     14668
   macro avg       0.72      0.64      0.66     14668
weighted avg       0.75      0.77      0.74     14668

ROC AUC: 0.77


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.77
Confusion Matrix:
[[9852  739]
 [2613 1464]]
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.93      0.85     10591
           1       0.66      0.36      0.47      4077

    accuracy                           0.77     14668
   macro avg       0.73      0.64      0.66     14668
weighted avg       0.76      0.77      0.75     14668

ROC AUC: 0.78


In [None]:
from xgboost import XGBClassifier

model = XGBClassifier(
    learning_rate=0.01,  # Adjust as needed
    n_estimators=100,   # Adjust as needed
    max_depth=10,        # Adjust as needed
    subsample=0.5,      # Adjust as needed
    colsample_bytree=0.8,  # Adjust as needed
    random_state=42
)

In [None]:
model.fit(X_train, y_train['like'])
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.76
Confusion Matrix:
[[10314   277]
 [ 3245   832]]
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.97      0.85     10591
           1       0.75      0.20      0.32      4077

    accuracy                           0.76     14668
   macro avg       0.76      0.59      0.59     14668
weighted avg       0.76      0.76      0.71     14668

ROC AUC: 0.78


In [None]:
from xgboost import XGBClassifier
model = XGBClassifier(
    learning_rate=0.1,  # Adjust as needed
    n_estimators=100,   # Adjust as needed
    max_depth=3,        # Adjust as needed
    subsample=0.8,      # Adjust as needed
    colsample_bytree=0.8,  # Adjust as needed
    random_state=42
)

In [None]:
model.fit(X_train, y_train['like'])
y_proba = model.predict_proba(X_test)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_test['like'], y_pred)
conf_matrix = confusion_matrix(y_test['like'], y_pred)
class_report = classification_report(y_test['like'], y_pred)
roc_auc = roc_auc_score(y_test['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.74
Confusion Matrix:
[[10459   132]
 [ 3718   359]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.99      0.84     10591
           1       0.73      0.09      0.16      4077

    accuracy                           0.74     14668
   macro avg       0.73      0.54      0.50     14668
weighted avg       0.74      0.74      0.65     14668

ROC AUC: 0.74


In [None]:
y_proba = model.predict_proba(X_train)[:, 1]
y_pred = (y_proba > 0.5).astype(int)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Evaluate the model
accuracy = accuracy_score(y_train['like'], y_pred)
conf_matrix = confusion_matrix(y_train['like'], y_pred)
class_report = classification_report(y_train['like'], y_pred)
roc_auc = roc_auc_score(y_train['like'], y_proba)

print(f"Accuracy: {accuracy:.2f}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)
print(f"ROC AUC: {roc_auc:.2f}")

Accuracy: 0.74
Confusion Matrix:
[[94724  1209]
 [32654  3416]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.99      0.85     95933
           1       0.74      0.09      0.17     36070

    accuracy                           0.74    132003
   macro avg       0.74      0.54      0.51    132003
weighted avg       0.74      0.74      0.66    132003

ROC AUC: 0.75


In [None]:
from xgboost import XGBRegressor

xg_reg = XGBRegressor(objective ='reg:squarederror', colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

In [None]:
xg_reg.fit(X_train, y_train['engage_time'])

In [None]:
y_pred = xg_reg.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error

rmse = np.sqrt(mean_squared_error(y_test['engage_time'], y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 1783203.3344699605


###Initial Neural Networks Experiments

In [None]:
#@title get training data
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

In [None]:
X_train.isna().sum()

user_likes                         0
user_dislikes                      0
user_engagetime                    0
content_guidance_scale             0
content_inference_steps            0
content_source_human_prompts       0
content_source_other               0
content_source_r/Showerthoughts    0
content_style_movie                0
content_style_other                0
dtype: int64

In [None]:
y_train.isna().sum()

like           0
dislike        0
engage_time    0
dtype: int64

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=20, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [None]:
model_engtime.fit(X_train, y_train['engage_time'], epochs=20, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4641587510655.143


In [None]:
4641441474730.845 ** 0.5

2154400.4907934004

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [None]:
#more complex model, after droping excess style features
model_engtime.fit(X_train, y_train['engage_time'], epochs=20, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4643218314273.779


In [None]:
4643218314273.779 ** 0.5

2154812.8258096525

In [None]:
#remove dropout
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    #Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [None]:
model_engtime.fit(X_train, y_train['engage_time'], epochs=40, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Mean Squared Error for 'engage_time': 4644100230095.7295


In [None]:
4644100230095.7295 ** 0.5

2155017.4547079033

In [None]:
X_train

Unnamed: 0_level_0,Unnamed: 1_level_0,user_likes,user_dislikes,user_engagetime,content_guidance_scale,content_inference_steps,content_source_human_prompts,content_source_other,content_source_r/Showerthoughts,content_style_movie,content_style_other
user_id,content_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
51,124105,-0.648106,0.304989,-0.392457,-1.502674,-2.460551,0.0,1.0,0.0,0.0,1.0
30,77269,-0.112571,-0.582658,-0.046227,-0.055706,0.406514,0.0,1.0,0.0,1.0,0.0
53,106302,-0.553372,0.779777,-0.163153,-0.417448,0.406514,1.0,0.0,0.0,0.0,1.0
64,72623,-0.787306,0.740368,-0.255457,0.667778,0.406514,0.0,1.0,0.0,1.0,0.0
89,41445,0.457765,0.226171,-0.407132,-0.055706,0.406514,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
59,81683,1.059033,-0.002778,3.162894,-0.055706,0.406514,0.0,1.0,0.0,0.0,1.0
62,101336,-0.733173,0.113573,-0.084124,0.667778,0.406514,1.0,0.0,0.0,0.0,1.0
100,92628,-0.000437,0.094806,-0.440601,2.476488,0.406514,1.0,0.0,0.0,0.0,1.0
103,48548,-0.139638,0.353782,-0.437761,0.667778,0.406514,0.0,1.0,0.0,1.0,0.0


In [None]:
#using he_normal for weight initialization and adding one dropout layer

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_initializer='he_normal'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(128, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu', kernel_initializer='he_normal'),
    Dense(32, activation='relu', kernel_initializer='he_normal'),
    Dense(1)  # Output layer with 1 neuron for regression
])

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [None]:
#using he_normal for weight initialization and adding one dropout layer
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=20, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4644681374124.586


In [None]:
4644681374124.586 ** 0.5

2155152.285599462

In [None]:
class DataCollectorExample(DataCollector):

    def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        # Filtering like/dislike engagements
        like_data = self.user_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_like_data = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Getting total likes for each user
        like_engagements = latest_like_data[(latest_like_data['engagement_value']==1)].copy()
        like_feature_df = like_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        like_feature_df.rename(columns={'engagement_value': 'user_likes'}, inplace=True)
        # Fill NaN values with 0 (users with no "like" engagements)
        like_feature_df['user_likes'].fillna(0, inplace=True)


        # Getting total dislikes for each user
        dislike_engagements = latest_like_data[(latest_like_data['engagement_value']==-1)].copy()
        dislike_feature_df = dislike_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        dislike_feature_df.rename(columns={'engagement_value': 'user_dislikes'}, inplace=True)
        # Fill NaN values with 0 (users with no "dislike" engagements)
        dislike_feature_df['user_dislikes'].fillna(0, inplace=True)

        # Getting average engage time for each user
        time_engagements = self.user_data[self.user_data['engagement_type'] == 'MillisecondsEngagedWith'].copy()
        engage_feature_df = time_engagements.groupby('user_id')['engagement_value'].mean().reset_index()
        engage_feature_df.rename(columns={'engagement_value': 'user_engagetime'}, inplace=True)
        # Fill NaN values with 0 (users with no engagment time data)
        engage_feature_df['user_engagetime'].fillna(0, inplace=True)

        feature_df = pd.merge(like_feature_df, dislike_feature_df , on='user_id', how='left')
        feature_df = pd.merge(feature_df, engage_feature_df , on='user_id', how='left')

        return feature_df, ['user_likes', 'user_dislikes', 'user_engagetime'], []


    def feature_generation_content(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates content features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """

        feature_df = self.generated_content_metadata.copy()

        # numerical feature 1: (average) guidance scale
        mean_engage = feature_df["guidance_scale"].mean()
        guide_df = feature_df.groupby('content_id')['guidance_scale'].mean().reset_index()
        guide_df = guide_df.rename(columns={'guidance_scale': 'content_guidance_scale'})
        feature_df = pd.merge(feature_df, guide_df, on='content_id', how='left')
        feature_df['content_guidance_scale'].fillna(mean_engage, inplace=True)

        # numerical feature 2: num inference steps
        mean_inf = feature_df["num_inference_steps"].mean()
        inf_df = feature_df.groupby('content_id')['num_inference_steps'].mean().reset_index()
        inf_df = inf_df.rename(columns={'num_inference_steps': 'content_inference_steps'})
        feature_df = pd.merge(feature_df, inf_df, on='content_id', how='left')
        feature_df['content_inference_steps'].fillna(mean_inf, inplace=True)


        # categorical feature 1: source
        feature_df['content_source'] = 'other'
        feature_df.loc[feature_df['source'] == 'human_prompts', 'content_source'] = 'human_prompts'
        feature_df.loc[feature_df['source'] == 'r/Showerthoughts', 'content_source'] = 'r/Showerthoughts'


        # categorical feature 2: artist style
        style_list = [
            'studio',
            'medieval',
            'anime',
            'kerry_james_marshall',
            'gta_v',
            'scifi',
            'van_gogh',
            'salvador_dali',
            'jean-michel_basquiat',
            'face_and_lighting'
        ]
        #style_list = ['movie', 'empty']
        feature_df['content_style'] = feature_df['artist_style']
        feature_df['content_style'].fillna("empty", inplace=True)
        feature_df.loc[feature_df['content_style'].str.startswith('movie:'), 'content_style'] = 'movie'
        feature_df.loc[~feature_df['content_style'].isin(style_list), 'content_style'] = 'other'


        return feature_df, ['content_inference_steps'], ['content_source', 'content_style']


    def get_Ys(self, engagement_data) -> pd.DataFrame:
        """Engineers taget variable that you are predicting.
        Args
            engagement_data (pd.DataFrame): Engagement data.
        Returns
            pd.DataFrame: Dataframe of 5 columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time'
        """
        # Filtering Like-type engagements
        like_data = engagement_data[engagement_data['engagement_type'] == 'Like']

        # Grouping by 'user_id' and 'content_id' and getting the latest engagement for each pair
        latest_engagements = like_data.sort_values('created_date').groupby(['user_id', 'content_id']).tail(1)

        # Creating the target DataFrame with unique pairs of user_id and content_id
        target_df = engagement_data[['user_id', 'content_id']].drop_duplicates()

        # Merging latest engagements to update 'like' and 'dislike' columns
        target_df = pd.merge(target_df, latest_engagements[['user_id', 'content_id', 'engagement_value']],
                            on=['user_id', 'content_id'], how='left')

        # Updating 'like' and 'dislike' columns based on the latest engagement values
        target_df['like'] = (target_df['engagement_value'] == 1).astype(int)
        target_df['dislike'] = (target_df['engagement_value'] == -1).astype(int)

        # Filling NaN values with 0 for pairs without like/dislike
        target_df.fillna(0, inplace=True)


        # Set "engage_time" based on engagement_type and engagement_value
        # assign existing engagement time if doesn't have that data, assign zero
        engage_times = engagement_data[engagement_data['engagement_type'] == 'MillisecondsEngagedWith']

        engage_times = engage_times.groupby(['user_id', 'content_id'])['engagement_value'].mean().reset_index()
        engage_times.rename(columns={'engagement_value': 'engage_time'}, inplace=True)

        target_df = pd.merge(target_df, engage_times[['user_id', 'content_id', 'engage_time']],
                            on=['user_id', 'content_id'], how='left')

        """
        # Grouping by 'user_id' and 'content_id' to get the earliest 'created_date' for each pair
        engage_times = engage_times.sort_values('created_date').groupby(['user_id', 'content_id']).head(1)
        """
        # Filling NaN values with 0 for pairs without engage_time
        target_df['engage_time'].fillna(0, inplace=True)

        # Select and rename the required columns
        target_df = target_df[['user_id', 'content_id', 'like', 'dislike', 'engage_time']].copy()

        return target_df


    def predict(self, X: pd.DataFrame) -> Tuple[list, list, list]:
        """Predicts the 3 target variables by using the model that you trained.
        Make sure you load the model properly.

        Args:
            X (pd.DataFrame): Feature dataframe with 2-level index of (user_id, content_id)

        Returns:
            (list, list, list): (predicted prbability of like,
                                 predicted probability of dislike,
                                 predicted engagement time)
        """

        model = self.load_model()

        pred_like = model['like'].predict(X)
        pred_dislike = model['dislike'].predict(X)
        pred_engtime = model['engage_time'].predict(X)

        return pred_like, pred_dislike, pred_engtime

    def rank(self,
             score_df: pd.DataFrame,
             user_id: int,
             content_ids: Optional[list] = None) -> list:

        """Ranks the items for a given user based on your own criteria.

        Args:
            score_df (pd.DataFrame): Predicted-score Dataframe of columns;
                'user_id', 'content_id', 'like', 'dislike', 'engage_time', and
                also columns for content metadata.
            user_id (int): User ID to rank the items for.
            content_ids (Optional[list]): List of content ids to be considered for ranking.
        """

        score_df = score_df[score_df['user_id'] == user_id]
        ranked_pred = score_df.sort_values('like', ascending=False)

        return ranked_pred['content_id'].tolist()

    def load_model(self) -> object:
        """Loads your model. Since different ML frameworks requires different
        ways to load the model. Change this to reflect your choice of framework.

        Returns:
            object: Model object
        """

        with open(f'{self.objects_dir}/model.pkl', 'rb') as f:
            return pickle.load(f)

In [None]:
#@title get training data
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

In [None]:
from tensorflow.keras.optimizers import Adam

# Assuming 'model' is your neural network model
# Define the optimizer with a custom learning rate
custom_optimizer = Adam(learning_rate=0.01)  # increasing from 0.001

# Compile the model with the custom optimizer
model_engtime.compile(optimizer=custom_optimizer, loss='mean_squared_error', metrics=['mae'])

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=20)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4651136217153.661


In [None]:
4651136217153.661 ** 0.5

2156649.3032372375

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1)  # Output layer with 1 neuron for regression
])

In [None]:
model_engtime.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
model_engtime.fit(X_train, y_train['engage_time'], epochs=20)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4646769610928.456


In [None]:
4646769610928.456 ** 0.5

2155636.7066202173

In [None]:
654562078735.6278 ** 0.5

809050.1089151573

In [None]:
engtime_predictions = model_engtime.predict(X_train)
engtime_mse = mean_squared_error(y_train['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Mean Squared Error for 'engage_time': 5424845287818.471


In [None]:
5424845287818.471 ** 0.5

2329129.7275631665

In [None]:
def create_model():
    model_engtime = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1)  # Output layer with 1 neuron for regression
    ])
    return model_engtime

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_engtime = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],), kernel_initializer='he_normal'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(128, activation='relu', kernel_initializer='he_normal'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu', kernel_initializer='he_normal'),
    Dense(32, activation='relu', kernel_initializer='he_normal'),
    Dense(1)  # Output layer with 1 neuron for regression
])

###Linear Regression & Weight Analysis to Experiment with different set of features

In [None]:
from sklearn.linear_model import LinearRegression
model_engtime = LinearRegression()
model_engtime.fit(X_train, y_train['engage_time'])

In [None]:
#all styles + removed guidance scale
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Mean Squared Error for 'engage_time': 4643915339660.62


In [None]:
4643915339660.62 ** 0.5

2154974.556615604

In [None]:
[(X_train.columns[i], model_engtime.coef_[i]) for i in range(len(X_train.columns))]

[('user_likes', -900.9417808815841),
 ('user_dislikes', 3264.2796532672055),
 ('user_engagetime', 80113.46133270074),
 ('content_inference_steps', 4890.11980082893),
 ('content_source_human_prompts', -12368.507781687658),
 ('content_source_other', 10169.872782901075),
 ('content_source_r/Showerthoughts', 2198.634998786587),
 ('content_style_anime', -18873.911162572975),
 ('content_style_face_and_lighting', 24380.03857542192),
 ('content_style_gta_v', -5393.9347958834915),
 ('content_style_jean-michel_basquiat', -21768.13053114205),
 ('content_style_kerry_james_marshall', -21130.875619138496),
 ('content_style_medieval', -15667.914659514465),
 ('content_style_other', 12337.891616038129),
 ('content_style_salvador_dali', 23247.262720825172),
 ('content_style_scifi', -22841.476578718433),
 ('content_style_studio', 13471.09843869714),
 ('content_style_van_gogh', 32239.95199598753)]

In [None]:
#limited styles
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Mean Squared Error for 'engage_time': 10814879893857.01


In [None]:
10814879893857.01 ** 0.5

3288598.4695394193

In [None]:
[(X_train.columns[i], model_engtime.coef_[i]) for i in range(len(X_train.columns))]

[('user_likes', -1208.5858164150204),
 ('user_dislikes', 1403.638294720462),
 ('user_engagetime', 93412.53732233719),
 ('content_guidance_scale', 281.6310374356142),
 ('content_inference_steps', -3271.8479810710683),
 ('content_source_human_prompts', 6990.243688509355),
 ('content_source_other', -8147.285661574587),
 ('content_source_r/Showerthoughts', 1157.0419730652204),
 ('content_style_empty', -16069.359290149534),
 ('content_style_movie', 28834.68492314835),
 ('content_style_other', -12765.32563299882)]

In [None]:
from tensorflow.keras.optimizers import Adam

# Assuming 'model' is your neural network model
# Define the optimizer with a custom learning rate
custom_optimizer = Adam(learning_rate=0.01)  # increasing from 0.001

# Compile the model with the custom optimizer
model_engtime.compile(optimizer=custom_optimizer, loss='mean_squared_error', metrics=['mae'])

In [None]:
#using he_normal for weight initialization and adding one dropout layer
model_engtime.fit(X_train, y_train['engage_time'], epochs=20, batch_size=32)
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Mean Squared Error for 'engage_time': 4643746094708.583


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Creating a Sequential model
model_like = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),  # Adding dropout for regularization
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')  # Output layer with 1 neuron for regression
])
model_like.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
model_like.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
#more complex model, after droping excess style features
model_like.fit(X_train, y_train['like'], epochs=20, batch_size=32)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7a4aae9912a0>

In [None]:
like_predictions = model_like.predict(X_test)


###RandomForestClassifier + RandomForestRegressor & tuning

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time'
model_engtime = RandomForestRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [None]:
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time'
model_engtime = RandomForestRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Accuracy for 'like': 0.7170615305948526
Accuracy for 'dislike': 0.7595875234361684
Mean Squared Error for 'engage_time': 1194319493893.0874


In [None]:
data_collector = DataCollectorExample(
    engagement_path='sample_data/engagement_train.csv',
    content_meta_path='sample_data/generated_content_metadata.csv'
    )

X_train = data_collector.gen_model_input(is_train=True)
y_train = data_collector.gen_target_vars()

# ensure that each row of y_train corresponds to the correct user-content in X_train
y_train = y_train.reindex(index=X_train.index)

In [None]:
# Simulates contents filtered from previous stage.
# Feel free to change this to reflect your previous stage.

sample_contents = content_meta['content_id'].sample(frac=0.01)
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5303030303030303, 'recall': 0.38461538461538464},
 'dislike': {'precision': 0.3148148148148148, 'recall': 0.22077922077922077},
 'engage_time': {'rmse': 28397.677221999707}}

In [None]:
# Simulates contents filtered from previous stage.
# Feel free to change this to reflect your previous stage.

sample_contents = content_meta['content_id'].sample(frac=0.01)
# Get true target variables
y_true = data_collector.gen_target_vars(engagement_test)

# Make predictions
y_pred = data_collector.score(content_ids = sample_contents)

thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.5483870967741935, 'recall': 0.37362637362637363},
 'dislike': {'precision': 0.36, 'recall': 0.23376623376623376},
 'engage_time': {'rmse': 16554.833079757926}}

In [None]:
feature_df = pd.merge(user_data[['user_id']].drop_duplicates(), user_feature_df, on='user_id', how='left')

        # Fill NaN values with 0 (users with no "like" engagements)
feature_df['user_feature_1'].fillna(1, inplace=True)
feature_df

Unnamed: 0,user_id,user_feature_1
0,51,-55.0
1,30,-192.0
2,53,191.0
3,64,63.0
4,89,339.0
...,...,...
105,76,1.0
106,105,0.0
107,116,1.0
108,98,1.0


In [None]:
like_engagements = user_data[user_data['engagement_type'] == 'Like']
user_feature_df = like_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
user_feature_df.rename(columns={'engagement_value': 'user_feature_1'}, inplace=True)
feature_df_1 = pd.merge(user_data[['user_id']].drop_duplicates(), user_feature_df, on='user_id', how='left')

# Fill NaN values with 0 (users with no "like" engagements)
feature_df_1['user_feature_1'].fillna(1, inplace=True)


engage_with_engagements = user_data[user_data['engagement_type'] == 'MillisecondsEngagedWith']
mean_engage=engage_with_engagements["engagement_value"].mean()
engage_feature_df = engage_with_engagements.groupby('user_id')['engagement_value'].mean().reset_index()
engage_feature_df.rename(columns={'engagement_value': 'user_feature_2'}, inplace=True)


feature_df = pd.merge(feature_df_1,engage_feature_df , on='user_id', how='left')
feature_df['user_feature_2'].fillna(mean_engage, inplace=True)
feature_df

Unnamed: 0,user_id,user_feature_1,user_feature_2
0,51,-55.0,5601.212146
1,30,-192.0,36020.518703
2,53,191.0,28672.626283
3,64,63.0,16648.687984
4,89,339.0,3784.363016
...,...,...,...
105,76,1.0,13531.000000
106,105,0.0,2442.153846
107,116,1.0,366.000000
108,98,1.0,822.000000


In [None]:
engage_with_engagements["engagement_value"].mean()

31211.370137232316

In [None]:
def feature_generation_user(self) -> Tuple[pd.DataFrame, List[str], List[str]]:
        """Generates user features. Keep all the categorical variables as is,
        since the one-hot encoding will be done by our own pipeline. Along with
        the feature dataframe, you'll need to output lists of numberical features
        and categorical features as well.

        Returns
          pd.DataFrame: User feature dataframe
          List[str]: List of numerical features. E.g. ['feat_1', 'feat_3, ...]
          List[str]: List of categorical features. E.g. ['feat_2', 'feat_4, ...]
        """
        like_engagements = self.user_data[self.user_data['engagement_type'] == 'Like']
        user_feature_df = like_engagements.groupby('user_id')['engagement_value'].sum().reset_index()
        user_feature_df.rename(columns={'engagement_value': 'user_feature_1'}, inplace=True)
        feature_df_1 = pd.merge(user_data[['user_id']].drop_duplicates(), user_feature_df, on='user_id', how='left')

        # Fill NaN values with 0 (users with no "like" engagements)
        feature_df_1['user_feature_1'].fillna(1, inplace=True)


        engage_with_engagements = self.user_data[self.user_data['engagement_type'] == 'MillisecondsEngagedWith']
        mean_engage=engage_with_engagements["engagement_value"].mean()
        engage_feature_df = engage_with_engagements.groupby('user_id')['engagement_value'].mean().reset_index()
        engage_feature_df.rename(columns={'engagement_value': 'user_feature_2'}, inplace=True)


        feature_df = pd.merge(feature_df_1,engage_feature_df , on='user_id', how='left')
        feature_df['user_feature_2'].fillna(mean_engage, inplace=True)


        return feature_df, ['user_feature_1', 'user_feature_2'], []

###GradientBoostingRegressor

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time'
model_engtime = GradientBoostingRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Accuracy for 'like': 0.769728992670871
Accuracy for 'dislike': 0.8031361854440089
Mean Squared Error for 'engage_time': 10851745196166.953


In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.6779661016949152, 'recall': 0.40404040404040403},
 'dislike': {'precision': 0.6764705882352942, 'recall': 0.3026315789473684},
 'engage_time': {'rmse': 100246.68841042921}}

###Decision Tree Regressor

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
import pandas as pd
import numpy as np
import pickle

# Assuming X_train and y_train are your feature and target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Model for 'like'
model_like = RandomForestClassifier(random_state=42)
model_like.fit(X_train, y_train['like'])
like_predictions = model_like.predict(X_test)
like_accuracy = accuracy_score(y_test['like'], like_predictions)
print(f"Accuracy for 'like': {like_accuracy}")

# Model for 'dislike' (assuming binary classification)
model_dislike = RandomForestClassifier(random_state=42)
model_dislike.fit(X_train, y_train['dislike'])
dislike_predictions = model_dislike.predict(X_test)
dislike_accuracy = accuracy_score(y_test['dislike'], dislike_predictions)
print(f"Accuracy for 'dislike': {dislike_accuracy}")

# Model for 'engage_time' using Decision Tree Regressor
model_engtime = DecisionTreeRegressor(random_state=42)
model_engtime.fit(X_train, y_train['engage_time'])
engtime_predictions = model_engtime.predict(X_test)
engtime_mse = mean_squared_error(y_test['engage_time'], engtime_predictions)
print(f"Mean Squared Error for 'engage_time': {engtime_mse}")

# Save the models to a file
model = {
    'like': model_like,
    'dislike': model_dislike,
    'engage_time': model_engtime
}

with open('sample_data/model.pkl', 'wb') as f:
    pickle.dump(model, f)

Accuracy for 'like': 0.769728992670871
Accuracy for 'dislike': 0.8031361854440089
Mean Squared Error for 'engage_time': 10891565781167.445


In [None]:
thres_like = 0.5
thres_dislike = 0.5
evaluate(y_true, y_pred, thres_like, thres_dislike)

{'like': {'precision': 0.7741935483870968, 'recall': 0.45714285714285713},
 'dislike': {'precision': 0.6136363636363636, 'recall': 0.30337078651685395},
 'engage_time': {'rmse': 299971.85326031374}}

In [None]:
y_train['engage_time'].describe()

count    1.466710e+05
mean     4.547468e+04
std      2.659561e+06
min      0.000000e+00
25%      7.360000e+02
50%      1.517000e+03
75%      2.814000e+03
max      5.136718e+08
Name: engage_time, dtype: float64