In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()

import re
import nltk
import string
from textblob import TextBlob
from collections import defaultdict
from collections import Counter
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from sklearn.metrics.pairwise import cosine_similarity
from numpy import dot
from numpy.linalg import norm
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, precision_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier,  GradientBoostingClassifier
import lightgbm as lgb
import pickle

import warnings
warnings.filterwarnings('ignore')

In [98]:
content = pd.read_csv("../services/backend/seed_data/data/nov_19_dump/content.csv")
engagement = pd.read_csv("../services/backend/seed_data/data/nov_19_dump/engagement.csv")
gcm_ = pd.read_csv("../services/backend/seed_data/data/nov_19_dump/generated_content_metadata.csv")
target = pd.read_csv("../services/backend/processed_data/target.csv")

## Create a User Feature Table

<li> For the prediction part, we can easily retrieve the table for further feature engineering </li>

In [116]:
# User Features
def convert_embedding(row):
    if row["engagement_value"] == 0:
        row["prompt_embedding"] = [x * -1 for x in row["prompt_embedding"]]
    return row["prompt_embedding"]


def user_embedding(gcm,target):
    gcm_tmp = pd.merge(gcm[["content_id","prompt_embedding"]],target,on = "content_id")
    gcm_tmp["embedding_new"] = gcm_tmp[["prompt_embedding","engagement_value"]].apply(convert_embedding,axis=1)
    ## Explode list to multiple columns
    usr_embed = pd.DataFrame(gcm_tmp["embedding_new"].tolist())
    usr_embed = usr_embed.add_prefix('user_embed_')
    usr_embed = pd.concat([gcm_tmp["user_id"],usr_embed],axis=1)
    # Get the mean content embedding as the user embedding 
    usr_embed = usr_embed.groupby("user_id").mean().reset_index()
    ## Get embedding combined into one column to further calculate cosine similarity
    usr_embed['embed_combined']= usr_embed.drop("user_id",axis=1).values.tolist()
    return usr_embed[['user_id', 'embed_combined']]

def add_user_features(target,usr_embed):
    ## Add user like rate and average engagement time features
    user_info1 = target.groupby("user_id").mean()["engagement_value"].rename("User_Like_Rate").reset_index()
    user_info2 = engagement[engagement["engagement_type"] == "MillisecondsEngagedWith"]\
                .groupby("user_id").mean()["engagement_value"].rename("User_Avg_Time_Millisec").reset_index()

    user_info = pd.merge(user_info1,user_info2,how = "left", on="user_id")
    user_info = pd.merge(user_info,usr_embed, on = "user_id")#.drop("embed_combined",axis=1)
    return user_info


In [113]:
# convert string into corresponding embedding list
gcm = gcm_.copy()
gcm["prompt_embedding"] = gcm["prompt_embedding"].apply(lambda x:eval(x))

prompt_embedding = np.array(gcm['prompt_embedding'].tolist())
prompt_embedding = prompt_embedding[:,:32].tolist()
gcm["prompt_embedding"] = prompt_embedding

In [117]:
usr_embed = user_embedding(gcm,target)
user_info = add_user_features(target,usr_embed)

In [120]:
user_info.to_parquet("../services/backend/processed_data/User_Features.parquet",index=False)

In [7]:
user_info.to_csv("User_Features.csv",index=False)

NameError: name 'user_info' is not defined

## Create a Item Feature Table

In [121]:
## Clean the text
def decontraction(phrase):
    phrase = re.sub(r"won\'t", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    phrase = re.sub(r"he's", "he is", phrase)
    phrase = re.sub(r"there's", "there is", phrase)
    phrase = re.sub(r"We're", "We are", phrase)
    phrase = re.sub(r"That's", "That is", phrase)
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"they're", "they are", phrase)
    phrase = re.sub(r"Can't", "Cannot", phrase)
    phrase = re.sub(r"wasn't", "was not", phrase)
    phrase = re.sub(r"don\x89Ûªt", "do not", phrase)
    phrase = re.sub(r"donãât", "do not", phrase)
    phrase = re.sub(r"aren't", "are not", phrase)
    phrase = re.sub(r"isn't", "is not", phrase)
    phrase = re.sub(r"What's", "What is", phrase)
    phrase = re.sub(r"haven't", "have not", phrase)
    phrase = re.sub(r"hasn't", "has not", phrase)
    phrase = re.sub(r"There's", "There is", phrase)
    phrase = re.sub(r"He's", "He is", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"You're", "You are", phrase)
    phrase = re.sub(r"I'M", "I am", phrase)
    phrase = re.sub(r"shouldn't", "should not", phrase)
    phrase = re.sub(r"wouldn't", "would not", phrase)
    phrase = re.sub(r"i'm", "I am", phrase)
    phrase = re.sub(r"I\x89Ûªm", "I am", phrase)
    phrase = re.sub(r"I'm", "I am", phrase)
    phrase = re.sub(r"Isn't", "is not", phrase)
    phrase = re.sub(r"Here's", "Here is", phrase)
    phrase = re.sub(r"you've", "you have", phrase)
    phrase = re.sub(r"you\x89Ûªve", "you have", phrase)
    phrase = re.sub(r"we're", "we are", phrase)
    phrase = re.sub(r"what's", "what is", phrase)
    phrase = re.sub(r"couldn't", "could not", phrase)
    phrase = re.sub(r"we've", "we have", phrase)
    phrase = re.sub(r"it\x89Ûªs", "it is", phrase)
    phrase = re.sub(r"doesn\x89Ûªt", "does not", phrase)
    phrase = re.sub(r"It\x89Ûªs", "It is", phrase)
    phrase = re.sub(r"Here\x89Ûªs", "Here is", phrase)
    phrase = re.sub(r"who's", "who is", phrase)
    phrase = re.sub(r"I\x89Ûªve", "I have", phrase)
    phrase = re.sub(r"y'all", "you all", phrase)
    phrase = re.sub(r"can\x89Ûªt", "cannot", phrase)
    phrase = re.sub(r"would've", "would have", phrase)
    phrase = re.sub(r"it'll", "it will", phrase)
    phrase = re.sub(r"we'll", "we will", phrase)
    phrase = re.sub(r"wouldn\x89Ûªt", "would not", phrase)
    phrase = re.sub(r"We've", "We have", phrase)
    phrase = re.sub(r"he'll", "he will", phrase)
    phrase = re.sub(r"Y'all", "You all", phrase)
    phrase = re.sub(r"Weren't", "Were not", phrase)
    phrase = re.sub(r"Didn't", "Did not", phrase)
    phrase = re.sub(r"they'll", "they will", phrase)
    phrase = re.sub(r"they'd", "they would", phrase)
    phrase = re.sub(r"DON'T", "DO NOT", phrase)
    phrase = re.sub(r"That\x89Ûªs", "That is", phrase)
    phrase = re.sub(r"they've", "they have", phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"should've", "should have", phrase)
    phrase = re.sub(r"You\x89Ûªre", "You are", phrase)
    phrase = re.sub(r"where's", "where is", phrase)
    phrase = re.sub(r"Don\x89Ûªt", "Do not", phrase)
    phrase = re.sub(r"we'd", "we would", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"weren't", "were not", phrase)
    phrase = re.sub(r"They're", "They are", phrase)
    phrase = re.sub(r"Can\x89Ûªt", "Cannot", phrase)
    phrase = re.sub(r"you\x89Ûªll", "you will", phrase)
    phrase = re.sub(r"I\x89Ûªd", "I would", phrase)
    phrase = re.sub(r"let's", "let us", phrase)
    phrase = re.sub(r"it's", "it is", phrase)
    phrase = re.sub(r"can't", "cannot", phrase)
    phrase = re.sub(r"don't", "do not", phrase)
    phrase = re.sub(r"you're", "you are", phrase)
    phrase = re.sub(r"i've", "I have", phrase)
    phrase = re.sub(r"that's", "that is", phrase)
    phrase = re.sub(r"i'll", "I will", phrase)
    phrase = re.sub(r"doesn't", "does not",phrase)
    phrase = re.sub(r"i'd", "I would", phrase)
    phrase = re.sub(r"didn't", "did not", phrase)
    phrase = re.sub(r"ain't", "am not", phrase)
    phrase = re.sub(r"you'll", "you will", phrase)
    phrase = re.sub(r"I've", "I have", phrase)
    phrase = re.sub(r"Don't", "do not", phrase)
    phrase = re.sub(r"I'll", "I will", phrase)
    phrase = re.sub(r"I'd", "I would", phrase)
    phrase = re.sub(r"Let's", "Let us", phrase)
    phrase = re.sub(r"you'd", "You would", phrase)
    phrase = re.sub(r"It's", "It is", phrase)
    phrase = re.sub(r"Ain't", "am not", phrase)
    phrase = re.sub(r"Haven't", "Have not", phrase)
    phrase = re.sub(r"Could've", "Could have", phrase)
    phrase = re.sub(r"youve", "you have", phrase)  
    phrase = re.sub(r"donå«t", "do not", phrase)
    return phrase

def remove_punctuations(text):
    for punctuation in list(string.punctuation):
        text = text.replace(punctuation, '')
    return text
    


def clean_text(text):
    text = decontraction(text)
    text = text.lower()
    text = remove_punctuations(text)
    return text


## Get tabular feature and sentiment of the text
def text_features(df,col):
    df["num_words"] = df[col].apply(lambda x: len(str(x).split()))
    df["num_chars"] = df[col].apply(lambda x: len(str(x)))
    df["num_stopwords"] = df[col].apply(lambda x: len([w for w in str(x).lower().split() if w in stopwords.words('english')]))
    df["num_punctuations"] = df[col].apply(lambda x: len([c for c in str(x) if c in list(string.punctuation)]))
    df["mean_word_len"] = df[col].apply(lambda x: np.mean([len(w) for w in str(x).split()]))
    #Sentiment: lies between [-1,1], -1 defines a negative sentiment and 1 defines a positive sentiment.
    df["polarity"] = df[col].apply(lambda x: TextBlob(x).sentiment[0])
    #The higher subjectivity means that the text contains personal opinion rather than factual information.
    df["subjectivity"] = df[col].apply(lambda x: TextBlob(x).sentiment[1])
    return df

def add_content_like(gcm,target):
    content_like = target.groupby("content_id").mean()["engagement_value"].rename("Like Rate").reset_index()
    avg_like_rt = content_like["Like Rate"].mean()
    slt_content = target.groupby("content_id").count()["user_id"].rename("expose_cnt").reset_index()
    slt_content = slt_content[slt_content["expose_cnt"]>1].drop("expose_cnt",axis=1)
    content_like = pd.merge(content_like, slt_content, how = "left")
    # Use avg like rate to fill the content exposed to only one user
    content_like = content_like.fillna(avg_like_rt)
    # Join two dataframes to get the general like rate of the content
    gcm = pd.merge(gcm,content_like,on="content_id",how = "left") # Keep the NA without filling the avg like rate
    return gcm

def combine_content(gcm,gcm_useful_cols):
    ## Explode content embedding list to multiple columns
    content_embed = pd.DataFrame(gcm["prompt_embedding"].tolist())
    content_embed = content_embed.add_prefix('content_embed_')
    content_embed = pd.concat([gcm["content_id"],content_embed],axis=1).drop_duplicates()
    # Join two dataframes to get full content features
    content_features = pd.merge(gcm[gcm_useful_cols],content_embed, on = "content_id", how="left")
    return content_features


In [122]:
## Clean the text
gcm['original_prompt'] = gcm['original_prompt'].apply(clean_text)
## Get text features
gcm = text_features(gcm, "original_prompt")
gcm = add_content_like(gcm,target)

gcm_useful_cols = ['content_id', 'num_inference_steps', 'guidance_scale','source',
                   'num_words','num_chars', 'num_stopwords', 'num_punctuations',
                   'mean_word_len', 'polarity', 'subjectivity','Like Rate', 'prompt_embedding'] # Keep prompt_embedding for further cos similarity calculation

In [123]:
content_features = combine_content(gcm,gcm_useful_cols)

In [146]:
dft = content_features.iloc[:, :13]
# dft = dft.drop('source', axis=1)
# dft = dft.drop('prompt_embedding', axis=1)
# dft.iloc[:,-32:] = dft.iloc[:,-32:].astype('float16')
dft.to_parquet('../services/backend/processed_data/Content_Features.parquet', index=False)

In [42]:
content_features_cols = content_features.columns.tolist()[:12]
user_features_cols = user_features.columns.tolist()[:3]

## Build feature generation pipeline

In [63]:
def gen_cosine(user_id,content_id,user_features,content_features):
    
    slct_user_feature = user_features[user_features["user_id"]== user_id]
    slct_content_feature = content_features[content_features["content_id"]== content_id]
    user_embedding_lst = slct_user_feature["embed_combined"].tolist()[0]
    content_feature_lst = slct_content_feature["prompt_embedding"].tolist()[0]
    a,b = user_embedding_lst, content_feature_lst
    cosine_similarity = dot(a, b)/(norm(a)*norm(b))
    
    return cosine_similarity

In [60]:
all_users = engagement.user_id.unique().tolist()
all_contents = content.id.unique().tolist()
from itertools import product

user_item_df = pd.DataFrame(list(product(all_users, all_contents)), columns = ['user_id', 'content_id'])

In [185]:
def generate_features(user_id,content_id,user_features,content_features):#,user_path= "User_Features.parquet",content_path = "Content_Features.parquet" ):
    slct_user_feature = user_features[user_features["user_id"]== user_id]
    slct_content_feature = content_features[content_features["content_id"]== content_id]
    user_embedding_lst = slct_user_feature["embed_combined"].tolist()[0]
    content_feature_lst = slct_content_feature["prompt_embedding"].tolist()[0]
    a,b = user_embedding_lst, content_feature_lst
    cosine_similarity = dot(a, b)/(norm(a)*norm(b))
    full_features = slct_user_feature.drop(["user_id","embed_combined"],axis=1).values.tolist()[0]\
                    + slct_content_feature.drop(["content_id","source","prompt_embedding"],axis=1).values.tolist()[0] \
                    + [cosine_similarity]

    
    return full_features

In [200]:
user_path = "../services/backend/processed_data/User_Features.parquet"
content_path = "../services/backend/processed_data/Content_Features.parquet"
user_features_ = pd.read_parquet(user_path)
content_features_ = pd.read_parquet(content_path)

In [201]:
user_features_ = pd.concat([user_features_, pd.DataFrame(user_features_["embed_combined"].tolist()).add_prefix('emb_')], axis=1)
content_features_ = pd.concat([content_features_, pd.DataFrame(content_features_["prompt_embedding"].tolist()).add_prefix('emb_')],axis=1)

In [206]:
content_features_.shape

(107705, 45)

In [64]:
cosine_df = user_item_df[["user_id","content_id"]].apply(lambda row: gen_cosine(row["user_id"],row["content_id"],user_features,content_features),axis=1)


KeyboardInterrupt



In [169]:
train = pd.read_csv("../services/backend/processed_data/train.csv")
test = pd.read_csv("../services/backend/processed_data/test.csv")

In [211]:
X_train = train[["user_id","content_id"]].apply(lambda row: generate_features(row["user_id"],row["content_id"],user_features_,content_features_),axis=1)
X_train = pd.DataFrame(X_train.tolist())

In [212]:
y_train = train["engagement_value"]

In [213]:
X_test = test[["user_id","content_id"]].apply(lambda row: generate_features(row["user_id"],row["content_id"],user_features_,content_features_),axis=1)
X_test = pd.DataFrame(X_test.tolist())

In [214]:
y_test = test["engagement_value"]

In [216]:
model = lgb.LGBMClassifier()
model.fit(X_train, y_train)

In [217]:
model.predict_proba(X_test)

array([[9.01978917e-05, 9.99909802e-01],
       [9.99970595e-01, 2.94049651e-05],
       [7.47890756e-01, 2.52109244e-01],
       ...,
       [9.77124234e-01, 2.28757663e-02],
       [9.99975656e-01, 2.43440029e-05],
       [9.99954244e-01, 4.57559627e-05]])

In [218]:
#lgb.save(model, '../services/backend/src/recommendation_system/ml_models/lgbm.pkl', num_iteration = NULL)
import joblib
# save model
joblib.dump(model, '../services/backend/src/recommendation_system/ml_models/lgbm.pkl')
# # load model
# gbm_pickle = joblib.load('lgb.pkl')

['../services/backend/src/recommendation_system/ml_models/lgbm.pkl']

In [219]:
model_ = joblib.load('../services/backend/src/recommendation_system/ml_models/lgbm.pkl')

In [220]:
model_.n_features_

77

In [215]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,0.437773,89446.786957,6.728266,-2.205232,-0.935654,0.414398,-0.722790,-0.170461,-0.396117,-0.478514,...,-5.375868,-15.747827,10.470771,0.691198,-2.789990,1.827842,-0.277631,1.139680,3.945075,-0.574410
1,0.513393,3834.459649,10.334824,-3.120607,-0.430606,-0.189135,-1.491082,-1.161160,-0.883617,1.476626,...,-13.424211,-14.681874,1.536202,2.356138,5.213597,5.147564,7.545617,5.880928,-2.953583,-0.545915
2,0.347561,10888.293103,4.592510,-3.190515,-1.007697,3.566104,-0.926311,0.800057,-1.553255,0.150828,...,2.404554,-7.091844,-5.271067,-4.057229,-2.413814,0.685560,14.102049,15.216167,2.129391,-0.436742
3,0.355908,58446.510204,9.826862,-1.359778,1.603947,1.285903,-1.549141,1.407438,-0.527707,-1.540423,...,6.423792,0.420492,9.965358,-4.019853,17.191104,1.620610,9.487975,2.405741,9.072886,0.547553
4,0.482927,32380.040541,17.349594,1.228321,-0.375397,-2.452161,-3.844825,-0.479502,-2.034023,-1.296189,...,-1.284498,9.315584,-10.445529,11.428464,0.595655,1.485687,-1.503689,2.343183,0.056319,-0.736785
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19463,0.437773,89446.786957,6.728266,-2.205232,-0.935654,0.414398,-0.722790,-0.170461,-0.396117,-0.478514,...,-9.846691,-2.671703,-0.628226,18.510978,-10.815182,-0.016847,-7.861359,-7.371767,-34.196477,0.632159
19464,0.181338,3442.966292,1.844459,2.867678,2.187282,4.048394,-1.494160,0.845086,1.209595,-0.510142,...,4.637885,3.869178,2.353058,-4.430796,3.717093,-6.779565,9.317777,-3.288596,-0.027603,-0.035675
19465,0.500595,4623.850480,14.910786,-4.786874,-0.135374,0.845853,-5.201098,-0.397249,-0.755364,-1.867080,...,-2.323619,-22.983434,19.214541,8.592070,-6.565695,9.690675,-4.462943,-9.763791,-11.033206,-0.207197
19466,0.464844,5210.244186,17.172745,-4.287327,-0.035162,0.498981,-2.851425,0.040663,-0.839526,-0.143111,...,19.401534,15.638111,-15.014686,11.238390,-13.355906,-1.986952,13.559861,-19.614411,-4.226408,0.204496
