In [22]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt

# PIPELINES FOR DEPLOYMENT

# 1. Cosine Similatiry Matrix from transcripts

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Create tfidf matrix: create vectors from video_transcripts
def create_tfidf_matrix(df):
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['clean_text'])
    
    return tfidf_vectorizer, tfidf_matrix

# Create cosine_sim_matrix: matrix based on similarity of tfidf vectors
def calculate_cosine_similarity(tfidf_matrix, video_ids):
    cosine_sim_matrix = cosine_similarity(tfidf_matrix)
    cosine_sim_triangular = np.triu(cosine_sim_matrix, k=0)
    cosine_sim_df = pd.DataFrame(cosine_sim_triangular, index=video_ids, columns=video_ids)
    return cosine_sim_df

In [15]:
# Load dataframe
df_content = pd.read_csv('../data/clean_data/df_content.csv')
df = df_content.copy()

# Create tfidf model
tfidf_vectorizer, tfidf_matrix = create_tfidf_matrix(df)

# Create cosine_sim matrix from video_transcripts
cosine_sim_matrix = calculate_cosine_similarity(tfidf_matrix, df['video_id'])

cosine_sim_matrix.head(5)

video_id,qtlUwwtvuEg,QaoDXYYtgK0,PqDwddEHswU,B-Y7rnOa43w,vyit-1zKsZ4,MWNqE4z2C34,SWeQO2SEOmQ,pRC4M5Na3jM,ax-3m4GSa1c,_8fDMmnDyYQ,...,I7Uqlv1WX18,qvbZmxUbGaE,Y2FOUg_jo7k,9gmeVAmsTW4,uuh7spVdf0c,Bm6CAjVtrIw,atiYXm7JZv0,plKAsDIiEJo,ZQazWxegNm8,CkG15bX4z90
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
qtlUwwtvuEg,1.0,0.065733,0.061912,0.151126,0.09574,0.133286,0.032403,0.026269,0.132834,0.195527,...,0.011048,0.049032,0.048195,0.025302,0.137597,0.121548,0.089471,0.020735,0.00927,0.091266
QaoDXYYtgK0,0.0,1.0,0.065913,0.060257,0.197272,0.111148,0.089107,0.037385,0.097818,0.105595,...,0.023071,0.044115,0.054087,0.039808,0.192708,0.070982,0.141533,0.010328,0.12358,0.048784
PqDwddEHswU,0.0,0.0,1.0,0.045326,0.143175,0.139038,0.049364,0.053683,0.12813,0.174537,...,0.001226,0.065881,0.097605,0.025204,0.134635,0.142591,0.405382,0.029464,0.053422,0.033756
B-Y7rnOa43w,0.0,0.0,0.0,1.0,0.050299,0.042735,0.033021,0.014589,0.130971,0.222227,...,0.0,0.052008,0.037969,0.015327,0.072554,0.01742,0.071598,0.007942,0.021755,0.026093
vyit-1zKsZ4,0.0,0.0,0.0,0.0,1.0,0.177629,0.08444,0.072475,0.17387,0.116326,...,0.018901,0.065463,0.098764,0.060736,0.22921,0.113786,0.200721,0.061007,0.15318,0.066519


# 2. Sentiment Score from comments

In [16]:
# Load the data output from the Emotion Analysis Score
df_emotion_score = pd.read_csv("../data/clean/df_videoid_emotionscore.csv/part-00000-1f04c136-e788-4899-92fa-efb423e4fcd4-c000.csv", header=None)

# Rename columns
df_emotion_score.rename(columns={0: 'video_id', 1: 'weighted_sentiment'}, inplace=True)

In [17]:
# Merge the transcript and emotion_score dataframes
df_merged = df_content.merge(df_emotion_score, on='video_id', how='left')

# Set weighted_sentiment as '0' for videos without comments
df_merged['weighted_sentiment'] = df_merged['weighted_sentiment'].fillna(float(0))

In [18]:
# Set the video_id as index
df_sentiment_scores = df_merged.set_index('video_id')['weighted_sentiment']

# Re-index as per cosine_sim_matrix to have the same order, filling the videos_id without sentiment score as '0' netural
sentiment_scores = df_sentiment_scores.reindex(cosine_sim_matrix.index, fill_value=0)

# Multiply sentiment_score and its transpose by cosine_sim_matrix
sentiment_product_matrix = (1 + sentiment_scores) * cosine_sim_matrix

# Create symmetrical matrix for better visualization
symm_matrix = sentiment_product_matrix + sentiment_product_matrix.T
np.fill_diagonal(symm_matrix.values, 0)

symm_matrix.head(5)

video_id,qtlUwwtvuEg,QaoDXYYtgK0,PqDwddEHswU,B-Y7rnOa43w,vyit-1zKsZ4,MWNqE4z2C34,SWeQO2SEOmQ,pRC4M5Na3jM,ax-3m4GSa1c,_8fDMmnDyYQ,...,I7Uqlv1WX18,qvbZmxUbGaE,Y2FOUg_jo7k,9gmeVAmsTW4,uuh7spVdf0c,Bm6CAjVtrIw,atiYXm7JZv0,plKAsDIiEJo,ZQazWxegNm8,CkG15bX4z90
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
qtlUwwtvuEg,0.0,0.05985,0.078945,0.24232,0.1104,0.140574,0.032403,0.026779,0.261482,0.19796,...,0.013288,0.058231,0.080977,0.025302,0.137597,0.162798,0.159532,0.020735,0.011555,0.091266
QaoDXYYtgK0,0.05985,0.0,0.084047,0.096618,0.227481,0.117226,0.089107,0.038109,0.192553,0.106908,...,0.027748,0.052392,0.090877,0.039808,0.192708,0.095072,0.252362,0.010328,0.154043,0.048784
PqDwddEHswU,0.078945,0.084047,0.0,0.072678,0.1651,0.146641,0.049364,0.054724,0.252223,0.176708,...,0.001474,0.078242,0.163995,0.025204,0.134635,0.190982,0.722823,0.029464,0.06659,0.033756
B-Y7rnOa43w,0.24232,0.096618,0.072678,0.0,0.058001,0.045071,0.033021,0.014871,0.257814,0.224992,...,0.0,0.061765,0.063796,0.015327,0.072554,0.023332,0.127664,0.007942,0.027117,0.026093
vyit-1zKsZ4,0.1104,0.227481,0.1651,0.058001,0.0,0.187342,0.08444,0.073881,0.342262,0.117773,...,0.022733,0.077745,0.165942,0.060736,0.22921,0.152402,0.357899,0.061007,0.190939,0.066519


# 3. Clustering from video stats

In [28]:
# Load the info
cluster_scores = pd.read_csv('../data/clean_data/df_clusters.csv', index_col=0)

In [29]:
cluster_scores['cluster'].nunique()

28

In [None]:
# Reindex as per cosine_sim_matrix
cluster_scores = cluster_scores.set_index('video_id')['cluster']

# Fill empty values to '-1' cluster (videos not belonging to cluster 0 or 1)
df_cluster_scores = cluster_scores.reindex(cosine_sim_matrix.index, fill_value=-1)

# Create a boolean mask where the cluster is the same for both videos
same_cluster_mask = df_cluster_scores.values[:, None] == df_cluster_scores.values

# Set '0' for False and '1.2' (cluster_boost) for True values
boost_factor_matrix = np.where(same_cluster_mask, 1.2, 1)

# Multiply the consine_sim_matrix by the boost_factor_matrix
final_score_matrix = symm_matrix * boost_factor_matrix

# Display the Model
final_score_matrix.head(5)

video_id,qtlUwwtvuEg,QaoDXYYtgK0,PqDwddEHswU,B-Y7rnOa43w,vyit-1zKsZ4,MWNqE4z2C34,SWeQO2SEOmQ,pRC4M5Na3jM,ax-3m4GSa1c,_8fDMmnDyYQ,...,I7Uqlv1WX18,qvbZmxUbGaE,Y2FOUg_jo7k,9gmeVAmsTW4,uuh7spVdf0c,Bm6CAjVtrIw,atiYXm7JZv0,plKAsDIiEJo,ZQazWxegNm8,CkG15bX4z90
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
qtlUwwtvuEg,0.0,0.05985,0.078945,0.24232,0.13248,0.168689,0.032403,0.026779,0.261482,0.19796,...,0.013288,0.069878,0.080977,0.025302,0.137597,0.162798,0.159532,0.020735,0.013866,0.091266
QaoDXYYtgK0,0.05985,0.0,0.084047,0.096618,0.227481,0.117226,0.089107,0.038109,0.231064,0.106908,...,0.027748,0.052392,0.090877,0.039808,0.23125,0.114086,0.302835,0.010328,0.154043,0.05854
PqDwddEHswU,0.078945,0.084047,0.0,0.072678,0.1651,0.146641,0.049364,0.054724,0.252223,0.176708,...,0.001769,0.078242,0.163995,0.030244,0.134635,0.190982,0.722823,0.029464,0.06659,0.033756
B-Y7rnOa43w,0.24232,0.096618,0.072678,0.0,0.058001,0.045071,0.033021,0.014871,0.257814,0.224992,...,0.0,0.061765,0.076555,0.015327,0.072554,0.023332,0.127664,0.007942,0.027117,0.026093
vyit-1zKsZ4,0.13248,0.227481,0.1651,0.058001,0.0,0.224811,0.08444,0.073881,0.342262,0.117773,...,0.022733,0.093294,0.165942,0.060736,0.22921,0.152402,0.357899,0.061007,0.229126,0.066519


# 4. Save ML Model as 'joblib' for deployment

In [None]:
# Save the final_score_matrix dataframe for deployment

# joblib.dump(final_score_matrix, '../docker_app/final_score_matrix.joblib')

NameError: name 'joblib' is not defined

# 5. Simulate app output

In [None]:
import random
import joblib

# Load dataframe
final_matrix = joblib.load('../docker_app/final_score_matrix.joblib')

# Set a random video as the first one
initial_video = random.choice(final_matrix.index)

# Recommend the top 10 videos
top_10_videos = final_matrix[initial_video].sort_values(ascending=False)[:10]

print(f'Initial video: {initial_video}\n')
print(f'Top 10 recommended videos: {top_10_videos}')

Initial video: pMMK6lQvBW8

Top 10 recommended videos: video_id
noVZqGc2hsU    0.685038
jSxCWLJt0wY    0.643959
Aof4BxK0UlY    0.609103
S94VK0nspzY    0.608532
yIRL4xtmXE4    0.602002
PZQMyj-9z-w    0.554031
lEvoMBwD49M    0.542494
2_8kgV-EfTA    0.497903
qpoRO378qRY    0.480539
tjYvx_xidF4    0.474198
Name: pMMK6lQvBW8, dtype: float64


# Future steps: model evaluation

Precision@K

$$
\text{Precision@K} = \frac{\text{Número de elementos relevantes en el top K}}{K}
$$


Recall@K
$$
\text{Recall@K} = \frac{\text{Número de elementos relevantes en el top K}}{\text{Número total de elementos relevantes}}
$$


F1-Score@K
$$
\text{F1-Score@K} = 2 \times \frac{\text{Precision@K} \times \text{Recall@K}}{\text{Precision@K} + \text{Recall@K}}
$$
