In [11]:
import pandas as pd
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

# Load Clean Data

In [12]:
folder = '../data/clean_data/'

# All CSV inputs are read in one pass; the tuple on the left is matched
# positionally with the paths on the right.
df_categories, df_transcript, df_videos, df_channel, df_content = (
    pd.read_csv(csv_path)
    for csv_path in (
        f'{folder}df_categories.csv',
        f'{folder}df_transcript_original.csv',
        f'{folder}df_video_data.csv',
        f'{folder}df_channel_data.csv',
        f'{folder}df_content.csv',
    )
)

# The comments table is the only input stored as parquet.
df_comments = pd.read_parquet(f'{folder}df_comments_video.parquet')

In [13]:
# Preview the first rows of df_content to check the loaded columns.
df_content.head()

Unnamed: 0,video_id,original_language,transcription,language,clean_text,comment_list,sentiment
0,qtlUwwtvuEg,English (auto-generated),[Music] thank you hello everyone I hope you ar...,english,music thank hello everyone hope great era ai w...,thank well explained video tutorial\n useful t...,1.0
1,QaoDXYYtgK0,English (auto-generated),number three [Music] Facebook has enacted an e...,english,number three music facebook enacted emergency ...,quotthey invented code languagequot man thats ...,1.0
2,PqDwddEHswU,English (auto-generated),in this series we're going to introduce deep l...,english,series going introduce deep learning least per...,brian douglas hero\n learned far sitting couch...,0.9972
3,B-Y7rnOa43w,English (auto-generated),this is how to earn money with AI and it's par...,english,earn money ai part two let go want create kind...,oh im going try,0.0
4,vyit-1zKsZ4,English (auto-generated),when current Medical Science has run out of op...,english,current medical science run option doctor dont...,locked syndrome lateral amyloid multiple scler...,0.9994


# TF-IDF model for `df_transcripts` and `df_comments`

# Modelo 1

Variables: `clean_text`, `video_id`

In [14]:
# Restrict the model input to rows whose language is 'english' and keep only
# the id, cleaned text, comment list and sentiment columns. The copy detaches
# model1 from df_content so later edits do not touch the source frame.
is_english = df_content['language'] == 'english'
model1_columns = ['video_id', 'clean_text', 'comment_list', 'sentiment']
model1 = df_content.loc[is_english, model1_columns].copy()
model1.head()

Unnamed: 0,video_id,clean_text,comment_list,sentiment
0,qtlUwwtvuEg,music thank hello everyone hope great era ai w...,thank well explained video tutorial\n useful t...,1.0
1,QaoDXYYtgK0,number three music facebook enacted emergency ...,quotthey invented code languagequot man thats ...,1.0
2,PqDwddEHswU,series going introduce deep learning least per...,brian douglas hero\n learned far sitting couch...,0.9972
3,B-Y7rnOa43w,earn money ai part two let go want create kind...,oh im going try,0.0
4,vyit-1zKsZ4,current medical science run option doctor dont...,locked syndrome lateral amyloid multiple scler...,0.9994



El parámetro ngram_range en TfidfVectorizer se refiere a los tamaños de n-gramas que el vectorizador debe considerar al analizar el texto. Un n-grama es una secuencia de "n" palabras consecutivas en un texto.

Desglose de `ngram_range=(1, 2)`:

- **Unigrama (1-gram):** Es una secuencia de 1 palabra. Si `ngram_range=(1, 1)`, el vectorizador solo consideraría palabras individuales.
  Ejemplo: Para la frase "hola mundo", los unigramas serían: ["hola", "mundo"].
- **Bigrama (2-gram):** Es una secuencia de 2 palabras consecutivas. Si `ngram_range=(2, 2)`, el vectorizador solo consideraría pares de palabras consecutivas.
  Ejemplo: Para la frase "hola mundo", los bigramas serían: ["hola mundo"].
- **`ngram_range=(1, 2)`:** Con este rango, el vectorizador considerará tanto unigramas como bigramas. Es decir, se generarán características tanto para palabras individuales como para pares de palabras consecutivas.
  Ejemplo: Para la frase "hola mundo", los n-gramas serían: ["hola", "mundo", "hola mundo"].

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Configure the TF-IDF vectorizer: unigrams and bigrams, English stop words
# removed, vocabulary capped via max_features.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=5000,
)

# Fit on the clean_text column of model1 and transform it in one step
# (note: this is the transcript text, not the comments).
tfidf_transcripts = tfidf_vectorizer.fit_transform(model1['clean_text'])

# Dense DataFrame view of the matrix, one column per learned feature name,
# used only to inspect the result.
tfidf_transcripts_df = pd.DataFrame(
    data=tfidf_transcripts.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print(f'tfidf_transcript: {tfidf_transcripts_df.shape}')


tfidf_transcript: (1496, 5000)


# Get recommendations based on transcriptions `VALID`

La similitud coseno (cosine similarity) es una medida de similitud entre dos vectores en un espacio vectorial que calcula el coseno del ángulo entre ellos. Es ampliamente utilizada en procesamiento de lenguaje natural (NLP), recuperación de información, y otras áreas relacionadas con la comparación de texto o documentos, ya que es eficiente y proporciona una buena estimación de la similitud entre documentos o textos representados como vectores.

In [16]:
# Rebuild model1 for the recommendation step: English rows only, keeping the
# video id, cleaned text and comment list (no sentiment column here).
english_rows = df_content['language'] == 'english'
recommendation_columns = ['video_id', 'clean_text', 'comment_list']
model1 = df_content.loc[english_rows, recommendation_columns].copy()
model1.head()

Unnamed: 0,video_id,clean_text,comment_list
0,qtlUwwtvuEg,music thank hello everyone hope great era ai w...,thank well explained video tutorial\n useful t...
1,QaoDXYYtgK0,number three music facebook enacted emergency ...,quotthey invented code languagequot man thats ...
2,PqDwddEHswU,series going introduce deep learning least per...,brian douglas hero\n learned far sitting couch...
3,B-Y7rnOa43w,earn money ai part two let go want create kind...,oh im going try
4,vyit-1zKsZ4,current medical science run option doctor dont...,locked syndrome lateral amyloid multiple scler...


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

video_id = 'PqDwddEHswU'

# Locate the query video by POSITION within model1, not by index label.
# model1 is a filtered slice of df_content and keeps the original labels,
# while the rows of tfidf_transcripts are positional; using the label would
# select the wrong row as soon as a filtered-out row precedes the query video.
matching_positions = np.flatnonzero(model1['video_id'].to_numpy() == video_id)
if matching_positions.size == 0:
    raise ValueError(f'video_id {video_id!r} not found in model1')
index = int(matching_positions[0])

# Cosine similarity between the query video and every video (itself included)
cosine_similarities = cosine_similarity(tfidf_transcripts[index], tfidf_transcripts).flatten()

# Pair each video_id with its similarity score; to_numpy() keeps the pairing
# purely positional, matching the row order of tfidf_transcripts.
similar_videos = pd.DataFrame({
    'video_id': model1['video_id'].to_numpy(),
    'similarity': cosine_similarities
})

# Drop the query video itself and rank the rest from most to least similar
similar_videos = similar_videos[similar_videos['video_id'] != video_id].sort_values(by='similarity', ascending=False)

# Attach the title from df_videos, then discard the duplicated key column
similar_videos = similar_videos.merge(df_videos[['videoId', 'title']], left_on='video_id', right_on='videoId')
similar_videos = similar_videos.drop(columns='videoId')

similar_videos.head()

Unnamed: 0,video_id,similarity,title
0,3cSjsTKtN9M,0.512788,What Is Deep Learning?
1,8duf31d6oyY,0.465386,2️⃣5️⃣ Deep Learning: how machines learn like ...
2,VOaoabf3LPM,0.461375,Artificial Intelligence Full Course in 10 Hour...
3,cDTp-qXXZU0,0.455235,Machine Learning vs. Deep Learning [What's the...
4,VyWAvY2CF9c,0.448958,Deep Learning Crash Course for Beginners


# Final Score based on content and comments sentiment

$$
\text{Final Score} = \text{Similarity} \times (1 + \text{Sentiment\_Score})
$$

In [18]:
# Re-read df_content from the same CSV path used in the initial load cell
# ('../data/clean_data/' + 'df_content.csv'), replacing the in-memory frame.
df_content = pd.read_csv('../data/clean_data/df_content.csv')

In [19]:
# Attach the df_content columns (including sentiment) to each recommended
# video. how='right' keeps every row of similar_videos even when df_content
# has no matching video_id.
df_merged = df_content.merge(similar_videos, on='video_id', how='right')

# fillna returns a new Series, so the result must be assigned back; otherwise
# missing sentiment stays NaN and propagates into final_score.
df_merged['sentiment'] = df_merged['sentiment'].fillna(0)

# Final Score = similarity * (1 + sentiment)
df_merged['final_score'] = df_merged['similarity'] * (1 + df_merged['sentiment'])
df_merged = df_merged.sort_values(by='final_score', ascending=False)

selected_columns = ['video_id', 'title', 'final_score']
df_merged[selected_columns].head(10)

Unnamed: 0,video_id,title,final_score
0,3cSjsTKtN9M,What Is Deep Learning?,1.019678
1,8duf31d6oyY,2️⃣5️⃣ Deep Learning: how machines learn like ...,0.930772
2,VOaoabf3LPM,Artificial Intelligence Full Course in 10 Hour...,0.922335
4,VyWAvY2CF9c,Deep Learning Crash Course for Beginners,0.897916
5,O5xeyoRL95U,Deep Learning Basics: Introduction and Overview,0.896204
3,cDTp-qXXZU0,Machine Learning vs. Deep Learning [What's the...,0.890395
6,7x2YZhEj9Dw,Deep Learning Tutorial with Python | Machine L...,0.850921
7,HGXlFG_Rz4E,Deep Learning Interview Questions and Answers ...,0.828183
9,9dFhZFUkzuQ,Machine Learning vs Deep Learning vs Artificia...,0.815358
8,atiYXm7JZv0,Machine Learning with R and TensorFlow,0.815333


# MODEL EVALUATION

# Precision@K

$$
\text{Precision@K} = \frac{\text{Número de elementos relevantes en el top K}}{K}
$$


# Recall@K
$$
\text{Recall@K} = \frac{\text{Número de elementos relevantes en el top K}}{\text{Número total de elementos relevantes}}
$$


# F1-Score@K
$$
\text{F1-Score@K} = 2 \times \frac{\text{Precision@K} \times \text{Recall@K}}{\text{Precision@K} + \text{Recall@K}}
$$
