# Cosine Similarity with Transcriptions

Notebook used to find cosine similarity between video transcriptions and NYT articles

In [98]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os
import re

In [2]:
# get combined pdf to access dates

cwd = os.getcwd()
metadata_dir = f'{cwd}/../pre-processing/metadata-csv'
metadata_files = [file for file in os.listdir(metadata_dir) if "Sec2Gr3_" in file]      # only get metadata files for our group

dataframes = []

for file in metadata_files:
    file_path = os.path.join(metadata_dir, file)
    df = pd.read_csv(file_path)
    dataframes.append(df)

combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.head()

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
1,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
2,7283080657893379334,2023-09-26T06:32:40,15.0,PH,"angels in tibet, Jam Republic, angels in tibet...",419100.0,3518.0,708.0,2600000.0,🧠🧠🧠,False,,clarkie_cpm,Clarkie,,,,,,False
3,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
4,7285397643725983008,2023-10-02T12:23:48,37.0,US,"Dream Academy, angels in tibet, Adela Dream Ac...",142700.0,1373.0,551.0,1000000.0,s/o to dream academy for teaching me how to da...,False,,adelajergova,ADÉLA,,,,,,False


In [29]:
transcript_dir = f'{cwd}/../transcription/txt-transcripts/'

transcripts = [file for file in os.listdir(transcript_dir) if file.endswith('.txt')]

transcript_dict = {'video_id': [], 'transcription': []}

for file in transcripts:
    video_id = file.split('.')[0]
    with open(os.path.join(transcript_dir, file), "r") as transcription_file:
        transcription = transcription_file.read()

    transcript_dict['video_id'].append(video_id)
    transcript_dict['transcription'].append(transcription)

transcript_df = pd.DataFrame(transcript_dict)

In [30]:
transcript_df.head()

Unnamed: 0,video_id,transcription
0,7300021255258901806,I'm sorry.
1,7236076693822246170,Outro
2,7301769517041126702,"Okay, right. No, I'm getting with this."
3,7289191984529575214,"People, open your eyes. We are supporting gen..."
4,7297432141485444394,The Zionist argument we will address today is...


In [31]:
date_df = combined_df[['video_id', 'video_timestamp']]
date_df['video_date'] = date_df['video_timestamp'].str[:10]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df['video_date'] = date_df['video_timestamp'].str[:10]


In [23]:
# date_df = date_df.set_index('video_id')
# transcript_df = transcript_df.set_index('video_id')

In [69]:
date_df.loc[:, 'video_id']

0        7273221955937914155
1        7273221955937914155
2        7283080657893379334
3        7273221955937914155
4        7285397643725983008
                ...         
24910    7283846172425407750
24911    7285527057394584863
24912    7284024264334806315
24913    7284462467290303787
24914    7286843613030518059
Name: video_id, Length: 24915, dtype: object

In [76]:
date_df['video_id'] = date_df.loc[:, 'video_id'].astype(str)
transcript_df['video_id'] = transcript_df['video_id'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  date_df['video_id'] = date_df.loc[:, 'video_id'].astype(str)


In [92]:
transcript_date_df = pd.merge(transcript_df, date_df, on='video_id')
transcript_date_df.head()

Unnamed: 0,video_id,transcription,video_timestamp,video_date
0,7300021255258901806,I'm sorry.,2023-11-10T21:11:00,2023-11-10
1,7236076693822246170,Outro,2023-05-22T14:33:29,2023-05-22
2,7301769517041126702,"Okay, right. No, I'm getting with this.",2023-11-15T14:15:04,2023-11-15
3,7289191984529575214,"People, open your eyes. We are supporting gen...",2023-10-12T17:47:59,2023-10-12
4,7297432141485444394,The Zionist argument we will address today is...,2023-11-03T22:43:55,2023-11-03


In [93]:
print(transcript_df.shape)
print(transcript_date_df.shape) ### PROBLEM!! SOLVE THIS: why are there more rows now??

(406, 2)
(432, 4)


In [10]:
# load the Universal Sentence Encoder's TF Hub module
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [11]:
# cosine similarity function, from week 7 notebook
from numpy.linalg import norm

def cosineSimilarity(vec1, vec2):
    """Calculate the cosine similarity between two vectors."""
    V1 = np.array(vec1)
    V2 = np.array(vec2)
    cosine = np.dot(V1, V2)/(norm(V1)*norm(V2))
    return cosine

In [85]:
from get_nyt_articles_revised import filter_by_date, filter_by_week

In [101]:
def cosine_sim_nyt(nyt_df, col_name, trans_embedding):
    max_similarity = -1  # initialize maximums
    max_index = -1
    for index, nyt_row in nyt_df.iterrows():
        cosine_similarities = {}
        nyt_embedding = embed([nyt_row[col_name]])[0]               # universal sentence encoder
    
         # calculate cosine similarity
        cosine_sim = cosineSimilarity(trans_embedding, nyt_embedding)

        if cosine_sim > max_similarity:
            max_similarity = cosine_sim
            max_index = index
            top_row = nyt_df.loc[max_index]
    headline = top_row['headline']
    
    print(f"Index of Maximum Cosine Similarity for {col_name}:", max_index)
    print(f"Maximum Cosine Similarity for {col_name}:", max_similarity)

    return (max_similarity, headline, col_name)

In [99]:
import re
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

In [100]:
def clean_transcription(transcript):
    """
    Helper function, takes video transcript and splits into words, removes punctuation, and stop words
    """
    if pd.isna(description):  
        return [] 
    
    # remove numbers from the text
    description = re.sub(r'\d+', '', description)

    # split the description into words
    words = description.split()
    
    # make everything lowercase
    cleaned_words = [word.lower() for word in words]
    
    # remove stop words
    cleaned_words = [word for word in cleaned_words if word not in stop_words]

    # remove empty strings
    cleaned_words = [word for word in cleaned_words if word]

    sentence = " ".join(cleaned_words)
    
    return sentence

In [None]:
def split_keywords(text):
    """Split text into individual keywords based on whitespace and punctuation, remove stop words"""
    if pd.isna(text):  # check if text is NaN
        return []  
    
    # split text into individual keywords based on whitespace and punctuation
    keywords = re.findall(r'\b\w+\b', text)

    # remove stop words
    cleaned_words = [word.lower() for word in keywords if word not in stop_words]
    
    return cleaned_words

def clean_headline(text):
    """Split headline into individual words based on whitespace and punctuation, remove stop words"""
    if pd.isna(text):  # check if text is NaN
        return []  
    
    # remove numbers from the text
    text = re.sub(r'\d+', '', text)

    # split text into individual keywords based on whitespace
    keywords = text.split()

    # remove stop words
    cleaned_words = [word.lower() for word in keywords if word not in stop_words]
    
    return cleaned_words

In [None]:
total_num_videos = len(transcript_date_df)
for index, row in transcript_date_df.iterrows():
    print(f"Processing video {index+1}/{total_num_videos}, video id: {row['video_id']}")

    transcript_embedding = embed([row['transcription']])[0]              # universal sentence encoder
    nyt_df = filter_by_week(row['video_date'])

    ## Part 1: transcription comparison to headline, abstract, lead_paragraph
    headline_comp = cosine_sim_nyt(nyt_df, 'headline', transcript_embedding)
    abstract_comp = cosine_sim_nyt(nyt_df, 'abstract', transcript_embedding)
    leadpara_comp = cosine_sim_nyt(nyt_df, 'lead_paragraph', transcript_embedding)

    transcript_date_df['headline_sim'] = headline_comp[0]
    transcript_date_df['abstract_sim'] = abstract_comp[0]
    transcript_date_df['leadpara_sim'] = leadpara_comp[0]

    max = -1
    top_headline = 'xxx'
    type_comp = 'yyy'
    for headline, cosine_sim, col in [headline_comp, abstract_comp, leadpara_comp]:
        if cosine_sim > max:
            max = cosine_sim
            top_headline = headline
            type_comp = col
    
    transcript_date_df['top_cosine_sim'] = max
    transcript_date_df['top_headline'] = top_headline
    transcript_date_df['top_headline_fromtype'] = type_comp

    ## Part 2: transcription keyword comparison to NYT keywords (revised by us)
    nyt_df['keywords_cleaned'] = nyt_df['keywords'].apply(split_keywords)
    nyt_df['headline_cleaned'] = nyt_df['headline'].apply(clean_headline)
    nyt_df['nyt_keywords'] = nyt_df['keywords_cleaned'] + nyt_df['headline_cleaned']
    nyt_df['article_sentence'] = nyt_df.apply(lambda row: ' '.join(row['keywords_cleaned'] + row['headline_cleaned']), axis=1)
    
    transcription_key_sen = clean_transcription(row['trancription'])
    trans_key_embedding = embed([transcription_key_sen])[0]

    # find cosine similarity for each article 
    max_sim_keywords = -1  # initialize maximums
    max_index_keywords = -1
    for index2, nyt_row in nyt_df.iterrows():
        cosine_similarities_kyewords = {}
        nyt_embedding = embed([nyt_row['article_sentence']])[0]               # universal sentence encoder
    
         # calculate cosine similarity
        cosine_sim_key = cosineSimilarity(trans_key_embedding, nyt_embedding)

        if cosine_sim_key > max_sim_keywords:
            max_sim_keywords = cosine_sim_key
            max_index_keywords = index2
            top_row_key = nyt_df.loc[max_index_keywords]
    headline_key = top_row_key['headline']
    
    print("Index of Maximum Cosine Similarity for Keywords:", max_index_keywords)
    print("Maximum Cosine Similarity for Keywords:", max_sim_keywords)

    transcript_date_df['top_cosine_sim_keywords'] = max_sim_keywords
    transcript_date_df['top_headline_keywords'] = headline_key
    transcript_date_df['transcript_keywords'] = transcription_key_sen
    transcript_date_df['top_nyt_article_keywords'] = top_row_key['article_sentence']

In [None]:
transcript_date_df.head()