# Cosine Similarity with Transcriptions

This notebook computes the cosine similarity between video transcriptions and NYT articles.

In [9]:
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import os

In [13]:
# Build one combined DataFrame from the metadata CSVs so the video dates can be looked up

cwd = os.getcwd()
metadata_dir = f'{cwd}/../pre-processing/metadata-csv'

# keep only the metadata files whose name contains our group's tag
metadata_files = [name for name in os.listdir(metadata_dir) if "Sec2Gr3_" in name]

# read each CSV in listing order, then stack them under a fresh 0..n-1 index
dataframes = [pd.read_csv(os.path.join(metadata_dir, name)) for name in metadata_files]

combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.head()

Unnamed: 0,video_id,video_timestamp,video_duration,video_locationcreated,suggested_words,video_diggcount,video_sharecount,video_commentcount,video_playcount,video_description,video_is_ad,video_stickers,author_username,author_name,author_followercount,author_followingcount,author_heartcount,author_videocount,author_diggcount,author_verified
0,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
1,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
2,7283080657893379334,2023-09-26T06:32:40,15.0,PH,"angels in tibet, Jam Republic, angels in tibet...",419100.0,3518.0,708.0,2600000.0,🧠🧠🧠,False,,clarkie_cpm,Clarkie,,,,,,False
3,7273221955937914155,2023-08-30T16:56:01,37.0,US,"angels in tibet, angels in tibet dance, angels...",356300.0,5606.0,986.0,2000000.0,Replying to @jade🐉not perfect yet & i made a ...,False,,thebeaulexx,beaulexx,,,,,,False
4,7285397643725983008,2023-10-02T12:23:48,37.0,US,"Dream Academy, angels in tibet, Adela Dream Ac...",142700.0,1373.0,551.0,1000000.0,s/o to dream academy for teaching me how to da...,False,,adelajergova,ADÉLA,,,,,,False


In [11]:
# Read every .txt transcript into a DataFrame with one row per transcript file.
# NOTE(review): the folder name below contains a space before the slash
# ('transcription /') -- confirm it matches the directory on disk.
transcript_dir = f'{cwd}/../transcription /txt-transcripts/'

transcripts = [file for file in os.listdir(transcript_dir) if file.endswith('.txt')]

transcript_dict = {'video_id': [], 'transcription': []}

for file in transcripts:
    # the video id is the part of the file name before the first '.'
    # NOTE(review): this keeps video_id as a string; the metadata frame gets its
    # video_id from read_csv -- check the dtypes agree before joining the two.
    video_id = file.split('.')[0]

    # Decode explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding (transcripts may contain non-ASCII text).
    with open(os.path.join(transcript_dir, file), "r", encoding="utf-8") as transcription_file:
        transcription = transcription_file.read()

    transcript_dict['video_id'].append(video_id)
    transcript_dict['transcription'].append(transcription)

transcript_df = pd.DataFrame(transcript_dict)

In [12]:
# Preview the first rows of the transcript table (video_id, transcription)
transcript_df.head()

Unnamed: 0,video_id,transcription
0,7236076693822246170,Outro
1,7295142634719415595,"Yeah, yeah, uh, hang up. Huh?"
2,7305273374962371882,Here's Curry. Curry looking to take Wembley. ...
3,7299203592739704106,We have an awesome bathtub. It's a jacuzzi tu...
4,7295523111103991083,Lalalilililililililililililililililililililil...


In [None]:
# Keep only the id and timestamp columns and derive a date string from the timestamp.
# .assign() returns a new DataFrame, so the new column is never written onto a
# slice of combined_df (which would raise pandas' SettingWithCopyWarning).
# NOTE(review): the combined_df preview shows the same video_id on several rows;
# deduplicate here if downstream steps expect one row per video.
date_df = (
    combined_df[['video_id', 'video_timestamp']]
    # first 10 characters of the timestamp string, e.g. '2023-08-30' from '2023-08-30T16:56:01'
    .assign(video_date=lambda frame: frame['video_timestamp'].str[:10])
)

In [3]:
# load the Universal Sentence Encoder's TF Hub module (version 4, from the URL below)
# and bind the loaded object to `embed`.
# Note: no active code in the cells shown in this notebook calls `embed`.
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [2]:
# cosine similarity function, from week 7 notebook
from numpy.linalg import norm

def cosineSimilarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Both inputs are converted with ``np.array``; the result is their dot
    product divided by the product of their norms.

    Args:
        vec1: First vector (any array-like accepted by ``np.array``).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine similarity as computed by ``np.dot`` and ``norm``.
    """
    first = np.array(vec1)
    second = np.array(vec2)
    magnitude_product = norm(first) * norm(second)
    return np.dot(first, second) / magnitude_product

In [5]:
from get_nyt_articles_revised import filter_by_date

In [6]:
def get_abstract(headline, date):
    """Given a NYT headline and date, return the matching abstract for comparison.

    Args:
        headline: Headline text to look up; compared for exact equality with
            the 'headline' value of each row.
        date: Passed unchanged to ``filter_by_date`` to select candidate rows.

    Returns:
        The 'abstract' value of the matching row (the last one if several rows
        share the headline), or ``None`` when no row matches.
    """
    nyt_df = filter_by_date(date)

    # Start from None so a headline with no match yields None instead of
    # raising UnboundLocalError at the return statement.
    abstract = None

    for _, nyt_row in nyt_df.iterrows():
        if nyt_row['headline'] == headline:
            abstract = nyt_row['abstract']

    return abstract
    

In [None]:
# TODO: create df with video_id, date, transcription, and abstract (this cell is still empty)


In [None]:
# Score each row's video text against its NYT text and collect the results by video id.
# NOTE(review): `comparison_df` and `sbert_model` are not defined in any cell shown
# in this notebook -- confirm where they come from before running on a fresh kernel.

cosine_similarities = {}

for _, pair in comparison_df.iterrows():
    # Encode each text as a one-item batch and keep its single embedding (SBERT model).
    # The Universal Sentence Encoder alternative would be embed([text])[0].
    video_vec = sbert_model.encode([pair['video_sentences']])[0]
    nyt_vec = sbert_model.encode([pair['nyt_sentences']])[0]

    # A repeated video_id overwrites the score stored for an earlier row.
    cosine_similarities[pair['video_id']] = cosineSimilarity(video_vec, nyt_vec)

cosine_similarities