In [7]:
import requests
import json
import math
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)


def euclidean_distance(vector1, vector2):
    if len(vector1) != len(vector2):
        raise ValueError("Vectors must be of the same length")
    
    distance = math.sqrt(sum((a - b) ** 2 for a, b in zip(vector1, vector2)))
    return distance



In [8]:
def get_ocean_vector(text):
    ocean = json.loads(requests.post(url='http://127.0.0.1:3000/predict',json={
        "id" :None,
        "text":text,
        "model":"ocean"
    }).text)
    predictions = ocean['predictions']
    vector = [predictions['O'], predictions['C'], predictions['E'], predictions['A'], predictions['N']]
    return vector

In [9]:
import glob,json
files = glob.glob('../dataset/*.json')

In [10]:
def find_similarity(pair,type='cos'):
    with open(pair[0],'r') as file:
        text1 = json.loads(file.read())['text']
        file.close()
    with open(pair[1],'r') as file:
        text2 = json.loads(file.read())['text']
        file.close()
    vec1 = get_ocean_vector(text1)
    vec2 = get_ocean_vector(text2)

    if type == 'cos': similarity = cosine_similarity(vec1,vec2)
    else: similarity = euclidean_distance(vec1,vec2)

    return [pair[0],pair[1],round(float(similarity),5)]
    

In [35]:
from itertools import combinations
pairs = list(combinations(files, 2))

dataset = []

for pair in pairs:
    dataset.append(find_similarity(pair,'ec'))

similarity_df = pd.DataFrame(dataset,columns=['Transcript_One','Transcript_Two','Similarity Index'])


In [36]:
similarity_df = similarity_df.sort_values(by='Similarity Index',ascending=False)

In [37]:
similarity_df['Transcript_One'] = similarity_df['Transcript_One'].apply(lambda x: x.replace('../dataset/', ''))
similarity_df['Transcript_One'] = similarity_df['Transcript_One'].apply(lambda x: x.replace('.json', ''))
similarity_df['Transcript_Two'] = similarity_df['Transcript_Two'].apply(lambda x: x.replace('../dataset/', ''))
similarity_df['Transcript_Two'] = similarity_df['Transcript_Two'].apply(lambda x: x.replace('.json', ''))

In [38]:
similarity_df

Unnamed: 0,Transcript_One,Transcript_Two,Similarity Index
5,seeking_alpha_transcripts-4,video_transcripts-2,5.12543
3,seeking_alpha_transcripts-4,seeking_alpha_transcripts-3,4.72758
14,video_transcripts-1,video_transcripts-2,4.60977
12,video_transcripts-1,seeking_alpha_transcripts-3,4.2107
10,seeking_alpha_transcripts-5,video_transcripts-2,2.86007
4,seeking_alpha_transcripts-4,seeking_alpha_transcripts-1,2.82312
2,seeking_alpha_transcripts-4,seeking_alpha_transcripts-2,2.61916
17,seeking_alpha_transcripts-2,video_transcripts-2,2.51595
0,seeking_alpha_transcripts-4,seeking_alpha_transcripts-5,2.51595
8,seeking_alpha_transcripts-5,seeking_alpha_transcripts-3,2.46982


In [39]:
import pandas as pd
import plotly.express as px


df = similarity_df

df['hover_text'] = df['Transcript_One'] + " vs " + df['Transcript_Two'] + "<br>Similarity: " + df['Similarity Index'].astype(str)

# Plot using Plotly
fig = px.scatter(
    df,
    x=df.index,
    y='Similarity Index',
    color='Similarity Index',
    hover_name='hover_text',
    title='Similarity Plot',
    labels={'x': 'Pair Index', 'Similarity Index': 'Similarity Index'},
    color_continuous_scale='Viridis'
)

# Update layout
fig.update_traces(marker=dict(size=12))
fig.update_layout(
    xaxis_title="Pair Index",
    yaxis_title="Similarity Index",
    coloraxis_colorbar=dict(title="Similarity Index"),
    hovermode="closest"
)

# Show plot
fig.show()

In [3]:
!pip3 install jupyterlab "ipywidgets>=7.5"

Defaulting to user installation because normal site-packages is not writeable
Collecting jupyterlab
  Downloading jupyterlab-4.3.3-py3-none-any.whl (11.7 MB)
[K     |████████████████████████████████| 11.7 MB 2.0 MB/s eta 0:00:01
[?25hCollecting ipywidgets>=7.5
  Downloading ipywidgets-8.1.5-py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 1.2 MB/s eta 0:00:01
Collecting jupyter-server<3,>=2.4.0
  Downloading jupyter_server-2.14.2-py3-none-any.whl (383 kB)
[K     |████████████████████████████████| 383 kB 1.8 MB/s eta 0:00:01
[?25hCollecting notebook-shim>=0.2
  Downloading notebook_shim-0.2.4-py3-none-any.whl (13 kB)
Collecting tomli>=1.2.2
  Downloading tomli-2.2.1-py3-none-any.whl (14 kB)
Collecting async-lru>=1.0.0
  Downloading async_lru-2.0.4-py3-none-any.whl (6.1 kB)
Collecting httpx>=0.25.0
  Downloading httpx-0.28.1-py3-none-any.whl (73 kB)
[K     |████████████████████████████████| 73 kB 1.0 MB/s eta 0:00:01
[?25hCollecting jupyter-lsp>=2.0.0
  D

In [2]:
!pip3 install plotly

Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-5.24.1-py3-none-any.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 863 kB/s eta 0:00:01
Installing collected packages: plotly
Successfully installed plotly-5.24.1
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
