# TF-IDF & Word2Vec Analysis

In [None]:
#imports
import ast

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#import word2vec
from gensim.models import Word2Vec

## TF-IDF Analysis

In [None]:
#load data set
# read_csv returns every cell as text, so the 'Top 10 TF-IDF Words' column has
# to be turned back into real lists of (word, score) pairs before it can be
# unpacked or iterated pair-by-pair.
# NOTE(review): assumes the column was written as the Python repr of the list
# (e.g. "[('word', 0.12), ...]") -- confirm against the script that writes it.
def _parse_word_scores(cell):
    """Parse the text form of a [(word, score), ...] list; blank cells give []."""
    cell = cell.strip()
    return ast.literal_eval(cell) if cell else []


df = pd.read_csv(
    '../results/tfidf_enriched.csv',
    converters={'Top 10 TF-IDF Words': _parse_word_scores},
)

In [None]:
# Split each row's (word, score) pairs into two parallel Series of lists.
# Every cell of the column is expected to hold an iterable of 2-item pairs.
top_pairs = df['Top 10 TF-IDF Words']
words_lists = top_pairs.apply(lambda pairs: [term for term, _ in pairs])
scores_lists = top_pairs.apply(lambda pairs: [weight for _, weight in pairs])

# Horizontal violin plot of the TF-IDF scores, with quartile lines inside
plt.figure(figsize=(10, 6))
sns.violinplot(data=scores_lists, orient='h', inner='quartile', palette='viridis')
plt.title('Distribution of TF-IDF Scores for Each Filename')
plt.xlabel('TF-IDF Score')
# One y tick per row of df, labelled with that row's filename
plt.yticks(ticks=range(len(df)), labels=df['Filename'])

# Write each word next to its score, shifted 0.01 along the score axis so the
# text does not sit directly on the plotted value
for row_idx, (row_words, row_scores) in enumerate(zip(words_lists, scores_lists)):
    for term, weight in zip(row_words, row_scores):
        plt.text(weight + 0.01, row_idx, term, va='center')

plt.show()

In [None]:
# Create TF-IDF matrix
# The matrix is dense with shape (number of files, number of distinct words).
# Column j always belongs to the j-th distinct word in first-seen order, so the
# x tick labels taken from word_to_index line up with the plotted values.
# Each cell of the column is expected to hold a list of (word, score) pairs.
word_to_index = {}
scores_per_file = []

for word_score_list in df['Top 10 TF-IDF Words']:
    scores_by_word = {}
    for item in word_score_list:
        if len(item) == 2:  # ignore entries that are not (word, score) pairs
            word, score = item
            # Assign the next free column the first time a word is seen
            word_to_index.setdefault(word, len(word_to_index))
            scores_by_word[word] = score
    scores_per_file.append(scores_by_word)

# Words missing from a file's list have no known score, so they stay NaN
# rather than being reported as 0.
tfidf_matrix = np.full((len(scores_per_file), len(word_to_index)), np.nan)
for row_idx, scores_by_word in enumerate(scores_per_file):
    for word, score in scores_by_word.items():
        tfidf_matrix[row_idx, word_to_index[word]] = score

# Create a heatmap of TF-IDF scores
plt.figure(figsize=(10, 8))
sns.heatmap(tfidf_matrix, cmap='viridis', annot=True, fmt='.2f',
            xticklabels=list(word_to_index.keys()), yticklabels=df['Filename'])
plt.xlabel('Word')
plt.ylabel('Filename')
plt.title('TF-IDF Scores for Each Filename and Word')
plt.show()

## Word2Vec Analysis

In [None]:
# Load the previously saved Word2Vec model from the models folder
word2vec_path = '../models/word2vec_model.model'
model = Word2Vec.load(word2vec_path)