In [3]:
import ast
import math
from collections import Counter

import pandas as pd

# Load the processed reviews; the path is relative to the current working directory.
file_path = "Processed_Reviews.csv"
df = pd.read_csv(file_path)

# The 'tokenized' column stores each token list as its string representation,
# so it has to be parsed back into a Python list. Rows with a missing value
# are dropped first.
# NOTE(review): this previously used eval(), which executes arbitrary code found
# in the CSV. ast.literal_eval accepts only Python literals (lists, strings,
# numbers, ...) and raises on anything else; confirm every cell is a plain
# list literal.
tokenized_reviews = df['tokenized'].dropna().apply(ast.literal_eval)

# TF computation
def compute_tf(document):
    """Return the term frequency of every distinct token in one document.

    Args:
        document: Sequence of hashable tokens (it must support ``len``).

    Returns:
        dict mapping each distinct token to ``occurrences / len(document)``.
        An empty document yields an empty dict.
    """
    frequencies = {}
    for token, occurrences in Counter(document).items():
        # Normalise the raw count by the document length.
        frequencies[token] = occurrences / len(document)
    return frequencies

# IDF computation
def compute_idf(documents):
    """Return the inverse document frequency of every word in a corpus.

    The score for a word is ``math.log(N / df)``, where ``N`` is the number of
    documents and ``df`` is the number of documents that contain the word at
    least once. A word present in every document therefore scores 0.0.

    Args:
        documents: Sequence of documents, each an iterable of hashable tokens.

    Returns:
        dict mapping each word to its IDF score, keyed in order of first
        appearance in the corpus. An empty corpus yields an empty dict.
    """
    N = len(documents)

    # Count document frequencies in a single pass over the corpus instead of
    # rescanning every document once per vocabulary word.
    doc_freq = Counter()
    for doc in documents:
        # dict.fromkeys de-duplicates the tokens (a word counts once per
        # document) while keeping first-seen order, so the key order of the
        # result is deterministic.
        for word in dict.fromkeys(doc):
            doc_freq[word] += 1

    return {word: math.log(N / count) for word, count in doc_freq.items()}

# TF-IDF computation
def compute_tfidf(document, idf):
    """Return the TF-IDF weight of every distinct token in one document.

    Args:
        document: Sequence of tokens, passed to ``compute_tf``.
        idf: Mapping from token to IDF score. It must contain every token of
            ``document``; a missing token raises ``KeyError``.

    Returns:
        dict mapping each distinct token to its term frequency multiplied by
        its IDF score.
    """
    return {
        term: weight * idf[term]
        for term, weight in compute_tf(document).items()
    }

# Convert tokenized reviews to list of lists
documents = tokenized_reviews.tolist()

# Output files, relative to the current working directory. Each path is defined
# once so the confirmation messages below always report where the data was
# actually written (they previously claimed /mnt/data/, which was never used).
tf_path = "tf_scores.csv"
idf_path = "idf_scores.csv"
tfidf_path = "tfidf_scores.csv"

# TF: one row per document, one column per word; words absent from a
# document are filled with 0.
tf_data = [compute_tf(doc) for doc in documents]
tf_df = pd.DataFrame(tf_data).fillna(0)
tf_df.to_csv(tf_path, index=False)

# IDF: a single row holding the corpus-wide score of every word.
idf = compute_idf(documents)
idf_df = pd.DataFrame([idf]).fillna(0)
idf_df.to_csv(idf_path, index=False)

# TF-IDF: same layout as the TF table.
tfidf_data = [compute_tfidf(doc, idf) for doc in documents]
tfidf_df = pd.DataFrame(tfidf_data).fillna(0)
tfidf_df.to_csv(tfidf_path, index=False)

print(f"TF Scores saved to {tf_path}")
print(f"IDF Scores saved to {idf_path}")
print(f"TF-IDF Scores saved to {tfidf_path}")


TF Scores saved to /mnt/data/tf_scores.csv
IDF Scores saved to /mnt/data/idf_scores.csv
TF-IDF Scores saved to /mnt/data/tfidf_scores.csv
