In [None]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
import os
import re
import math
from collections import Counter

In [None]:
import nltk
# Download the NLTK data the cleaning step relies on: 'punkt' for
# word_tokenize and 'stopwords' for the English stopword list.
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
def clean_text(text):
    """Normalize raw text into a space-separated string of stemmed tokens.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``.
    Tokens that are not purely alphabetic (punctuation, numbers, mixed
    tokens) or that are English stopwords are dropped; every remaining token
    is reduced with the Porter stemmer.

    Args:
        text: Raw document text.

    Returns:
        The kept, stemmed tokens joined by single spaces.
    """
    raw_tokens = word_tokenize(text.lower())

    english_stopwords = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    stems = [
        stemmer.stem(word)
        for word in raw_tokens
        if word.isalpha() and word not in english_stopwords
    ]
    return ' '.join(stems)


In [None]:
# Step 2: Calculating TF-IDF
def calculate_tf(text):
    """Compute relative term frequencies for a single document.

    The text is lowercased and split into runs of word characters; each
    distinct term is mapped to its occurrence count divided by the total
    number of terms in the document.

    Args:
        text: Document text.

    Returns:
        Dict mapping term -> relative frequency. Empty when the text
        contains no word characters.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    n_tokens = len(tokens)

    frequencies = {}
    for term, occurrences in Counter(tokens).items():
        frequencies[term] = occurrences / n_tokens
    return frequencies

def calculate_idf(documents):
    """Compute inverse document frequency for every term in a corpus.

    IDF(term) = log(N / df(term)), where N is the number of documents and
    df(term) is the number of documents that contain the term at least once.

    Args:
        documents: Sequence of document strings. Each is lowercased and
            split into runs of word characters.

    Returns:
        Dict mapping term -> IDF, in first-seen order. A term present in
        every document gets 0.0; values are never negative. Empty when
        ``documents`` is empty.
    """
    total_documents = len(documents)
    document_frequency = {}
    for doc in documents:
        words = re.findall(r'\b\w+\b', doc.lower())
        # Count each term once per document. Counting every occurrence would
        # let the count exceed total_documents and make the IDF negative.
        # dict.fromkeys de-duplicates while keeping first-seen order.
        for word in dict.fromkeys(words):
            document_frequency[word] = document_frequency.get(word, 0) + 1

    return {
        word: math.log(total_documents / count)
        for word, count in document_frequency.items()
    }

In [None]:
import os
from zipfile import ZipFile

# Extract 'stories.zip' (expected in the current working directory) and load
# every .txt story that sits directly inside the extraction folder.
stories_folder = 'stories.zip'
extracted_folder = 'extracted_stories'
story_texts = []  # cleaned text (output of clean_text), one entry per story
story_og = []     # original text, index-aligned with story_texts

# Extract the contents of the zip file
with ZipFile(stories_folder, 'r') as zip_ref:
    zip_ref.extractall(extracted_folder)

# os.listdir() returns entries in arbitrary order; sorting keeps each story at
# the same list index on every run and on every machine.
for file in sorted(os.listdir(extracted_folder)):
    if file.endswith(".txt"):
        with open(os.path.join(extracted_folder, file), 'r', encoding='ISO-8859-1') as f:
            story_text = f.read()
        # The file is already closed here; cleaning does not need it open.
        story_og.append(story_text)
        cleaned_story = clean_text(story_text)
        story_texts.append(cleaned_story)

# Preview the first cleaned story. An empty list means no .txt file sits
# directly in the extraction folder (for example, the archive nests them in a
# subfolder), so report that instead of failing with an IndexError.
if story_texts:
    print(story_texts[0])
else:
    print(f"No .txt files found directly under {extracted_folder!r}")

In [None]:
# Load every .txt story from the extracted 'stories' subfolder, keeping both
# the original text and its cleaned form in two index-aligned lists.
stories_folder = 'extracted_stories/stories'
story_texts = []  # cleaned text (output of clean_text), one entry per story
story_og = []     # original text, index-aligned with story_texts

# os.listdir() returns entries in arbitrary order; sorting keeps each story at
# the same list index on every run, so index-based lookups stay reproducible.
for file in sorted(os.listdir(stories_folder)):
    if file.endswith(".txt"):
        with open(os.path.join(stories_folder, file), 'r', encoding='ISO-8859-1') as f:
            story_text = f.read()
        # The file is already closed here; cleaning does not need it open.
        story_og.append(story_text)
        cleaned_story = clean_text(story_text)
        story_texts.append(cleaned_story)

# Preview the first cleaned story; report an empty folder instead of failing
# with an IndexError.
if story_texts:
    print(story_texts[0])
else:
    print(f"No .txt files found in {stories_folder!r}")

[{'s': 0.047086991221069435, 'a': 0.08200319233838788, 'y': 0.018355945730247406, ' ': 0.17657621707901036, 'd': 0.029329608938547486, 'v': 0.014365522745411013, 'e': 0.08659217877094973, 'n': 0.0528731045490822, 't': 0.05407023144453312, 'u': 0.02274541101356744, 'r': 0.0548683160415004, 'p': 0.017158818834796488, 'o': 0.04868316041500399, 'i': 0.06344772545889864, 'm': 0.034118116520351155, 'l': 0.0456903431763767, 'c': 0.02773343974461293, 'b': 0.011771747805267359, 'z': 0.003391859537110934, 'w': 0.02114924181963288, 'g': 0.02274541101356744, 'h': 0.0355147645650439, 'x': 0.0013966480446927375, 'f': 0.012370311252992818, 'k': 0.014365522745411013, 'j': 0.0013966480446927375, 'q': 0.00019952114924181964}, {'a': 0.06766917293233082, 'i': 0.05545112781954887, 's': 0.05404135338345865, 'l': 0.051221804511278196, ' ': 0.17058270676691728, 'x': 0.0042293233082706765, 'm': 0.023496240601503758, 'o': 0.07612781954887218, 'r': 0.0587406015037594, 'p': 0.024906015037593984, 'h': 0.0319548872

In [None]:
# Term frequencies: one dict per story, in the same order as story_texts
tf_values = list(map(calculate_tf, story_texts))
print(tf_values)

# Inverse document frequencies over the whole collection
idf_values = calculate_idf(story_texts)

# TF-IDF: scale each story's term frequencies by the term's IDF.
# A term with no IDF entry is given weight 0.
tfidf_values = [
    {word: tf_val * idf_values.get(word, 0) for word, tf_val in tf.items()}
    for tf in tf_values
]


In [None]:
# Display the term -> IDF mapping returned by calculate_idf(story_texts).
idf_values

In [None]:
# Display the per-story term-frequency dicts (one dict per story).
tf_values

In [None]:
# Display the per-story TF-IDF dicts (term frequency times IDF for each term).
tfidf_values

In [None]:
# Vocabulary: every term that appears in at least one story's TF-IDF dict.
features = set(word for doc in tfidf_values for word in doc)

# The iteration order of a set of strings can change between interpreter
# sessions (string hash randomization), so the matrix columns would move
# around from run to run. Sort once and use that order for rows and header.
feature_names = sorted(features)

# Print the TF-IDF matrix: one row per story, one column per vocabulary term;
# a term missing from a story contributes 0.
print("TF-IDF Matrix:")
for i, tfidf in enumerate(tfidf_values, start=1):
    row = [tfidf.get(word, 0) for word in feature_names]
    print(f"Document {i}: {row}")

# Print feature names (words) in the same column order as the rows above
print("\nFeature Names:")
print(feature_names)

In [None]:
# Step 3: Ranking using Cosine Similarity
def cosine_similarity(tfidf1, tfidf2):
    """Cosine similarity between two sparse TF-IDF vectors.

    Args:
        tfidf1: Dict mapping term -> weight for the first document.
        tfidf2: Dict mapping term -> weight for the second document.

    Returns:
        The dot product divided by the product of the two Euclidean norms.
        Returns 0.0 when either vector is empty or all-zero, since the
        cosine is undefined there and the vectors share no weighted terms.
    """
    magnitude1 = math.sqrt(sum(val ** 2 for val in tfidf1.values()))
    magnitude2 = math.sqrt(sum(val ** 2 for val in tfidf2.values()))
    denominator = magnitude1 * magnitude2
    if denominator == 0:
        # An empty document, or one whose terms all have IDF 0, has zero
        # norm; avoid the ZeroDivisionError and report "no similarity".
        return 0.0

    # Only terms present in both vectors contribute to the dot product, so
    # walking one dict and looking up the other is enough.
    dot_product = sum(weight * tfidf2.get(word, 0) for word, weight in tfidf1.items())
    return dot_product / denominator

def jaccard_similarity(tfidf1, tfidf2):
    """Jaccard similarity between the term sets of two TF-IDF dicts.

    Only the keys (terms) are compared; the weights are ignored.

    Args:
        tfidf1: Dict mapping term -> weight for the first document.
        tfidf2: Dict mapping term -> weight for the second document.

    Returns:
        Number of shared terms divided by number of distinct terms across
        both documents, or 0 when both documents have no terms.
    """
    terms_a = set(tfidf1)
    terms_b = set(tfidf2)
    all_terms = terms_a | terms_b
    if len(all_terms) == 0:
        return 0
    shared_terms = terms_a & terms_b
    return len(shared_terms) / len(all_terms)

def euclidean_distance(tfidf1, tfidf2):
    """Euclidean distance between two sparse TF-IDF vectors.

    A term missing from one of the dicts is treated as having weight 0 there.

    Args:
        tfidf1: Dict mapping term -> weight for the first document.
        tfidf2: Dict mapping term -> weight for the second document.

    Returns:
        Square root of the summed squared per-term weight differences.
    """
    terms = set(tfidf1) | set(tfidf2)
    squared_gaps = [(tfidf1.get(term, 0) - tfidf2.get(term, 0)) ** 2 for term in terms]
    return math.sqrt(sum(squared_gaps))

# Example: score every other story against one chosen story. The score used
# here is jaccard_similarity, i.e. overlap of the stories' term sets.
input_story_index = 40  # Change this to calculate for different stories
similarities = [
    (i, jaccard_similarity(tfidf_values[input_story_index], tfidf))
    for i, tfidf in enumerate(tfidf_values)
    if i != input_story_index
]

# Highest score first, then keep the three best matches
similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)
top_similar_stories = similarities[:3]

# Print top similar stories with their similarity scores
for index, similarity_score in top_similar_stories:
    print(f"Similarity Score: {similarity_score}")
    #print(f"Similar Story: {story_og[index]}\n")


Similarity Score: 0.9916783172290046
Similarity Score: 0.9857054080410017
Similarity Score: 0.9766441806514474
