Dataset: https://www.kaggle.com/datasets/annbengardt/fairy-tales-from-around-the-world

In [1]:
import pandas as pd
import os
from sklearn.feature_extraction.text import TfidfVectorizer
import json
import math

In [2]:
# Load blood cell dataset from Kaggle to Google Colab
!pip install kaggle

# Upload Kaggle API key (kaggle.json)
from google.colab import files
uploaded = files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



Saving kaggle.json to kaggle.json


In [3]:
!kaggle datasets download -d annbengardt/fairy-tales-from-around-the-world
!unzip fairy-tales-from-around-the-world.zip

Dataset URL: https://www.kaggle.com/datasets/annbengardt/fairy-tales-from-around-the-world
License(s): unknown
Downloading fairy-tales-from-around-the-world.zip to /content
 72% 5.00M/6.95M [00:00<00:00, 40.5MB/s]
100% 6.95M/6.95M [00:00<00:00, 52.1MB/s]
Archive:  fairy-tales-from-around-the-world.zip
  inflating: fairy_tales/1.txt       
  inflating: fairy_tales/10.txt      
  inflating: fairy_tales/100.txt     
  inflating: fairy_tales/1000.txt    
  inflating: fairy_tales/1001.txt    
  inflating: fairy_tales/1002.txt    
  inflating: fairy_tales/1003.txt    
  inflating: fairy_tales/1004.txt    
  inflating: fairy_tales/1005.txt    
  inflating: fairy_tales/1006.txt    
  inflating: fairy_tales/1007.txt    
  inflating: fairy_tales/1008.txt    
  inflating: fairy_tales/1009.txt    
  inflating: fairy_tales/101.txt     
  inflating: fairy_tales/1010.txt    
  inflating: fairy_tales/1011.txt    
  inflating: fairy_tales/1012.txt    
  inflating: fairy_tales/1013.txt    
  inflating: 

In [5]:
folder_path = '/content/fairy_tales'
file_names = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
texts = [] # merge txts into a single corpus
for file_name in file_names:
  with open(os.path.join(folder_path, file_name), 'r', encoding='ISO-8859-1') as file:
    texts.append(file.read())

print(texts[0][:1000])
print(texts[1][:1000])
print(len(texts))

   There was once a Fox and a Wolf, who set up house together in a cave near the sea-shore. Although you may not think so, they got on very well for a time, for they went out hunting all day, and when they came back at night they were generally too tired to do anything but to eat their supper and go to bed.  They might have lived together always had it not been for the slyness and greediness of the Fox, who tried to over-reach his companion, who was not nearly so clever as he was.  And this was how it came about.  It chanced, one dark December night, that there was a dreadful storm at sea, and in the morning the beach was all strewn with wreckage. So as soon as it was daylight the two friends went down to the shore to see if they could find anything to eat.  They had the good fortune to light on a great Keg of Butter, which had been washed overboard from some ship on its way home from Ireland, where, as all the world knows, folk are famous for their butter.  The simple Wolf danced with

# Determine frequencies of adjacent words

In [7]:
from collections import defaultdict
import re
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
def tokenize(sentence):
    sentence = sentence.lower()
    tokenized = word_tokenize(sentence.translate(str.maketrans('', '', string.punctuation)))
    return [lemmatizer.lemmatize(token) for token in tokenized if token not in en_stopwords and wordnet.synsets(token)]

In [13]:
lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
nltk.download('wordnet')
en_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
adj_dict = defaultdict(lambda: defaultdict(int))

for story in texts:
  # story = re.sub(r'[^\w\s]', '', story)
  # print(story)
  story = tokenize(story)
  for i, word in enumerate(story):
    # word = word.lower()
    word = lemmatizer.lemmatize(word)
    if i > 0:
      prev_word = story[i-1].lower()
      adj_dict[prev_word][word] += 1
    if i < len(story) - 1:
      next_word = story[i+1].lower()
      adj_dict[word][next_word] += 1

adj_dict = dict(adj_dict)
with open('adj_dict.json', 'w') as f:
  json.dump(adj_dict, f, indent=4)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Calculate all word frequencies in corpus

In [None]:
from collections import Counter
def calculate_word_frequencies(texts):
    word_frequencies = Counter()
    for text in texts:
        words = tokenize(text)
        word_frequencies.update(words)
    word_frequencies = {word: freq for word, freq in word_frequencies.items() if freq >= 5}
    return word_frequencies

word_frequencies = calculate_word_frequencies(texts)
with open('word_frequencies.json', 'w') as f:
  json.dump(word_frequencies, f, indent=4)

# Calculate TF-IDF for KQMW

### TF-IDF Formula:

TD-IDF (word, adjacent) = TF(word, adjacent) x IDF(adjacent)

TF = freq of adjacent word / total count of word

IDF = log (total txt files / freq of adj word in corpus)

In [10]:
with open('/content/adj_dict.json', 'r') as f:
    adj_frequencies = json.load(f) # {word: {adjacent_word: frequency, ...}, ...}

with open('/content/word_frequencies.json', 'r') as f:
    word_frequencies = json.load(f) # {word: frequency}

FileNotFoundError: [Errno 2] No such file or directory: '/content/adj_dict.json'

## Just calculates the TF-IDF values

In [None]:
def calculate_tfidfs(adj_freq, word_freq):
  tf_idf_dict = {}
  for word, adjacent_freq in adj_freq.items():
    tf_idf_dict[word] = {}
    total_word_count = word_freq.get(word, 0)
    for adjacent_word, frequency in adjacent_freq.items():
      # Calculate TF
      tf = 0
      if total_word_count is not None and total_word_count > 0:
        tf = frequency / total_word_count

      # Calculate IDF
      total_adjacent_word_count = word_freq.get(adjacent_word, 0)
      idf = 0
      if total_adjacent_word_count is not None and total_adjacent_word_count > 0:
        idf = math.log(len(texts) / 1 + total_adjacent_word_count)
      else:
        idf = 0

      # TF-IDF Value
      tf_idf_dict[word][adjacent_word] = tf * idf

  with open('tf_idf_dict.json', 'w') as f:
        json.dump(tf_idf_dict, f, indent=4)

calculate_tfidfs(adj_frequencies, word_frequencies)
print("Saved TF-IDF scores in tf_idf_dict.json")

## Calculate both TF-IDF values and Bloom Filter indicies

In [11]:
!pip install mmh3
import mmh3

def hash_digests(token, bits):
    return [mmh3.hash(token, i) % bits for i in range(3)]

Collecting mmh3
  Downloading mmh3-5.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Downloading mmh3-5.0.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (93 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.2/93.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: mmh3
Successfully installed mmh3-5.0.1


In [None]:
def calculate_all(adj_freq, word_freq):
  word_dict = {}
  for word, adjacent_freq in adj_freq.items():
    word_dict[word] = {}
    total_word_count = word_freq.get(word, 0)
    for adjacent_word, frequency in adjacent_freq.items():
      # Calculate TF
      tf = 0
      if total_word_count is not None and total_word_count > 0:
        tf = frequency / total_word_count

      # Calculate IDF
      total_adjacent_word_count = word_freq.get(adjacent_word, 0)
      idf = 0
      if total_adjacent_word_count is not None and total_adjacent_word_count > 0:
        idf = math.log(len(texts) / 1 + total_adjacent_word_count)
      else:
        idf = 0

      # TF-IDF Value
      tf_idf_value = tf * idf
      bloom_filter_indices = hash_digests(adjacent_word, 32)
      word_dict[word][adjacent_word] = {
          'tf-idf': tf_idf_value,
          'bloom_filter': bloom_filter_indices
      }

  with open('word_dict.json', 'w') as f:
        json.dump(word_dict, f, indent=4)

calculate_all(adj_frequencies, word_frequencies)
print("Saved TF-IDF and Bloom Filters in word_dict.json")

Saved TF-IDF and Bloom Filters in word_dict.json


# Output files for +4 and -4 words around QKMW

In [17]:
keywords = ["queen", "king", "man", "woman"]

folder_path = '/content/fairy_tales'
# file_names = [f for f in os.listdir(folder_path) if f.endswith('.txt')]
# texts = [] # merge txts into a single corpus
# for file_name in file_names:
#   with open(os.path.join(folder_path, file_name), 'r', encoding='ISO-8859-1') as file:
#     texts.append(file.read())

def extract_sentences(text, keyword, window=4):
    sentences = []
    words = tokenize(text)
    for i, word in enumerate(words):
        if word.lower() == keyword.lower():
            start = max(0, i - window)
            end = min(len(words), i + window + 1)
            context = ' '.join(words[start:end])
            sentences.append(context)
    return sentences

def output_to_json(data, keyword):
    filename = f'{keyword}_sentences.json'
    with open(filename, 'w') as json_file:
        json.dump(data, json_file, indent=4)

def process_dataset_file(filepath, keywords, sent_dict):
    with open(filepath, 'r', encoding='ISO-8859-1') as file:
        text = file.read()
    filename = os.path.basename(filepath)
    for keyword in keywords:
        if keyword not in sent_dict:
            sent_dict[keyword] = {}
        sent_dict[keyword][filename] = extract_sentences(text, keyword)

sent_dict = {}

for file in os.listdir(folder_path):
    if file.endswith('.txt'):
        filepath = os.path.join(folder_path, file)
        process_dataset_file(filepath, keywords, sent_dict)

for keyword in keywords:
    output_to_json(sent_dict[keyword], keyword)