In [1]:
# For simulation time calculation

import timeit

start = timeit.default_timer()

In [2]:
# "Query" Search  within the given Dataset i.e. Stories

# Natural Language Toolkit, or more commonly NLTK, is a suite of libraries and programs for symbolic and 
# statistical natural language processing.

import nltk 
#-------------------------------------------------------------------------------

# OS module provides functions for interacting with the operating system. 

import os
#----------------------------------------------------------------------------------

# pandas for data manipulation and analysis.

import pandas as pd
#----------------------------------------------------------------------------------

# A RegEx, or Regular Expression, is a sequence of characters that forms a search pattern.
# RegEx can be used to check if a string contains the specified search pattern. 

import re
#------------------------------------------------------------------------------------

# Mathematical functions
# This module provides access to the mathematical functions.

import math
#------------------------------------------------------------------------------------

# NumPy is a Python package which stands for ‘Numerical Python’. 
#It is the core library for scientific computing, which contains a powerful n-dimensional array object, 
#provide tools for integrating C, C++ etc. It is also useful in linear algebra, random number capability etc.

import numpy as np
#-----------------------------------------------------------------------------------

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

In [3]:
# 1.  Taking all folders : loaded as... (in stories)
# 2.  Python method walk() generates the file names in a directory tree 
#     by walking the tree either top-down or bottom-up.

# folders[0]
# folders[1]
# folders[2]

# List comprehensions provide a concise way to create lists.
# For More details visit: https://www.pythonforbeginners.com/basics/list-comprehensions-in-python

folders = [x[0] for x in os.walk(str(os.getcwd())+'/'+"stories"+'/')]
folders

['C:\\Users\\sumit/stories/',
 'C:\\Users\\sumit/stories/FARNON',
 'C:\\Users\\sumit/stories/SRE']

In [4]:
# we have code to retrieve the values from index, 
# we just need to iterate to all the folders and 
# get the title and file name from all the index.html files.

# 1. Help: file.read() reads entire file's contents, unless you specify max length.
# 2.  (Help: The strip() method removes any leading (spaces at the beginning) and 
#    trailing (spaces at the end) characters (space is the default leading character to remove))
# 3. Re.findall() module is used when you want to iterate over the lines of the file, 
#    it will return a list of all the matches in a single step. 

dataset = [] # start an empty list
c = False
for i in folders:
    file = open(i+   "/index.html", 'r')
    text = file.read().strip() 
    file.close()
    file_name = re.findall('><A HREF="(.*)">', text)
    file_title = re.findall('<BR><TD> (.*)\n', text)
    if c == False:
        file_name = file_name[2:]
        c = True
    print(len(file_name), len(file_title))

    for j in range(len(file_name)):
        dataset.append((str(i) +"/"+ str(file_name[j]), file_title[j]))

452 452
0 0
15 15


In [5]:
file_name

['sre01.txt',
 'sre02.txt',
 'sre03.txt',
 'sre04.txt',
 'sre05.txt',
 'sre06.txt',
 'sre07.txt',
 'sre08.txt',
 'sre09.txt',
 'sre10.txt',
 'sre_feqh.txt',
 'sre_finl.txt',
 'sre_sei.txt',
 'sretrade.txt',
 'srex.txt']

In [6]:
# Random view inside the dataset
dataset[400][1]

'The Dark Sucker, by Quantum Mechanic'

In [7]:
# View dataset as fie_name and file_title
dataset

[('C:\\Users\\sumit/stories//100west.txt',
  'Going 100 West by 53 North by Jim Prentice (1990)'),
 ('C:\\Users\\sumit/stories//13chil.txt', 'The Story of the Sly Fox'),
 ('C:\\Users\\sumit/stories//14.lws', 'A Smart Bomb with a Language Parser'),
 ('C:\\Users\\sumit/stories//16.lws', 'Two Guys in a Garage, by M. Pshota'),
 ('C:\\Users\\sumit/stories//17.lws',
  'The Early Days of a High-Tech Start-up are Magic (November 18, 1991) by M. Peshota'),
 ('C:\\Users\\sumit/stories//18.lws',
  'The Couch, the File Cabinet, and the Calendar, by M. Peshota (December 9, 1991)'),
 ('C:\\Users\\sumit/stories//19.lws',
  'Engineering the Future of American Technology by M. Peshota (January 5, 1992)'),
 ('C:\\Users\\sumit/stories//20.lws',
  'What Research and Development Was Always Meant to Be, by M. Peshota'),
 ('C:\\Users\\sumit/stories//3gables.txt',
  'The Adventure of the Three Gables'),
 ('C:\\Users\\sumit/stories//3lpigs.txt', 'The Story of the 3 Little Pigs'),
 ('C:\\Users\\sumit/stories//3

In [8]:
# Calculate the length of the dataset
len(dataset)

467

In [9]:
N = len (dataset)

In [10]:
def convert_lower_case(data):
    return np.char.lower(data)

In [11]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [12]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [13]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [14]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [15]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [16]:
# The process of converting data to something a computer can understand is referred to as pre-processing.
# Preprocessing

def preprocess(data):
    data = convert_lower_case(data)
    data = remove_stop_words(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = stemming(data)
    data = convert_numbers(data)
    return data

In [17]:
# Extracting Data
processed_text = []
processed_title = []

for i in dataset[:N]:
    file = open(i[0], 'r', encoding="utf8", errors='ignore')
    text = file.read().strip()
    file.close()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i[1]))))

In [18]:
# Calculating DF for all words
DF = {}

for i in range(N):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

    tokens = processed_title[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}
for i in DF:
    DF[i] = len(DF[i])

In [19]:
len(DF)

32533

In [20]:
# DF
total_vocab_size = len(DF)

In [21]:
total_vocab_size

32533

In [22]:
# List comprehensions provide a concise way to create lists.
# For More details visit: https://www.pythonforbeginners.com/basics/list-comprehensions-in-python

total_vocab = [x for x in DF]

In [23]:
print(total_vocab[:])



In [24]:
def doc_freq(word):
    c = 0
    try:
        c = DF[word]
    except:
        pass
    return c

In [25]:
# Calculating TF-IDF for body, we will consider this as the actual tf-idf as we will add the title weight to this.
doc = 0

tf_idf = {}

for i in range(N):
    
    tokens = processed_text[i]
    
    counter = Counter(tokens + processed_title[i])
    words_count = len(tokens + processed_title[i])
    
    for token in np.unique(tokens):
        
        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = np.log((N+1)/(df+1))
        
        tf_idf[doc, token] = tf*idf

    doc += 1

In [26]:
tf_idf

{(0, ','): 0.0008585159210279618,
 (0, '4x4'): 0.0030391761088343744,
 (0, 'a'): 0.0008653751787092239,
 (0, 'abl'): 0.0005200738374978614,
 (0, 'abound'): 0.0018150955643573717,
 (0, 'absenc'): 0.0014136770937456392,
 (0, 'access'): 0.0012240805444770025,
 (0, 'accustom'): 0.0030610260883069543,
 (0, 'activ'): 0.0018246337486359852,
 (0, 'acut'): 0.0019166674622927225,
 (0, 'ad'): 0.0007723088362784906,
 (0, 'adapt'): 0.001632084942088828,
 (0, 'adjust'): 0.001268672582178137,
 (0, 'adult'): 0.0015305130441534772,
 (0, 'advanc'): 0.0009635808847471024,
 (0, 'adventuresom'): 0.0030391761088343744,
 (0, 'afford'): 0.0011729342774836195,
 (0, 'ago'): 0.0005674482345373667,
 (0, 'aid'): 0.0010351024260552956,
 (0, 'air'): 0.0009528283801595035,
 (0, 'aircraft'): 0.026414999789959405,
 (0, 'airplan'): 0.0022012499824966172,
 (0, 'airport'): 0.0071398981988884825,
 (0, 'airstrip'): 0.0030391761088343744,
 (0, 'allow'): 0.0007093431602877412,
 (0, 'alright'): 0.0015122457333885796,
 (0, 'als

In [27]:
len(tf_idf)

347644

In [28]:
# TF-IDF Matching Score Ranking

# Note: put "reverse = True" to make it sort highest to lowest 
# Note: #mylist = [[7, 8], [4, 2, 3], [9, 5, 6]]
# list(map(lambda x: x[1], mylist)) returns [8, 2 ,5]
#mylistSort = sorted(mylist, key = lambda x: x[1], reverse=True)
# will sort the nested list based on the result of the lambda function 

def matching_score(k, query):
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("Matching Score")
    print("\nQuery:", query)
    print("")
    print(tokens)
    
    query_weights = {}

    for key in tf_idf:
        
        if key[1] in tokens:
            try:
                query_weights[key[0]] = query_weights[key[0]] + tf_idf[key]
            except:
                query_weights[key[0]] = tf_idf[key]
    
    query_weights = sorted(query_weights.items(), key=lambda x: x[1], reverse=True)
    
    return query_weights[:10]
    print("")

In [29]:
# Document print by assigned id - used in end of the program

def print_doc(id):
    print(dataset[id])
    file = open(dataset[id][0], 'r', encoding='cp1250')
    text = file.read().strip()
    file.close()
    print(text)

In [35]:
query = "what is capital of india"
result = matching_score(10, query)
file_score = []
result_file_name = [(dataset[a][1], b) for (a, b) in result]

Matching Score

Query: what is capital of india

['capit', 'india']


In [36]:
result_file_name

[('The Spirit and the Flute (How Flutes Relate to History)',
  0.009844832248505797),
 ('Introduction to the Neolithic Great Goddess, by Ben Blumenberg (February 12, 1992)',
  0.009568808914435541),
 ("The World's Fastest Spider, by Andrew Varga (1991)", 0.008338007184656698),
 ('FICTION: Yahoo! The Ohio VOW has The Bomb by Gregory S. Swann (1986)',
  0.0049611559621561),
 ('The Story of Hop Frog', 0.0043170703125886235),
 ('Branded, by John R. Hillman, Jr.', 0.004189208779367098),
 ('The Story of the Six Able Men', 0.004007913809008088),
 ('The Empress Jowka', 0.0037257474632589557),
 ('The SRE Commerce and Trade Theories, by Josh Renaud', 0.003185651393646478),
 ('The Early Days of a High-Tech Start-up are Magic (November 18, 1991) by M. Peshota',
  0.0026379656504865036)]

In [32]:
result

[(182, 0.00960626668079936),
 (199, 0.004611008006783693),
 (4, 0.0037526006158972065),
 (307, 0.0036704541347531883),
 (439, 0.003666805571994984),
 (264, 0.002665322547273811),
 (303, 0.002632981017435371),
 (116, 0.0024857186020397267),
 (309, 0.0023170894506450716),
 (398, 0.0021957180984684254)]

In [33]:
# For simulation time calculation

stop = timeit.default_timer()

print('Time (seconds): ', stop - start) 

Time (seconds):  220.4733851


In [34]:
print_doc(117)

('C:\\Users\\sumit/stories//contrad1.hum', 'Contradiction 1, by Rick Brunet')
Contradiction 1
by Rick Brunet


There once was a man.  He was a good man.

In his middle years, the man was considered by all who knew him Friend.  He had
no enemies, no evil inside him, no "debts to society".  Only good came from
him, and only good came back to him.  A full circle of light.

The man had a woman, a woman he had known half his life, who loved and
cherished him, and that he loved and cherished in kind.  He was her knight in
shining armour, and she was his his Guinivere.	Together they shared a
happiness, a contentment in their own little Garden of Eden.

All was good in this man's life.  This man died at the ripe old age of
forty-nine.

Did God "call him home"?  Was this man's purity and shining strength so solid
that it could no longer be kept apart from His Paradise?  Did the very
"foundation of heaven" scream for this man's Soul?  No.

You see, he had a particular fondness for eggs as his br