# EDAN20 Laboration 1 - 12/9
### Author: Sepehr Tayari

The objective of the laboration is to create a program which reads one file, 'file_name.txt' and outputs an index file 'file_name.idx'.

In addition to this, the program should also create different dictionaries. One dictionary which saves words as keys, and list of index position of each word as value. A master index shall also be created, which has words as key, and another dictionary as value. The second dictionary should have file name as key and list of word position as value. The master indexer should represent data of in which text files a certain word appears in.

We start with importing all modules we need, and creating the dictionaries which will be used

In [19]:
import regex as re
import pickle
import os
import math
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

word_dict = dict()
book_dict = dict()
master_dict = dict()
corpus = dict()
word_count = dict()


A data set of all words are saved in dictionary corpus. The dict is built, and the text is cleaned away from characters which are not needed.

In [20]:
def text_cleaner(text):
    """
    :param str text: File to be cleaned up. The function will remove all non letter characters and make all letters
    lower case
    :return str text: the new text file
    """
    # Removes all new lines and replaces with spaces.
    text = re.sub("\n", " ", text)
    # Replace non-letters with nothing
    text = re.sub('[^a-zöäåA-ZÖÄÅ\s]', '', text)
    # Make all words lower case
    text = text.lower()
    return text

In [21]:
def build_dict(text):
    """
    Adds words to global dictionary 'corpus'
    :param str text: all words in the string text will be added to a dictionary. String text must be letters only and
    all lower case.
    """
    global corpus
    text = text_cleaner(text)
    for word in text.split(' '):
        corpus[word] = []

In order to read in our needed files we use the following method.

In [22]:
def get_files(dir, suffix):
    """
    Returns all the files in a folder ending with suffix
    :param dir: The directory of the files
    :param suffix: The suffix the files end in. Example '.txt'.
    :return: the list of file names
    """
    files = []
    for file in os.listdir(dir):
        if file.endswith(suffix):
            files.append(file)
    return files

To index all the files, the method word_indexer is used, which takes one file as an argument.

In [23]:
def word_indexer(text):
    """
    Indexes all the words in the text, and appends them to the global dictionay word_dict.
    :param str text: the string which should be indexed.
    """
    # Match every word in string text.
    for match in re.finditer(r'\p{L}+', text.lower()):
        match_word = match.group()
        # If the word exist in the dict, append the new index.
        if match_word in word_dict.keys():
            word_dict[match_word].append(match.start(0))
        # If the word does not yet exist as a key. Add it with its index for the first time, as a list.
        else:
            word_dict[match_word] = [match.start(0)]
    return word_dict

To index all words to their corresponding file, in the master index, following method is used

In [24]:
def master_indexer():
    """
    Indexes all words to which textfile they occur in, and at which index.
    {word: {file_name: [start0, start1, ...]}} to file master_dict.
    :return:
    """
    global master_dict
    global corpus
    for word in corpus:
        master_dict[word] = {}
        for file_name in files:
            try:
                master_dict[word][file_name] = book_dict[file_name][word]
            except:
                nop = 1

Now we can read in all the files.

In [25]:
files = get_files('Selma2', 'txt')

In [None]:
for file_name in files:

    fil = open('Selma2/'+file_name, 'r')
    fil = fil.read()

    # We also do a word count for each file. This will be needed later in
    # the tf-idf
    word_count[file_name]= len(fil.split(" "))


    word_dict = word_indexer(fil)
    build_dict(fil)
    # pickle.dump(word_dict, open('{}.idx'.format(file_name), 'wb'))
    # Building master index:
    book_dict[file_name] = word_dict

    # Reset the word_dict so it can store new text file
    word_dict = dict()

Example as seen on the course website. Start index of word 'gjord' in file 'bannlyst.txt'

In [27]:
book_dict['bannlyst.txt']['gjord']

[8551, 183692, 220875]

We create the master index, and look for the word 'samlar'. We get the same start index as given on the course webpage.

In [31]:
master_indexer()
master_dict['samlar']

{'nils.txt': [53499, 120336],
 'osynliga.txt': [410995, 871322],
 'gosta.txt': [317119, 414300, 543686]}

# Representing Documents with tf-idf

By representing the documents with tf-idf, and look at cosinus similarities we can determine how similar the different texts are, with no regard of the order of the words. 

In [32]:
def tf_idf(files):
    """
    Calculates the tf idf for the gives files.
    :param files:
    :return: dict tf_dict:
    """
    n = 9
    global word_count
    tf_dict = dict()
    df = 0
    for file_name in files:  # In each file, how many times do the word occur
        tf_dict[file_name] = dict()
        for word in master_dict:
            df = len(master_dict[word]) # In how many stories does the word occur
            try:
                tf = len(master_dict[word][file_name])
            except:
                nop = 1
            if df != 0:
                weigh = math.log10(9/df)*tf/word_count[file_name]
                tf_dict[file_name][word] = weigh
                df = 0
                tf = 0
    return tf_dict

We run our files through the tf_idf function.

In [34]:
tf_dict = tf_idf(files)

We now got the tf-idf values. And wish to create a matrix with the cosinus similarity. 

In [45]:
def build_cos_sim_matrix(tf_dict):
    """

    :param tf_dict: dictionary with {file_name: {word : tf-value}}
    :return:
    """
    doc_matrix = np.zeros((9,len(corpus.keys())))
    word_list = corpus.keys()
    file_list = dict_dict.keys()
    for i, word in enumerate(word_list):
        for j, file in enumerate(file_list):
            try:
                doc_matrix[j, i] =tf_dict[file][word]# tf_dict[file][word]
                #print(doc_matrix[j, i], 'hej', tf_dict[file][word])
            except:
                #print('didnt work')
                nop = 1
    df = pd.DataFrame(doc_matrix)
    similarity_matrix = cosine_similarity(df)
    np.around(similarity_matrix, 2)
    temp_matrix = similarity_matrix
    

    for i, row in enumerate(temp_matrix):
        row[i] = 0
    #print(temp_matrix)
    max_tal = np.amax(temp_matrix)
    print(max_tal)
    result = np.where(temp_matrix == np.amax(temp_matrix))
    print(result)
    for key in dict_dict:
        print(key)

In [46]:
build_cos_sim_matrix(tf_dict)

[[0.         0.08838273 0.02484158 0.00399036 0.01940174 0.02829534
  0.00705616 0.00734371 0.03259857]
 [0.08838273 0.         0.04622003 0.00089369 0.00437859 0.0055027
  0.00206008 0.00214647 0.00653284]
 [0.02484158 0.04622003 0.         0.01232804 0.04236505 0.04437697
  0.01359971 0.00536902 0.0301599 ]
 [0.00399036 0.00089369 0.01232804 0.         0.0141521  0.02126363
  0.00762255 0.00129912 0.01517731]
 [0.01940174 0.00437859 0.04236505 0.0141521  0.         0.03213535
  0.01309413 0.00534542 0.02506581]
 [0.02829534 0.0055027  0.04437697 0.02126363 0.03213535 0.
  0.04185523 0.0059857  0.05585791]
 [0.00705616 0.00206008 0.01359971 0.00762255 0.01309413 0.04185523
  0.         0.00731916 0.00855811]
 [0.00734371 0.00214647 0.00536902 0.00129912 0.00534542 0.0059857
  0.00731916 0.         0.00524188]
 [0.03259857 0.00653284 0.0301599  0.01517731 0.02506581 0.05585791
  0.00855811 0.00524188 0.        ]]
0.08838272929311124
(array([0, 1]), array([1, 0]))
troll.txt
kejsaren.txt