In [2]:
from string import punctuation
from stemming.porter2 import stem
import csv
import re
import math
from os import listdir
from os.path import isfile, join
from collections import Counter

In [3]:
def loadWords(fname):
    with open(fname, encoding="UTF-8") as file:
        return [word for line in file for word in line.split()]

stopWords = loadWords("stopwords_en.txt")
def filterWords(words):
    words = filter(lambda word: len(word) > 0, [transform(word) for word in words])
    words = difference(words, stopWords)
    return words

In [4]:
def difference(words, stopWords):
    return [word for word in words if word not in stopWords]

In [5]:
def transform(word):
    return stem(word.lower().translate(str.maketrans('', '', punctuation)))

In [6]:
def countOccurences(words):
    occurences = { word : 1 for word in words }
    for word in words:
        occurences[word] += 1
    return occurences

In [7]:
def writeCSVToFile(data, fname):
    with open(fname,'w') as out:
        csv_out=csv.writer(out)
        csv_out.writerow(['weight','word'])
        csv_out.writerows(data)
        
def writeToFile(fname, data):
    with open(fname, 'w') as out:
        out.write(data)

In [8]:
def divideIntoChapters(fname):
    pattern = r"####-\s*\n*(Chapter \d+\.?(\s[\w-]+)*)"
    with open(fname, 'r', encoding='UTF-8') as file:
        book = file.read()
        chapters = [term[0] for term in re.findall(pattern, book, re.IGNORECASE)]
        zipped = zip(chapters, re.split(pattern, book)[0::3][1:])
        return { name : content for name, content in zipped}

In [9]:
words = filterWords(loadWords("lotr.txt"))

In [10]:
wordDict = countOccurences(words)
pairs = list(wordDict.items())

In [11]:
pairs.sort(key=lambda pair: pair[1], reverse=True); pairs[:10]

[('frodo', 1992),
 ('great', 1408),
 ('long', 1382),
 ('gandalf', 1305),
 ('sam', 1289),
 ('befor', 1233),
 ('back', 1227),
 ('dark', 1208),
 ('time', 1050),
 ('day', 993)]

In [12]:
def computeTF(wordDict):
    tfDict = {}
    bagOfWordsCount = sum(wordDict.values())
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

In [13]:
def computeIDF(documents):
    N = len(documents)
    allWords = [word for document in documents for word in document.keys()]
    idfDict = dict.fromkeys(allWords, 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

In [14]:
def computeTFIDF(document, idfs):
    tfidf = {}
    for word, val in document.items():
        tfidf[word] = val * idfs[word]
    return tfidf

In [15]:
# chapters = divideIntoChapters("lotr.txt");
# for name, content in chapters.items():
#     writeToFile("chapters/" + name + ".txt", content)

In [16]:
mypath = "chapters/"
chapterFileNames = [f for f in listdir(mypath) if isfile(join(mypath, f))];
documents = [countOccurences(filterWords(loadWords(mypath + fname))) for fname in chapterFileNames]

In [17]:
documentsTFs = [computeTF(doc) for doc in documents]
idfs = computeIDF(documents)

In [18]:
documentsTFIDFs = [computeTFIDF(doc, idfs) for doc in documentsTFs]

In [19]:
def tfidfsToData(document):
    return sorted(map(lambda pair: (math.floor(pair[1] * 100000), pair[0]), document.items()), key=lambda pair: pair[0], reverse=True)

In [20]:
for chapterName, document in zip(chapterFileNames, documentsTFIDFs):
    data = tfidfsToData(document)
    writeCSVToFile(data, 'weighted/' + chapterName + '.csv')

In [21]:
def getMerged(documents):
    allPairs = [(word, val) for doc in documentsTFIDFs for word, val in doc.items()]
    tfidf = dict.fromkeys([word for word, val in allPairs], 0)
    for word, val in allPairs:
        tfidf[word] += val
    return tfidf

In [22]:
mergedDocuments = getMerged(documentsTFIDFs)
writeCSVToFile(tfidfsToData(mergedDocuments), 'wordcloud_tfidf.csv')

In [23]:
def findWord(word):
    zipped = dict(zip(chapterFileNames, documentsTFIDFs))
    appearIn = [name for name, stats in zipped.items() if word in stats.keys()]
    return sorted([(name, math.floor(100000 * zipped[name][word])) for name in appearIn], key=lambda pair: pair[1], reverse=True)

In [29]:
findWord("lake")

[('Chapter 4. A Journey in the Dark.txt', 163),
 ('Chapter 2.txt', 121),
 ('Chapter 9. The Great River.txt', 107),
 ('Chapter 7. Homeward Bound.txt', 105),
 ('Chapter 8. The Road to Isengard.txt', 99),
 ('Chapter 10. The Breaking of the Fellowship.txt', 88),
 ('Chapter 1.txt', 70),
 ('Chapter 3. Three is Company.txt', 66),
 ('Chapter 8. Farewell to Lurien.txt', 64),
 ('Chapter 3. The Black Gate is Closed.txt', 64),
 ('Chapter 6. Many Partings.txt', 59),
 ('Chapter 5. The Steward and the King.txt', 58),
 ('Chapter 9. Flotsam and Jetsam.txt', 56),
 ('Chapter 2. The Land of Shadow.txt', 48),
 ('Chapter 10. The Black Gate Opens.txt', 42),
 ('Chapter 5. A Conspiracy Unmasked.txt', 36),
 ('Chapter 3. The Uruk-Hai.txt', 32),
 ('Chapter 2. The Council of Elrond.txt', 29)]

In [25]:
allWords = filterWords(loadWords("lotr.txt"))
def nextToWord(word):
    try:
        indices = [i for i, x in enumerate(allWords) if x == word]
        wordsAfter = [allWords[i+1] for i in indices]
        return list(map(lambda pair: pair[0], sorted(Counter(wordsAfter).items(), key=lambda pair: pair[1], reverse=True)))[:5]
    except:
        return []

In [26]:
next5 = { word : nextToWord(word) for word in set(allWords[:1000]) }

In [27]:
next5

{'short': ['cut', 'time', 'sword', 'black', 'long'],
 'toe': ['neat', 'fall', 'shook', 'fire', 'fight'],
 'paint': ['shield', 'red', 'green', 'week', 'white'],
 'fashion': ['kind', 'hill', 'mutton', 'larg', 'dwarv'],
 'hope': ['find', 'hope', 'escap', 'left', 'ani'],
 'move': ['slowli', 'great', 'sam', 'veri', 'frodo'],
 'hang': ['peg', 'em', 'row', 'abov', 'twilight'],
 'nasti': ['thing', 'littl', 'job', 'end', 'cruel'],
 'tree': ['grew', 'tree', 'stood', 'flower', 'great'],
 'passag': ['led', 'side', 'dwarv', 'mountain', 'lead'],
 'begin': ['feel', 'sing', 'journey', 'understand', 'hope'],
 'point': ['light', 'road', 'left', 'ahead', 'littl'],
 'gasp': ['frodo', 'breath', 'sam', 'bombur', 'clutch'],
 'life': ['hope', 'home', 'mere', 'give', 'sam'],
 'beermug': ['pantri'],
 'saucer': ['extra', 'green'],
 'bathroom': ['cellar', 'mix'],
 'chanc': ['escap', 'chanc', 'make', 'manag', 'onc'],
 'smoker': ['morn', 'wherev', 'short', 'green', 'wind'],
 'belt': ['tree', 'gold', 'round', 'pearl