In [10]:
import nltk

# Download NLTK's 'punkt' package. The log below shows it being unpacked under
# tokenizers\, so it is presumably the data word_tokenize needs later on
# (TODO confirm against the installed NLTK version).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\18432\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [6]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

In [4]:
# Directory holding the story text files. Set the SHERLOCK_STORY_DIR
# environment variable to point somewhere else; the fallback is the original
# machine-specific location. Keep the trailing path separator so the value
# also works when it is used as a plain string prefix for file names.
story_path = os.environ.get(
    "SHERLOCK_STORY_DIR",
    "C:\\Users\\18432\\Downloads\\sherlock\\sherlock\\",
)

def read_all_stories(story_path):
    """Collect the stripped, non-empty lines of every file under story_path.

    The directory tree is traversed with os.walk. Each file is read line by
    line; reading of a file stops at the first line that equals '----------'
    after stripping, and blank lines are skipped.

    Args:
        story_path: Directory that contains the story text files. A trailing
            path separator is accepted but not required.

    Returns:
        A list of stripped, non-empty lines from all files, in the order
        os.walk yields the files.
    """
    txt = []
    for root, _, files in os.walk(story_path):
        for file in files:
            # Join with the directory currently being walked rather than with
            # story_path itself: this resolves files in subdirectories and
            # works whether or not story_path ends with a separator.
            with open(os.path.join(root, file)) as f:
                for line in f:
                    line = line.strip()
                    # The dashed line marks the end of the usable text.
                    if line == '----------':
                        break
                    if line:
                        txt.append(line)
    return txt
        
# Read every story line from disk once; the cleaning cell below consumes
# this list.
stories = read_all_stories(story_path)
print("number of lines = ", len(stories))

number of lines =  215021


In [11]:
def clean_txt(txt):
    """Turn an iterable of text lines into one flat list of lowercase words.

    Every line is lowercased, stripped of the characters matched by the
    punctuation pattern, tokenized with word_tokenize, and reduced to the
    tokens for which str.isalpha() is true.

    Args:
        txt: Iterable of strings (one entry per line of text).

    Returns:
        A single list with the surviving tokens of all lines, in order.
    """
    # NOTE(review): inside this character class `=-\\` is parsed as a range
    # from '=' to '\', so a literal hyphen is NOT part of the removed set
    # (while '[' is). Tokens that still contain a hyphen are then discarded by
    # the isalpha() filter. Kept as-is to preserve the existing output.
    punctuation = re.compile(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]")

    words = []
    for raw_line in txt:
        stripped = punctuation.sub("", raw_line.lower())
        words.extend(token for token in word_tokenize(stripped) if token.isalpha())
    return words

# Flatten the story lines into one list of lowercase, purely alphabetic
# words; this is the token sequence the Markov model is built from.
cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  2332110


In [12]:
def make_markov_model(cleaned_stories, n_gram=2):
    """Build an n-gram Markov chain from a sequence of words.

    A state is n_gram consecutive words joined by single spaces. For every
    position in the sequence, the state starting there is linked to the state
    made of the n_gram words that immediately follow it. Transition counts are
    then normalised to probabilities per source state.

    Args:
        cleaned_stories: Sequence of word strings.
        n_gram: Number of words per state; must be at least 1.

    Returns:
        A dict mapping each source state to a dict of
        {next_state: probability}. The probabilities of each source state sum
        to 1. An input shorter than 2 * n_gram words yields an empty dict.

    Raises:
        ValueError: If n_gram is smaller than 1.
    """
    if n_gram < 1:
        raise ValueError("n_gram must be at least 1")

    markov_model = {}
    # A transition starting at i reads words i .. i + 2*n_gram - 1, so the last
    # valid start is len - 2*n_gram. The previous bound (len - n_gram - 1) was
    # only correct for n_gram == 2: it overran the list for larger n_gram and
    # dropped the final transition for n_gram == 1.
    for i in range(len(cleaned_stories) - 2 * n_gram + 1):
        curr_state = " ".join(cleaned_stories[i:i + n_gram])
        next_state = " ".join(cleaned_stories[i + n_gram:i + 2 * n_gram])
        followers = markov_model.setdefault(curr_state, {})
        followers[next_state] = followers.get(next_state, 0) + 1

    # Convert the raw counts into transition probabilities in place (only
    # values change, so iterating over the keys at the same time is safe).
    for followers in markov_model.values():
        total = sum(followers.values())
        for state in followers:
            followers[state] = followers[state] / total

    return markov_model

In [13]:
# Build the chain with the default n_gram=2, i.e. every state is a pair of
# consecutive words.
markov_model = make_markov_model(cleaned_stories)

In [14]:
# Each key of the model is one distinct source state.
print("number of states = ", len(markov_model))

number of states =  208670


In [15]:
# Show the outgoing transitions of a single state: a dict that maps each
# following two-word state to its probability. Raises KeyError if the state
# 'the game' never occurs in the corpus.
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'your letter': 0.02702702702702703, 'was up': 0.09009009009009009, 'is afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'would have': 0.036036036036036036, 'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'in their': 0.036036036036036036, 'was whist': 0.036036036036036036, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'was afoot': 0.036036036036036036, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.027027027027027

In [16]:
def generate_story(markov_model, limit=100, start='my god'):
    """Generate text by a random walk over the Markov model.

    Beginning at `start`, the next state is repeatedly drawn with
    random.choices, weighted by the transition probabilities of the current
    state, and appended to the story.

    Args:
        markov_model: Dict mapping a state to {next_state: probability}.
        limit: Maximum number of transitions to take.
        start: State the story begins with.

    Returns:
        The visited states joined by single spaces, followed by one trailing
        space. The story is shorter than `limit` transitions when the walk
        reaches a state that has no outgoing transitions.

    Raises:
        KeyError: If at least one transition is requested and `start` is not
            a state of the model.
    """
    if limit > 0 and start not in markov_model:
        raise KeyError("start state {!r} is not in the Markov model".format(start))

    curr_state = start
    pieces = [curr_state]
    n = 0
    while n < limit:
        transitions = markov_model.get(curr_state)
        # A state seen only at the very end of the corpus has no successors;
        # stop cleanly there instead of failing with a KeyError.
        if not transitions:
            break
        curr_state = random.choices(list(transitions.keys()),
                                    list(transitions.values()))[0]
        pieces.append(curr_state)
        n += 1
    # Joining once avoids repeated string concatenation; the trailing space
    # keeps the output format unchanged.
    return " ".join(pieces) + " "

In [17]:
# Print 20 numbered samples, each starting at "dear holmes" and taking
# 8 transitions.
for i in range(20):
    sample = generate_story(markov_model, limit=8, start="dear holmes")
    print(f"{i}. ", sample)

0.  dear holmes if i see her dainty form about the paper which he had wonderful polite ways with 
1.  dear holmes what do you suggest i know it and attacked it there were three possible explanations of 
2.  dear holmes he has been judged over there thats no business of the abernetty family was first brought 
3.  dear holmes i ejaculated well really it seems that it is from his agents the blow but my 
4.  dear holmes i have a vague impression of a plump little hand extended murmuring his regret for having 
5.  dear holmes i thought of what passes between us we picked up his leather gripsack and was about 
6.  dear holmes oh yes there are one or two very singular points about this strange and painful episodes 
7.  dear holmes i have heard coldstream guards thank you i need not discuss that was the motive which 
8.  dear holmes oh yes in a perplexing position look here watson there is one of those who sat 
9.  dear holmes said i i give you a family i have no objection to my looking at you

In [18]:
# Print 20 numbered samples, each starting at "i am" and taking
# 5 transitions.
for i in range(20):
    sample = generate_story(markov_model, limit=5, start="i am")
    print(f"{i}. ", sample)

0.  i am not mistaken and we were aware of it yes but 
1.  i am here again if you will permit us and my heart 
2.  i am quite at home with his heart that he had done 
3.  i am relieved but i think none said holmes but the dearest 
4.  i am at last at last to have the hall and a 
5.  i am sure they were real friends well then we must take 
6.  i am afraid so i had been less than six and a 
7.  i am sure but i know holmes took a from the wall 
8.  i am connected with our inquiry has already crossed your mind said 
9.  i am correct colonel in saying that the breach is quite healed 
10.  i am reckoned fleet of foot and firm of grasp climbing apparently 
11.  i am not the murderer no the murderer has escaped there is 
12.  i am not aware of the opposite houses and the door as 
13.  i am not tired i have had several others of the fiercest 
14.  i am we have dreamed of doing so i took a cab 
15.  i am afraid there are no great number of those bulky boxes 
16.  i am still so weak that i have d