### Importing tools

In [26]:
import numpy as np
import pandas as pd
import os
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import random

### Reading every Sherlock Holmes adventure!

In [27]:
story_path = "/Users/ziko/Desktop/CSFall23/CS695_Machine_Learning/python_code/sherlock/sherlock_holmes"

def read_all_stories(story_path):
    txt = []
    for _, _, files in os.walk(story_path):
        for file in files:
            with open(os.path.join(story_path, file)) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line != '':
                        txt.append(line)
    return txt

stories = read_all_stories(story_path)
print("number of lines =", len(stories))


number of lines = 215021


### Cleaning the text

In [29]:
import nltk

# Download the 'punkt' resource
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/ziko/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [30]:
def clean_txt(txt):
    cleaned_txt = []
    for line in txt:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_txt+=words
    return cleaned_txt

cleaned_stories = clean_txt(stories)
print("number of words = ", len(cleaned_stories))

number of words =  2332247


### Creating the Markov Model

In [31]:
def make_markov_model(cleaned_stories, n_gram=2):
    markov_model = {}
    for i in range(len(cleaned_stories)-n_gram-1):
        curr_state, next_state = "", ""
        for j in range(n_gram):
            curr_state += cleaned_stories[i+j] + " "
            next_state += cleaned_stories[i+j+n_gram] + " "
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [32]:
markov_model = make_markov_model(cleaned_stories)

In [33]:
print("number of states = ", len(markov_model.keys()))

number of states =  208717


In [34]:
print("All possible transitions from 'the game' state: \n")
print(markov_model['the game'])

All possible transitions from 'the game' state: 

{'is up': 0.06306306306306306, 'is and': 0.036036036036036036, 'was afoot': 0.036036036036036036, 'for the': 0.036036036036036036, 'was whist': 0.036036036036036036, 'would have': 0.036036036036036036, 'in their': 0.036036036036036036, 'was up': 0.09009009009009009, 'in that': 0.036036036036036036, 'the lack': 0.036036036036036036, 'for all': 0.06306306306306306, 'is afoot': 0.036036036036036036, 'was in': 0.02702702702702703, 'is hardly': 0.02702702702702703, 'may wander': 0.02702702702702703, 'now a': 0.02702702702702703, 'my own': 0.02702702702702703, 'at any': 0.02702702702702703, 'mr holmes': 0.02702702702702703, 'ay whats': 0.02702702702702703, 'my friend': 0.02702702702702703, 'fairly by': 0.02702702702702703, 'is not': 0.02702702702702703, 'was not': 0.02702702702702703, 'worth it': 0.02702702702702703, 'you are': 0.02702702702702703, 'i am': 0.02702702702702703, 'now count': 0.02702702702702703, 'your letter': 0.027027027027027

### Generating Sherlock Holmes stories!

In [35]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n<limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [36]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="dear holmes", limit=8))

0.  dear holmes said i putting down the box had been her last breath in pouring out the story 
1.  dear holmes said i you hear he gives no trouble can come round to me no surprise i 
2.  dear holmes what do you make of it and when i coupled it with his or on his 
3.  dear holmes it is very pleasant in his usual mood but there was evidence of some use at 
4.  dear holmes you are surrounded by none of my fathers case to a triumphant yelp sprang upon a 
5.  dear holmes i have loved each other as our voices disturbed their slumbers toby proved to be of 
6.  dear holmes he has found it it would go if a real piece of paper with writing upon 
7.  dear holmes said i i am sure you appear to take it into their intimate conversation seemed to 
8.  dear holmes what do they care for my poor father met his end the soil was imprinted all 
9.  dear holmes and tell us all good for him one moment i was clearing up the shutters for 
10.  dear holmes i ejaculated as a peacock very much surprised if it had

In [37]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="my dear", limit=8))

0.  my dear holmes what can it be then again whom was he that i am very much indebted 
1.  my dear young lady too she was happy for i have promised to be settled the question clearly 
2.  my dear boy is nearly eighty cram full of gout too they have been infinitely more mysterious than 
3.  my dear young lady we have known i understand now mr trelawney hope i always smoke ships myself 
4.  my dear dear son the seamen had hauled the aback during the last two nights she had heard 
5.  my dear boy to put up her hand to him and it was late that night his remaining 
6.  my dear watson was enabled to produce his meretricious finales colonel emsworth the greatest living authority upon tropical 
7.  my dear watson this is the very man who had been sent off long ago to be definite 
8.  my dear holmes what do you remember a box an ivory box with a very full description of 
9.  my dear fellow for a german subject and that the man who has been a bad sign on 
10.  my dear watson yet another had the 

In [38]:
for i in range(20):
    print(str(i)+". ", generate_story(markov_model, start="i would", limit=8))

0.  i would sooner have a savage assault upon the old church crypt at night have you examined the 
1.  i would have waited for the deep and yet i was at this instant you slipped up there 
2.  i would have nothing more to do with violence so he went down to her feet with the 
3.  i would carry my stone to beat out his brains out as i planned them i was the 
4.  i would there was no reliable test now we have one belonging to my surprise and horror the 
5.  i would have the lantern ready to uncover that we should have much difficulty in determining that said 
6.  i would have it in me its a cold night with a constancy almost unparalleled in history the 
7.  i would warn your grace however that the duchess had anything to his mother and i shall be 
8.  i would ask then leave it where it has trickled down the facts you are right mr holmes 
9.  i would suggest that we have to take a train of thought which can now be removed i 
10.  i would not have a quieter lodger or one who is at his ease 

In [39]:
print(generate_story(markov_model, start="the case", limit=100))

the case i knew also that it was thick fog and one would like to know it lord but were lurking in some retreat which had never been to his son when we broke the sad moan in which it had been devoted to the child stood beside his table it argues the degree in which he polished off his bonds which he had reached the westminster wharf and found that mycroft had preserved my rooms quite so these are the family he came over and you are gettin set on my return i found holmes in animated conversation with his father and the interests which rise up around the man who gave evidence as to the young man who was clearly a dangerous suitor with his glib irish tongue and his sister were out spending the evening tired but happy man this is the young man the look of the offices round but i shall be very much obliged to him by will he knew that such a consideration of the mysterious disappearance of your people had never to return and to carry out your assertion no i have a and to put the police let th

In summary, this script uses Markov models to generate synthetic text resembling Sherlock Holmes stories based on the patterns observed in the original text. The generated stories are produced by probabilistically selecting the next word based on the current state in the Markov model.