<a href="https://colab.research.google.com/github/krdeepak39/Deep-learning-model/blob/main/Markov_model_on_sherlock_datasets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn
import os
import re
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
path ="/content/drive/MyDrive/Colab Notebooks/sherlock/"

In [3]:
def read_stories(path):
    text = []
    for _, _, files in os.walk(path):
        for file in files:
            with open(path + file) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line != '':
                        text.append(line)
    return text

In [4]:
stories = read_stories(path)

In [5]:
print(len(stories))

215021


In [6]:
def clean_stories(text):
    clean_text = []
    for line in text:
        line = line.lower()
        line = re.sub(r",\"\'!@#$%^&*(){}!?/;`~:<>+=-\]", "", line)
        tokens = word_tokenize(line)
        words = []
        for word in tokens:
            words.append(word)
        clean_text += words
    return clean_text

In [7]:
clean_text = clean_stories(stories)

In [8]:
def markov_models(cleaned_text, ngram = 2):
    markov_model = {}
    for i in range(len(clean_text) - ngram - 1):
        curr_state, next_state = "", ""
        for j in range(ngram):
            curr_state += cleaned_text[i + j] + ' '
            next_state += cleaned_text[i + j + ngram] + ' '
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        if curr_state not in markov_model:
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count / total
    return markov_model

In [9]:
markov = markov_models(clean_text)

In [10]:
markov

{'the stock-broker': {"'s clerk": 1.0},
 "stock-broker 's": {'clerk arthur': 0.09090909090909091,
  'clerk had': 0.36363636363636365,
  'clerk the': 0.2727272727272727,
  'clerk shortly': 0.2727272727272727},
 "'s clerk": {'arthur conan': 0.09090909090909091,
  'had concluded': 0.36363636363636365,
  'the ``': 0.2727272727272727,
  'shortly after': 0.2727272727272727},
 'clerk arthur': {'conan doyle': 1.0},
 'arthur conan': {'doyle shortly': 0.014492753623188406,
  'doyle we': 0.043478260869565216,
  'doyle it': 0.15942028985507245,
  'doyle when': 0.043478260869565216,
  'doyle somewhere': 0.014492753623188406,
  'doyle [': 0.014492753623188406,
  'doyle from': 0.014492753623188406,
  'doyle sherlock': 0.028985507246376812,
  'doyle glancing': 0.014492753623188406,
  'doyle the': 0.07246376811594203,
  'doyle holmes': 0.028985507246376812,
  'doyle table': 0.2028985507246377,
  'doyle isa': 0.014492753623188406,
  'doyle on': 0.014492753623188406,
  'doyle in': 0.043478260869565216,
 

In [11]:
markov['my god']

{"! ''": 0.29896907216494845,
 '! to': 0.030927835051546393,
 '! it': 0.041237113402061855,
 '! what': 0.08247422680412371,
 '! helen': 0.041237113402061855,
 ', whoever': 0.041237113402061855,
 ', shall': 0.041237113402061855,
 ', what': 0.07216494845360824,
 ', can': 0.030927835051546393,
 ', those': 0.030927835051546393,
 ', my': 0.041237113402061855,
 ', it': 0.12371134020618557,
 '! was': 0.041237113402061855,
 '! are': 0.041237113402061855,
 '! how': 0.041237113402061855}

In [12]:
import random
def generate_story(markov_chain, limit = 100, start = 'men whom'):
    n = 0
    curr_state = start
    next_state = None
    story = ''
    story += curr_state + ' '
    while n < limit:
        next_state = random.choices(
        list(markov_chain[curr_state].keys()),
        list(markov_chain[curr_state].values())
        )
        curr_state = next_state[0]
        story += curr_state + ' '
        n += 1
    return story

In [13]:
print(generate_story(markov, limit = 10))

men whom we had to put the matter . lord st. simon , second son of mr . holmes ? '' `` 


In [14]:
for i in range(20):
    print(generate_story(markov, start = 'i would', limit = 5))

i would not have done more than just give me a telegraph 
i would have waited until their advertisements were tried , '' said 
i would have spoken to anyone else , and yet even here 
i would send you word that she is now the window was 
i would promise for three months in the advertisement column , with 
i would only ask this . grasp the alternative , i began 
i would have nothing to do inside his neckcloth and half-strangling him 
i would do nothing . '' i was . '' `` and 
i would be accused in their minds , and argue from them 
i would ask you a few months . there were also two 
i would have thought possible , and although a large practice . 
i would , in a voice which i read in his eyes 
i would say nothing to him when there were these men americans 
i would n't answer for our assistance . wait a bit , 
i would only observe that there were some points about the case 
i would do what i have hoped to find the new guns 
i would only ask you now suggest , watson ? '' he 
i would do just