<a href="https://colab.research.google.com/github/jben-hun/colab_notebooks/blob/master/algorithms/markov_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation

In [None]:
!pip install -q praw

import praw
import re
import random
import tqdm
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import deque

# Let pandas print whole sentences instead of truncating long text columns.
pd.set_option("max_colwidth", None)

# Reddit API credentials. The trailing `#@param` markers are Colab form
# annotations, so these values are meant to be filled in via the notebook form.
client_id = "" #@param {type:"string"}
client_secret = "" #@param {type:"string"}
user_agent = "" #@param {type:"string"}

# Module-level PRAW client; RedditMarkovChain.__init__ reads this global to
# resolve subreddit names into subreddit objects.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent)

[K     |████████████████████████████████| 153kB 3.0MB/s 
[K     |████████████████████████████████| 204kB 9.2MB/s 
[?25h

In [None]:
class RedditMarkovChain:
  """First-order, word-level Markov chain models, one per subreddit.

  For every subreddit a table of transition counts
  ``models[subreddit][word][next_word]`` is built from mined sentences.
  Sentence boundaries are modeled with the artificial ``begin_str`` and
  ``end_str`` tokens. The first ``train_split`` fraction of each subreddit's
  sentences is used for training; the remainder is left for evaluation.

  Constructing an instance downloads data through the module-level ``reddit``
  PRAW client, which must exist before ``__init__`` runs.
  """

  # Strategies accepted by generate().
  _METHODS = ("expected", "random", "sample")

  # Tokens that are glued to the preceding word when rendering a sentence.
  _PUNCTUATION = frozenset(".?!,")

  def __init__(
      self,
      *subreddits,
      sentence_limit=1000,
      begin_str="*BEGIN*",
      end_str="*END*",
      cycle_str="*CYCLE*",
      train_split=0.9):
    """Mine sentences from the given subreddits and build their models.

    Args:
      *subreddits: subreddit names (strings) to mine and model.
      sentence_limit: maximum number of sentences kept per subreddit.
      begin_str: artificial token marking the start of a sentence.
      end_str: artificial token marking the end of a sentence.
      cycle_str: marker appended by generate() when the "expected" method
        runs into a cycle.
      train_split: fraction (0..1) of each subreddit's sentences used for
        training.

    Raises:
      ValueError: if train_split is outside the [0, 1] interval.
    """
    if not 0.0 <= train_split <= 1.0:
      raise ValueError(
          f"train_split must be within [0, 1], got {train_split!r}")

    self.__subreddits = subreddits
    self.sentence_limit = sentence_limit
    self.begin_str = begin_str
    self.end_str = end_str
    self.cycle_str = cycle_str
    self.__train_split = train_split
    self.__test_split = (1.0 - train_split)

    # `reddit` is the module-level PRAW client created during notebook setup.
    self.__sentence_data = {
        name: self.mine_subreddit(
            subreddit=reddit.subreddit(name),
            sentence_limit=self.sentence_limit)
        for name in self.subreddits}

    self.models = self.__build_models(self.__sentence_data)


  def __build_models(self, sentence_data):
    """Build markov chain models from extracted sentences

    Args:
      sentence_data: mapping of subreddit name to its list of sentences.

    Returns:
      Nested defaultdict ``models[subreddit][word][next_word] -> count``
      built from the training part of every sentence list.
    """

    models = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

    for subreddit_name, sentences in sentence_data.items():
      train_end = int(self.train_split * len(sentences))
      for sentence in sentences[:train_end]:
        words = self.split_sentence(sentence)
        if not words:
          # Nothing to learn from a sentence without tokens; indexing
          # words[0] below would raise IndexError.
          continue
        model = models[subreddit_name]
        # Insertion order is kept as begin, end, inner pairs because the
        # "expected" generator breaks count ties by dict order.
        model[self.begin_str][words[0]] += 1
        model[words[-1]][self.end_str] += 1
        for current, following in zip(words, words[1:]):
          model[current][following] += 1

    return models


  @property
  def subreddits(self):
    """Tuple of subreddit names passed to the constructor."""
    return self.__subreddits


  @property
  def train_split(self):
    """Fraction of sentences used for training."""
    return self.__train_split


  @property
  def test_split(self):
    """Fraction of sentences held out from training (1 - train_split)."""
    return self.__test_split


  @property
  def sentence_data(self):
    """Mapping of subreddit name to all of its mined sentences."""
    return self.__sentence_data


  def generate(self, model_name, method):
    """Generate text using the created markov chain models

    method:
      expected: choose most likely words, infinite cycles are possible
        (generation stops with ``cycle_str`` appended once a word repeats)
      random: choose words uniformly
      sample: choose words based on the modeled probabilities

    Returns:
      The generated sentence, without leading whitespace.

    Raises:
      ValueError: if method is not one of the supported strategies
        (previously this looped forever).
      KeyError: if no model exists for model_name.
    """

    if method not in self._METHODS:
      raise ValueError(
          f"unknown method {method!r}, expected one of {self._METHODS}")
    if model_name not in self.models:
      # Checked explicitly: indexing the defaultdict would silently create
      # an empty model under this name.
      raise KeyError(model_name)

    model = self.models[model_name]
    pieces = []
    used = set()
    word = self.begin_str

    while True:
      # .get() avoids inserting empty entries into the trained model.
      transitions = model.get(word)
      if not transitions:
        # Dead end (no known successor): finish the sentence here.
        break

      if method == "expected":
        word = max(transitions.items(), key=lambda item: item[1])[0]
      elif method == "random":
        word = random.choice(tuple(transitions))
      else:  # "sample"
        candidates = tuple(transitions)
        # str() turns numpy's string scalar back into a plain str.
        word = str(np.random.choice(
            candidates, p=self.get_probs(transitions)))

      if word == self.end_str:
        break

      # Separate words by a space, but keep punctuation attached and do not
      # start the sentence with a space.
      if pieces and word not in self._PUNCTUATION:
        pieces.append(" ")
      pieces.append(word)

      if method == "expected":
        if word in used:
          # Deterministic choice + repeated word means an endless loop.
          pieces.append(f" {self.cycle_str}")
          break
        used.add(word)

    return "".join(pieces)


  def classify(self, sentence):
    """Deduce the most likely source of a sentence

    Returns:
      Mapping of subreddit name to the probability of the sentence under
      that subreddit's model. Any transition unseen in training makes the
      probability 0 (no smoothing is applied). A sentence without tokens is
      scored as the direct begin -> end transition.
    """
    # Tokenize once; the token path is the same for every model.
    path = [self.begin_str, *self.split_sentence(sentence), self.end_str]

    result = {}
    for subreddit_name, model in self.models.items():
      p = 1
      for current, following in zip(path, path[1:]):
        # .get() keeps unseen words from being inserted into the model.
        p *= self.get_prob(model.get(current, {}), following)
      result[subreddit_name] = p
    return result


  @classmethod
  def mine_subreddit(cls, subreddit, sentence_limit):
    """Extract clean sentences from submissions and comments

    Args:
      subreddit: object providing ``hot(limit=None)``, yielding submissions
        with ``title``, ``selftext`` and ``comments``.
      sentence_limit: stop mining once this many sentences were collected.

    Returns:
      Up to sentence_limit cleaned sentences in random order.
    """

    # re that matches clean sentences
    matcher = re.compile(r"(?:[.!?] |^)[A-Z][\w', ]+[.!?](?= [A-Z]|$)")

    sentences = []
    with tqdm.tqdm(total=sentence_limit) as pbar:
      for submission in subreddit.hot(limit=None):
        sentences += matcher.findall(submission.title)
        sentences += matcher.findall(submission.selftext)

        submission.comment_sort = "best"

        for comment in submission.comments.list():
          # Unexpanded "load more" placeholders carry no text body.
          if not isinstance(comment, praw.models.MoreComments):
            sentences += matcher.findall(comment.body)

        # Advance the bar to the current count, capped at the limit.
        pbar.update(min(len(sentences), sentence_limit) - pbar.n)
        if len(sentences) >= sentence_limit:
          break

    # Shuffle unconditionally so the train/test split is random even when
    # the subreddit ran out of submissions before reaching the limit.
    random.shuffle(sentences)

    return [cls.process_sentence(sentence) for sentence
            in sentences[:sentence_limit]]


  @staticmethod
  def process_sentence(sentence):
    """Clean up sentences

    Strips leading separators left over from matching and expands common
    contractions so that e.g. "don't" and "do not" share tokens.
    """
    return (sentence.lstrip(".!? ")
                    # Irregular contractions must be expanded before the
                    # generic "n't" rule, which would yield "wo not"/"ca not".
                    .replace("won't", "will not")
                    .replace("Won't", "Will not")
                    .replace("can't", "can not")
                    .replace("Can't", "Can not")
                    .replace("n't", " not")
                    .replace("'m", " am")
                    .replace("'re", " are"))


  @staticmethod
  def split_sentence(sentence):
    """Split sentences into words

    Returns a list of tokens: runs of word characters/apostrophes, and
    single punctuation marks (one of ``, ! . ?``).
    """
    return re.findall(r"[\w']+|[,!.?]", sentence)


  @staticmethod
  def get_prob(d, word):
    """Get single probability from word counts

    Returns 0 when word has no entry in the count mapping d.
    """
    if word not in d:
      return 0
    return d[word] / sum(d.values())


  @staticmethod
  def get_probs(d):
    """Get all probabilities from word counts

    The result is ordered like ``d.values()``.
    """
    n = sum(d.values())
    return [v / n for v in d.values()]


  @staticmethod
  def traverse_comments(comments, *, breadth_first=False):
    """Flatten a comment forest into a list of comments.

    Args:
      comments: sliceable sequence of top-level comment objects.
      breadth_first: when true, newly discovered items are queued at the
        opposite end from where items are taken, otherwise at the same end.

    Returns:
      List of visited comments; ``praw.models.MoreComments`` placeholders
      are replaced by whatever their ``comments()`` call returns.
    """
    queue = deque(comments[:])
    # Items are always popped from the right; pushing left gives FIFO
    # (breadth-first) order, pushing right gives LIFO (depth-first) order.
    push = queue.extendleft if breadth_first else queue.extend
    result = []
    while queue:
      e = queue.pop()
      if isinstance(e, praw.models.MoreComments):
        push(e.comments())
      else:
        push(e.replies)
        result.append(e)
    return result

# Demo

In [None]:
# Mine the three demo subreddits and train one model for each of them.
rmc = RedditMarkovChain(
    "explainlikeimfive",
    "askreddit",
    "dankmemes")

100%|██████████| 1000/1000 [00:15<00:00, 66.17it/s]
100%|██████████| 1000/1000 [00:11<00:00, 83.42it/s]
100%|██████████| 1000/1000 [00:27<00:00, 36.72it/s]


**Deriving most probable sentence for each model**

In [None]:
# Print the greedy (most likely next word) sentence of every model.
for subreddit_name in rmc.subreddits:
  most_likely = rmc.generate(subreddit_name, "expected")
  print(f"{subreddit_name}: {most_likely}")

explainlikeimfive:  I am not know what you are not *CYCLE*
askreddit:  I am a lot of the same.
dankmemes:  I am not know how to be the same as a lot of the *CYCLE*


**Generating new text**

In [None]:
# Sample five sentences from each model and score every sentence against all
# models; each dict key becomes one column of the displayed table.
dict_data = defaultdict(list)
for subreddit_name in rmc.subreddits:
  for _ in range(5):
    sentence = rmc.generate(subreddit_name, "sample")
    dict_data["sentence"].append(sentence)
    dict_data["model"].append(subreddit_name)
    for k, v in rmc.classify(sentence).items():
      dict_data[f"P({k})"].append(v)
display(pd.DataFrame(dict_data))

Unnamed: 0,sentence,model,P(explainlikeimfive),P(askreddit),P(dankmemes)
0,That map.,explainlikeimfive,0.0002469136,0.0,0.0
1,"Terminal velocity, but they the top and answers with boiling, featuring the heat food not a good answer I am all be said, but you are they needed to do astronomers really see.",explainlikeimfive,3.4615679999999997e-48,0.0,0.0
2,A huge portion of historical baggage.,explainlikeimfive,1.722653e-06,0.0,0.0
3,You can be able to stop the center point.,explainlikeimfive,1.854271e-10,0.0,0.0
4,Picking scabs.,explainlikeimfive,0.0005555556,0.0,0.0
5,"Once I wish I heard this is gone, Jews pray for 10 seashells each bring back.",askreddit,0.0,1.3640189999999998e-19,0.0
6,Just beware of the flip he was and office.,askreddit,0.0,1.125643e-12,0.0
7,Regulators!,askreddit,0.0,0.001111111,0.0
8,Sad to win the coup failed to begin with a lot of equipment from paying job and maintain power.,askreddit,0.0,2.284713e-22,0.0
9,Sometimes parents.,askreddit,0.0,0.0003968254,0.0


**Classifying real text**

In [None]:
# Score up to five held-out (non-training) sentences per subreddit against
# all models; each dict key becomes one column of the displayed table.
dict_data = defaultdict(list)
for subreddit_name, sentences in rmc.sentence_data.items():
  total = len(sentences)
  first_test = int(rmc.train_split * total)  # index of first held-out sentence
  shown = min(5, int(rmc.test_split * total))
  for offset in range(shown):
    sentence = sentences[first_test + offset]
    dict_data["sentence"].append(sentence)
    dict_data["source"].append(subreddit_name)
    for k, v in rmc.classify(sentence).items():
      dict_data[f"P({k})"].append(v)
display(pd.DataFrame(dict_data))

Unnamed: 0,sentence,source,P(explainlikeimfive),P(askreddit),P(dankmemes)
0,Take a hose and run some water through it.,explainlikeimfive,0.0,0.0,0.0
1,"Contact area, air convection, the type of meat are all important.",explainlikeimfive,7.146701e-19,0.0,0.0
2,"Op seems happy with that answer, but why is 400F optimal in a physics sense?",explainlikeimfive,0.0,0.0,0.0
3,"Maillard reaction is a big swath of reactions but, basically, you need to be hot enough for stuff to start breaking down and reacting but not so hot that the carbohydrate completely breaks down to carbon.",explainlikeimfive,0.0,0.0,0.0
4,Ca not do it super accurately but that 77ft wave was measured from the video with some AI algorithms.,explainlikeimfive,0.0,0.0,0.0
5,When it was time to get going I woke everyone and told them we were heading out in 30 minutes.,askreddit,0.0,0.0,0.0
6,Yes I am a conservative.,askreddit,0.0,0.0,0.0
7,She was an amazing woman.,askreddit,0.0,1.772107e-07,0.0
8,"To her and the rest of the royal family in France, they only had their people's interests at heart and it was more that she was hurt and confused that they could not see that.",askreddit,0.0,0.0,0.0
9,To pick up the pieces and help the citizenry rebuild rather than leaving the ruins to smolder?,askreddit,0.0,0.0,0.0


# TODO

*   Second-order Markov chains: condition each word on the two preceding words, P(C | A, B)

# References

*   https://en.wikipedia.org/wiki/Markov_chain
*   https://www.reddit.com/r/SubredditSimulator/comments/3g9ioz/what_is_rsubredditsimulator/
*   https://www.reddit.com/r/SubSimulatorGPT2/comments/btfhks/what_is_rsubsimulatorgpt2/