<a href="https://colab.research.google.com/github/jben-hun/colab_notebooks/blob/master/algorithms/markov_sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Implementation

In [1]:
!pip install -q praw

import praw
import re
import random
import tqdm
import numpy as np
import pandas as pd
from collections import defaultdict
from collections import deque

# Do not truncate long strings when DataFrames are displayed, so whole
# sentences stay readable in the result tables.
pd.set_option("max_colwidth", None)

# Reddit API credentials. The `#@param` markers are Colab form-field
# annotations; the values are intentionally empty in the saved notebook.
client_id = "" #@param {type:"string"}
client_secret = "" #@param {type:"string"}
user_agent = "" #@param {type:"string"}

# Reddit client built from the credentials above; later cells call
# `reddit.subreddit(...)` on it to obtain the subreddits to mine.
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent)

[K     |████████████████████████████████| 153kB 2.7MB/s 
[K     |████████████████████████████████| 204kB 4.7MB/s 
[?25h

In [2]:
# Subreddits to mine; one Markov model is built per entry.
SUBREDDITS = (
    "explainlikeimfive",
    "askreddit",
    "dankmemes",
)

# Maximum number of sentences kept per subreddit.
SENTENCE_LIMIT = 1000

# Sentinel tokens: start/end states of the chain, and the marker appended to
# generated text when the "expected" method revisits a word.
BEGIN_STR = "*BEGIN*"
END_STR = "*END*"
CYCLE_STR = "*CYCLE*"

# Fractions of each subreddit's sentences used for building the models and
# for the held-out classification demo, respectively.
TRAIN_SPLIT = 0.9
TEST_SPLIT = 1.0 - TRAIN_SPLIT


def process_sentence(sentence):
  """Clean up a sentence and expand common English contractions.

  Leading sentence terminators and spaces (any of ". ! ?" and " ") are
  stripped, then contractions are expanded so that e.g. "don't" and "do not"
  end up as the same tokens.

  Args:
    sentence: the raw sentence string.

  Returns:
    The cleaned sentence string.
  """
  # Applied in order. The irregular negations have to come before the generic
  # "n't" rule, otherwise "can't" turns into "ca not", "Won't" into "Wo not"
  # and "shan't" into "sha not". "can not" (two words) is used so that "not"
  # stays a separate token, like in every other expanded negation.
  replacements = (
      ("won't", "will not"),
      ("Won't", "Will not"),
      ("can't", "can not"),
      ("Can't", "Can not"),
      ("shan't", "shall not"),
      ("Shan't", "Shall not"),
      ("n't", " not"),
      ("'m", " am"),
      ("'re", " are"),
  )

  cleaned = sentence.lstrip(".!? ")
  for old, new in replacements:
    cleaned = cleaned.replace(old, new)
  return cleaned


def split_sentence(sentence):
  """Tokenize a sentence into words and punctuation marks.

  A token is either a run of word characters and apostrophes, or a single
  one of , ! . ? -- anything else (such as whitespace) is dropped.

  Returns:
    List of token strings, in order of appearance.
  """
  token_pattern = re.compile(r"((?:[\w']+)|(?:[,!.?]))")
  return [match.group(1) for match in token_pattern.finditer(sentence)]


def mine_subreddit(subreddit, sentence_limit):
  """Extract clean sentences from a subreddit's submissions and comments.

  Walks the subreddit's "hot" listing and collects sentences from every
  submission title, self text and comment body until at least
  `sentence_limit` sentences have been found or the listing is exhausted.

  Args:
    subreddit: object whose `hot(limit=None)` yields submissions exposing
      `title`, `selftext`, `comment_sort` and `comments.list()` (the demo
      cell passes `reddit.subreddit(name)`).
    sentence_limit: maximum number of sentences to return.

  Returns:
    A shuffled list of at most `sentence_limit` sentences, each passed
    through `process_sentence`.
  """

  # A clean sentence starts with a capital letter, contains only word
  # characters, apostrophes, commas and spaces, and ends with a terminator
  # followed by either the next capitalised sentence or the end of the text.
  #
  # The preceding terminator is checked with a look-behind instead of being
  # consumed: findall() returns non-overlapping matches, so consuming it made
  # the sentence right after a match unmatchable and dropped every second
  # adjacent sentence.
  matcher = re.compile(r"(?:(?<=[.!?] )|^)[A-Z][\w', ]+[.!?](?= [A-Z]|$)")

  sentences = []
  with tqdm.tqdm(total=sentence_limit) as pbar:
    for submission in subreddit.hot(limit=None):
      sentences += matcher.findall(submission.title)
      sentences += matcher.findall(submission.selftext)

      submission.comment_sort = "best"

      for comment in submission.comments.list():
        # Unexpanded "load more comments" placeholders carry no text.
        if not isinstance(comment, praw.models.MoreComments):
          sentences += matcher.findall(comment.body)

      # Advance the bar to the current count, capped at the limit.
      pbar.update(min(len(sentences), sentence_limit) - pbar.n)
      if len(sentences) >= sentence_limit:
        break

  # Always shuffle, also when the listing ran out before the limit was hit,
  # so a later train/test split by position is never in thread order.
  random.shuffle(sentences)

  return [process_sentence(sentence) for sentence in sentences[:sentence_limit]]


def make_models(sentence_data):
  """Build first-order Markov chain models from extracted sentences.

  Only the first TRAIN_SPLIT fraction of each sentence list is used; the
  remainder is left untouched so it can serve as held-out data.

  Args:
    sentence_data: mapping of subreddit name -> list of sentence strings.

  Returns:
    Nested defaultdicts such that models[name][word][next_word] is the
    number of times `next_word` followed `word`. BEGIN_STR is the
    predecessor of every first word and END_STR the successor of every last
    word. Counts are deliberately left unnormalised; get_prob/get_probs turn
    them into probabilities on demand.
  """

  models = defaultdict(lambda: defaultdict(lambda: defaultdict(int)))

  for subreddit_name, sentences in sentence_data.items():
    train_size = int(TRAIN_SPLIT*len(sentences))
    for sentence in sentences[:train_size]:
      words = split_sentence(sentence)
      if not words:
        # Nothing to learn from a sentence without tokens; indexing
        # words[0] below would raise IndexError.
        continue

      model = models[subreddit_name]
      model[BEGIN_STR][words[0]] += 1
      model[words[-1]][END_STR] += 1
      for current_word, next_word in zip(words, words[1:]):
        model[current_word][next_word] += 1

  return models


def generate(models, model_name, method):
  """Generate a sentence by walking one of the Markov chain models.

  The walk starts at BEGIN_STR and stops when END_STR is drawn.

  Args:
    models: nested mapping models[name][word][next_word] -> count, as built
      by make_models.
    model_name: key of the model to walk.
    method: how the next word is picked:
      "expected": always the most frequent successor; the walk stops and
        CYCLE_STR is appended as soon as a word repeats, since the greedy
        walk would otherwise loop forever
      "random": uniformly among the distinct successors
      "sample": according to the modeled probabilities

  Returns:
    The generated sentence, without leading whitespace.

  Raises:
    ValueError: if `method` is not one of the supported names.
  """

  if method not in ("expected", "random", "sample"):
    # Without this check the walk never leaves BEGIN_STR and loops forever.
    raise ValueError(
        f"unknown method {method!r}; use 'expected', 'random' or 'sample'")

  punctuation = {".", "?", "!", ","}
  sentence = ""
  word = BEGIN_STR
  used = set()  # words already emitted; only consulted by "expected"

  while True:
    successors = models[model_name][word]

    if method == "expected":
      word = max(successors.items(), key=lambda x: x[1])[0]
    elif method == "random":
      word = random.choice(tuple(successors))
    else:
      words = tuple(successors.keys())
      probs = get_probs(successors)
      # np.random.choice hands back a numpy string; keep plain str tokens.
      word = str(np.random.choice(words, p=probs))

    if word == END_STR:
      break

    # Separate words with a space, but neither before punctuation nor at
    # the very start of the sentence.
    if sentence and word not in punctuation:
      sentence += " "
    sentence += word

    if method == "expected":
      if word in used:
        sentence += f" {CYCLE_STR}"
        break
      used.add(word)

  return sentence

def classify(models, sentence):
  """Score how likely each model is to have produced a sentence.

  The score is the product of the transition probabilities along
  BEGIN_STR -> word_1 -> ... -> word_n -> END_STR. A single transition the
  model has never seen makes the whole score 0 (there is no smoothing).

  Args:
    models: mapping of subreddit name -> model, where
      model[word][next_word] is a transition count.
    sentence: the sentence string to score.

  Returns:
    Dict mapping each subreddit name to the sentence probability under that
    model. The models are not modified.
  """
  # Tokenize once; the chain is the same for every model. Wrapping the words
  # in the sentinels also covers a sentence without any tokens.
  chain = [BEGIN_STR, *split_sentence(sentence), END_STR]

  result = {}
  for subreddit_name, model in models.items():
    p = 1.0
    for current_word, next_word in zip(chain, chain[1:]):
      # .get() instead of indexing: the models are defaultdicts, so indexing
      # would insert an empty entry for every unseen word.
      p *= get_prob(model.get(current_word, {}), next_word)
    result[subreddit_name] = p
  return result


def get_prob(d, word):
  """Relative frequency of `word` among the counts in `d`, 0 if absent."""
  if word in d:
    total = sum(d.values())
    return d[word]/total
  return 0


def get_probs(d):
  """Turn the counts in `d` into probabilities, in the dict's value order."""
  counts = list(d.values())
  total = sum(counts)
  return [count/total for count in counts]


# def traverse_comments(comments, *, breadth_first=False):
#   queue = deque(comments[:])
#   result = []
#   while queue:
#     e = queue.pop()
#     if isinstance(e, praw.models.MoreComments):
#       if breadth_first:
#         queue.extendleft(e.comments())
#       else:
#         queue.extend(e.comments())
#     else:
#       if breadth_first:
#         queue.extendleft(e.replies)
#       else:
#         queue.extend(e.replies)
#       result.append(e)
#   return result

# Demo

In [3]:
# Mine up to SENTENCE_LIMIT sentences from every configured subreddit ...
sentence_data = {}
for subreddit in SUBREDDITS:
  subreddit_handle = reddit.subreddit(subreddit)
  sentence_data[subreddit] = mine_subreddit(subreddit_handle, SENTENCE_LIMIT)

# ... then build one Markov chain model per subreddit.
models = make_models(sentence_data)

100%|██████████| 1000/1000 [00:16<00:00, 61.15it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.11it/s]
100%|██████████| 1000/1000 [00:23<00:00, 41.98it/s]


**Deriving a greedy sentence for each model (always following the most probable next word)**

In [4]:
# Print, for every subreddit, the sentence produced by the "expected" method,
# i.e. the walk that always takes the most frequent next word.
for subreddit_name in SUBREDDITS:
  print(f"{subreddit_name}: {generate(models, subreddit_name, 'expected')}")

explainlikeimfive:  I am not know about the same way to the *CYCLE*
askreddit:  I am a lot of the first time.
dankmemes:  I am not a lot of the other.


**Generating new text**

In [5]:
# Sample five sentences from every model and score each of them against all
# models; one table row per generated sentence.
dict_data = defaultdict(list)
for subreddit_name in models:
  for _ in range(5):
    sentence = generate(models, subreddit_name, "sample")
    dict_data["sentence"].append(sentence)
    dict_data["model"].append(subreddit_name)
    for k, v in classify(models, sentence).items():
      dict_data[f"P({k})"].append(v)
display(pd.DataFrame(dict_data))

Unnamed: 0,sentence,model,P(explainlikeimfive),P(askreddit),P(dankmemes)
0,"Which means your desired result is a quarter of objects that gives you have to believe the atmosphere it because you will group New World, or so will glare at 350 was a philosophical movement that way in Oceania but in facilitating financial abilities.",explainlikeimfive,5.254677000000001e-52,0.0,0.0
1,"Greece, the US to the title and decide what other factors determine that are Central European.",explainlikeimfive,3.3734390000000003e-22,0.0,0.0
2,"While I used to carbon dating, Czechs, how it possible for example, European.",explainlikeimfive,5.251718e-23,0.0,0.0
3,I shall leave the Pacific Ocean is an awful experience?,explainlikeimfive,9.834314e-11,0.0,0.0
4,They can warm yellow tones lead to be half the food is super simple puzzles have been that touches it.,explainlikeimfive,5.9467869999999996e-21,0.0,0.0
5,Golden deer currently.,askreddit,0.0,0.0005555556,0.0
6,"Otherwise the first time it seems to care less in the usual suspects, I dare you not trust anyone who lived down.",askreddit,0.0,3.5618210000000004e-27,0.0
7,Considering it's own weight.,askreddit,0.0,2.469136e-05,0.0
8,Played CTR as often the GOP care less than on the afternoon.,askreddit,0.0,2.843878e-14,0.0
9,The 7pm cheering for memes because deep down so abortion rights to both on Earth.,askreddit,0.0,6.331422e-16,0.0


**Classifying real text**

In [6]:
# Score up to five held-out sentences per subreddit (taken from just after the
# training portion) against all models; one table row per sentence.
dict_data = defaultdict(list)
for subreddit_name, sentences in sentence_data.items():
  train_size = int(TRAIN_SPLIT*len(sentences))
  test_size = int(TEST_SPLIT*len(sentences))
  for offset in range(min(test_size, 5)):
    sentence = sentences[train_size + offset]
    dict_data["sentence"].append(sentence)
    dict_data["source"].append(subreddit_name)
    for k, v in classify(models, sentence).items():
      dict_data[f"P({k})"].append(v)
display(pd.DataFrame(dict_data))

Unnamed: 0,sentence,source,P(explainlikeimfive),P(askreddit),P(dankmemes)
0,"Latin America is usually referred to as Latin America, The Global South, or developing countries.",explainlikeimfive,0.0,0.0,0.0
1,Feels good after having a meal so cheap and not unhealthy.,explainlikeimfive,0.0,0.0,0.0
2,It got used so regularly in our house that it just always sat on the counter.,explainlikeimfive,0.0,0.0,0.0
3,"However, when a seed sprouts, lots of changed occur in the building blocks that make up the seed, which let it to be able to grow.",explainlikeimfive,0.0,0.0,0.0
4,Perhaps Orthodox Christianity plays a role?,explainlikeimfive,0.0,0.0,0.0
5,HONK.,askreddit,0.0,0.0,0.0
6,"I do not even understand his excuse why it's okay this year and not in the previous election year, what does he mean it's only inappropriate if two different parties are being represented this year are not there two parties being represented on the ballot Republicans and Democrats, am I having a brain fart?",askreddit,0.0,0.0,0.0
7,I still have not rewatched that movie as the reveal was so epic I knew I could never feel that way about it again.,askreddit,0.0,0.0,0.0
8,"However, I've noticed that lots of the civilians in NY do not get that emotional over politics, they are mostly level headed.",askreddit,0.0,0.0,0.0
9,In the middle of a pandemic you do not get much more win than that.,askreddit,0.0,0.0,0.0


# TODO

*   Second-order Markov chains: P(AB->C)

# References

*   https://en.wikipedia.org/wiki/Markov_chain
*   https://www.reddit.com/r/SubredditSimulator/comments/3g9ioz/what_is_rsubredditsimulator/
*   https://www.reddit.com/r/SubSimulatorGPT2/comments/btfhks/what_is_rsubsimulatorgpt2/