In [None]:
# import packages
import pandas as pd
import numpy as np
import re, time, random
from collections import Counter

# Create the 3-gram (trigram) dataset

In [None]:
%%time

# Tally of every consecutive three-word window found in the corpus.
trigram_counter = Counter()

# Corpus location (one text entry per line).
file_path = "en_US.twitter.txt"

# Matches every character that is neither a word character nor whitespace.
punctuation_re = re.compile(r'[^\w\s]')

# Stream the file line by line so the whole corpus never sits in memory.
with open(file_path, 'r', encoding='utf-8') as corpus:
    for line in corpus:
        # Lower-case, drop punctuation, then split on whitespace.
        tokens = punctuation_re.sub('', line.lower()).split()

        # Zipping the token list against itself shifted by one and by two
        # yields each sliding window of three words; lines with fewer than
        # three tokens contribute nothing.
        trigram_counter.update(zip(tokens, tokens[1:], tokens[2:]))

In [None]:
%%time

# Flatten the counter into one (word1, word2, word3, count) row per trigram.
trigram_rows = [(*trigram, cnt) for trigram, cnt in trigram_counter.items()]

# Most frequent trigrams first, with a clean 0..n-1 index.
df = (
    pd.DataFrame(trigram_rows, columns=['word1', 'word2', 'word3', 'count'])
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

# Share of each trigram among all trigrams starting with the same two words.
bigram_totals = df.groupby(['word1', 'word2'])['count'].transform('sum')
df['count_ratio'] = df['count'] / bigram_totals

In [None]:
%%time

# For every (word1, word2) prefix keep at most its five highest-count rows.
ranked_df = df.sort_values('count', ascending=False)
top_df = ranked_df.groupby(['word1', 'word2']).head(5).reset_index(drop=True)

# Print the distribution of counts: coarse steps of 0.2 over [0, 1), then
# steps of 0.01 from 0.91 up to (but excluding) 1 to inspect the upper tail.
top_counts = top_df['count']
quantile_levels = np.concatenate([np.arange(0, 1, 0.2), np.arange(0.91, 1, 0.01)])
for level in quantile_levels:
    print(round(level, 2), top_counts.quantile(level))

In [None]:
# Drop rare trigrams: keep only rows whose count is strictly greater than 3.
simplifed_df = top_df.loc[top_df['count'].gt(3)]

In [None]:
# Report how many rows survive each reduction step.
for label, frame in (
    ('orginal rows:', df),
    ('head 5 rows:', top_df),
    ('simplifed rows:', simplifed_df),
):
    print(label, len(frame))

# Phrase Prediction

In [None]:
# Prompt for the lookup below; its first two whitespace-separated words are
# used as the (word1, word2) prefix.
input_text = 'I really'

In [None]:
# Turn the prompt into the (word1, word2) prefix used for the trigram lookup.
#
# The prompt is normalised the same way the corpus lines were when the trigram
# table was built (lower-cased, every character that is neither a word
# character nor whitespace removed), so input such as "I really," still
# matches. Only the first two words are used.
if not isinstance(input_text, str):
    raise TypeError(
        f'input is wrong: expected a string, got {type(input_text).__name__}'
    )

input_text = input_text.lower()
input_tokens = re.sub(r'[^\w\s]', '', input_text).split()

# Fail loudly instead of printing and carrying on: continuing would leave
# input_word1 / input_word2 undefined (or stale from an earlier run) for the
# cells that follow.
if len(input_tokens) < 2:
    raise ValueError(
        f'input is wrong: need at least two words, got {input_text!r}'
    )

input_word1, input_word2 = input_tokens[:2]

In [None]:
# Show every retained trigram whose first two words match the prompt.
simplifed_df.loc[
    simplifed_df['word1'].eq(input_word1) & simplifed_df['word2'].eq(input_word2)
]

# Sentence Generation

In [None]:
# Seed the generator state with the prompt's two words; the output list starts
# with those same two words.
first_word, second_word = input_word1, input_word2
create_text_list = [first_word, second_word]

In [None]:
# Extend create_text_list one word at a time: look up the words that may
# follow the current (first_word, second_word) pair, pick one uniformly at
# random, append it, and slide the two-word window forward. Generation stops
# when the current pair has no known continuation.

# Safety bound on the number of appended words. The trigram table can contain
# cycles (a pair whose continuation leads back to the same pair), in which
# case an unbounded loop would never terminate.
MAX_NEW_WORDS = 50

# Build the (word1, word2) -> [word3, ...] lookup once instead of scanning the
# whole frame with a boolean mask (twice) on every iteration. Row order within
# each group is preserved, so each candidate list matches what the mask-based
# filter would return.
next_words = (
    simplifed_df.groupby(['word1', 'word2'], sort=False)['word3']
    .agg(list)
    .to_dict()
)

for _ in range(MAX_NEW_WORDS):
    third_word_list = next_words.get((first_word, second_word))
    if not third_word_list:
        # No retained trigram starts with this pair: the sentence ends here.
        break
    third_word = random.choice(third_word_list)
    create_text_list.append(third_word)
    first_word, second_word = second_word, third_word

In [None]:
' '.join(create_text_list)