In [1]:
# import packages
import pandas as pd
import numpy as np
import re, random
from collections import Counter

import gradio as gr

# Create the 3-Gram (Trigram) Dataset

In [2]:
%%time

# tally of every run of three consecutive words (trigram) seen in the file
trigram_counter = Counter()

# file name
file_path = "en_US_twitter.txt"

# matches any character that is neither a word character nor whitespace
strip_pattern = re.compile(r'[^\w\s]')

# stream the file line by line instead of loading it all at once
with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
        # lower-case, drop the unwanted characters, then split on whitespace
        words = strip_pattern.sub('', line.lower()).split()

        # zipping three staggered views of the list yields each consecutive
        # triple; a line with fewer than three words contributes nothing
        trigram_counter.update(zip(words, words[1:], words[2:]))

CPU times: total: 34.9 s
Wall time: 35.1 s


In [3]:
%%time

# turn the counter into a dataframe: one row per trigram, most frequent first
df = (
    pd.DataFrame(
        [(*trigram, cnt) for trigram, cnt in trigram_counter.items()],
        columns=['word1', 'word2', 'word3', 'count'],
    )
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

# calculate each trigram's share of the total count of all trigrams
# that start with the same first two words
df['count_ratio'] = df['count'].div(
    df.groupby(['word1', 'word2'])['count'].transform('sum')
)

CPU times: total: 17.1 s
Wall time: 17.2 s


In [4]:
%%time

# for every (word1, word2) pair keep at most its 5 most frequent trigrams
top_df = (
    df.sort_values('count', ascending=False)
      .groupby(['word1', 'word2'])
      .head(5)
      .reset_index(drop=True)
)

# print the quantiles of the counts: 20% steps across the whole range,
# followed by 1% steps from 91% upwards
quantile_levels = np.concatenate([np.arange(0, 1, 0.2), np.arange(0.91, 1, 0.01)])
for i in quantile_levels:
    print(f"Quantile {i*100:>5.1f}%: {top_df['count'].quantile(i):.2f}")

Quantile   0.0%: 1.00
Quantile  20.0%: 1.00
Quantile  40.0%: 1.00
Quantile  60.0%: 1.00
Quantile  80.0%: 1.00
Quantile  91.0%: 2.00
Quantile  92.0%: 2.00
Quantile  93.0%: 2.00
Quantile  94.0%: 3.00
Quantile  95.0%: 3.00
Quantile  96.0%: 4.00
Quantile  97.0%: 5.00
Quantile  98.0%: 7.00
Quantile  99.0%: 13.00
CPU times: total: 13.4 s
Wall time: 13.6 s


In [5]:
# based on the quantiles, use 3 as the threshold for the simplified data;
# the comparison is strict, so only trigrams seen at least 4 times are kept
COUNT_THRESHOLD = 3
simplifed_df = top_df[top_df['count'] > COUNT_THRESHOLD]

# print the number of rows at each stage of the reduction
print('orginal dataset rows:', df.shape[0])
print('top 5 dataset rows:', top_df.shape[0])
print('simplifed dataset rows:', simplifed_df.shape[0])

# # save the simplified dataset
# simplifed_df.to_csv('simplified_data.csv')

orginal rows: 13778311
head 5 rows: 7693892
simplifed rows: 328781


# Phrase Prediction

In [6]:
# define the function of checking the input text
def word_check(input_text):
    """Normalise the input text and return its two words.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the rest is split on whitespace.

    Args:
        input_text: string expected to contain exactly two words.

    Returns:
        A ``(first_word, second_word)`` tuple of the cleaned words.

    Raises:
        ValueError: if fewer or more than two words remain after cleaning.
    """
    tokens = re.sub(r'[^\w\s]', '', input_text.lower()).split()

    # happy path first: exactly two words
    if len(tokens) == 2:
        first, second = tokens
        return first, second

    if len(tokens) < 2:
        raise ValueError("Invalid input: please provide at least two words.")
    raise ValueError("Invalid input: please provide no more than two words.")

In [66]:
# define the function of word prediction
def word_prediction(input_text):
    """Predict the word that follows a two-word phrase.

    Selects the rows of ``simplifed_df`` whose ``word1``/``word2`` equal the
    two cleaned input words and reports each ``word3`` together with its
    ``count_ratio`` expressed as a percentage.

    Args:
        input_text: string containing exactly two words (validated and
            cleaned by ``word_check``).

    Returns:
        A tuple ``(predicted_table, total_ratio)``: a dataframe with columns
        ``'predicted word'`` and ``'predicted ratio'`` (percent, rounded to
        2 decimals), and the sum of that ratio column rounded to 2 decimals.
        The table is empty and the total is 0 when nothing matches.

    Raises:
        ValueError: propagated from ``word_check`` when the input does not
            contain exactly two words.
    """
    first_word, second_word = word_check(input_text)

    # filter the rows and pick the two needed columns in a single .loc call
    is_match = (simplifed_df['word1'] == first_word) & (simplifed_df['word2'] == second_word)
    predicted_table = (
        simplifed_df.loc[is_match, ['word3', 'count_ratio']]
        .rename(columns={'word3': 'predicted word', 'count_ratio': 'predicted ratio'})
        .reset_index(drop=True)
    )

    # vectorised fraction -> percent conversion (replaces a per-element lambda)
    predicted_table['predicted ratio'] = predicted_table['predicted ratio'].mul(100).round(2)
    return predicted_table, round(predicted_table['predicted ratio'].sum(), 2)

In [63]:
# example two-word phrase passed to word_prediction and sentence_generation below
input_text = 'Information is'

In [64]:
# predict the next word for input_text; the result is a (table, total ratio) pair
word_prediction(input_text)

(  predicted word  predicted ratio
 0            not             8.45
 1            the             7.04,
 15.489999999999998)

# Sentence Generation

In [42]:
# define the function of generating sentence
def sentence_generation(input_text, max_words=100):
    """Grow a sentence from a two-word phrase by chaining trigram lookups.

    Starting from the two input words, a next word is repeatedly drawn
    uniformly at random from the ``word3`` values in ``simplifed_df`` that
    follow the current word pair; the pair then slides forward by one word.

    Args:
        input_text: string containing exactly two words (validated and
            cleaned by ``word_check``).
        max_words: maximum number of words appended after the two input
            words. Guards against chains that cycle and never reach a pair
            without a continuation. Pass ``None`` for no limit.

    Returns:
        The generated sentence as a single space-joined string. The input
        words appear as typed; appended words come from ``simplifed_df``.

    Raises:
        ValueError: propagated from ``word_check`` when the input does not
            contain exactly two words.
    """
    first_word, second_word = word_check(input_text)

    # the sentence starts with the words exactly as the user typed them
    create_text_list = input_text.split()

    appended = 0
    while max_words is None or appended < max_words:
        # look the candidates up once per step (not once for the loop test
        # and again for the body)
        is_match = (simplifed_df['word1'] == first_word) & (simplifed_df['word2'] == second_word)
        third_word_list = simplifed_df.loc[is_match, 'word3'].tolist()

        # no known continuation for this pair: the sentence is finished
        if not third_word_list:
            break

        third_word = random.choice(third_word_list)
        create_text_list.append(third_word)
        first_word, second_word = second_word, third_word
        appended += 1

    return ' '.join(create_text_list)

In [10]:
# generate a sentence that starts with the two words in input_text
sentence_generation(input_text)

'Information is the most amazing thing to happen to have an extra day of the week is the best part of a new phone is off limits to growth and development'

# Gradio Demo

In [67]:
# wrap a click handler so that a ValueError raised by input validation is
# re-raised as gr.Error, whose message Gradio displays to the user
def _show_validation_error(handler):
    def wrapped(input_text):
        try:
            return handler(input_text)
        except ValueError as exc:
            raise gr.Error(str(exc)) from exc
    return wrapped


# two side-by-side panels: next-word prediction and sentence generation
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            gr.Markdown("## Word Prediction")
            input1 = gr.Textbox(label="Please input 2 words")
            output1 = gr.Dataframe(headers=["predicted word", "predicted ratio"], label = "Prediction Table")
            # named distinctly so it is not shadowed by the sentence textbox below
            output1_total = gr.Number(label = "Prediction Total Ratio")
            btn1 = gr.Button("Predict")
            btn1.click(fn = _show_validation_error(word_prediction), inputs = input1, outputs=[output1, output1_total])

        with gr.Column():
            gr.Markdown("## Sentence Generation")
            input2 = gr.Textbox(label="Please input 2 words")
            output2 = gr.Textbox(label="Generated Sentence")
            btn2 = gr.Button("Generate")
            btn2.click(fn = _show_validation_error(sentence_generation), inputs = input2, outputs = output2)

demo.launch(inline = True)

* Running on local URL:  http://127.0.0.1:7884
* To create a public link, set `share=True` in `launch()`.


