In [None]:
# Colab-only cell: asks the user to upload files into the runtime.
# NOTE(review): presumably this is how the CSV/TSV files read below get into the
# working directory; the `uploaded` result itself is not used anywhere else here.
from google.colab import files
uploaded = files.upload()

In [None]:
import pandas as pd
import numpy as np

In [None]:
import string
import random
import time
from typing import List

In [None]:
# Load the reviews dataset; the file is expected in the current working directory.
df = pd.read_csv('new_human_fake_reviews.csv')

In [None]:
# Select the 'text' column of the reviews frame.
# NOTE(review): the n-gram cells below do not use this Series (create_ngram_model
# reads the CSV file directly), and the name `text` is rebound to a generated
# string in the generation loop further down.
text = df['text']

In [None]:
# ideally we would use some smart text tokenizer, but for simplicity use this one
def tokenize(text: str) -> List[str]:
    """
    Split a sentence into tokens, treating every ASCII punctuation character
    as a token of its own.

    :param text: input sentence
    :return: word and punctuation tokens in their original order
    """
    # Pad each punctuation character with spaces so that the whitespace split
    # below separates it from any word it was attached to.
    padded_marks = {ord(mark): ' ' + mark + ' ' for mark in string.punctuation}
    return text.translate(padded_marks).split()

def get_ngrams(n: int, tokens: list) -> list:
    """
    Pair every token with the n-1 tokens that precede it.

    :param n: n-gram size
    :param tokens: tokenized sentence (the list passed in is not modified)
    :return: list of (context, target) pairs, one per token, where context is
        a tuple of the n-1 previous tokens; positions before the start of the
        sentence are filled with '<START>'
    """
    width = n - 1
    padded = width * ['<START>'] + tokens
    pairs = []
    for pos in range(width, len(padded)):
        # The `width` tokens immediately before `pos`, oldest first.
        history = tuple(padded[pos - width + k] for k in range(width))
        pairs.append((history, padded[pos]))
    return pairs


In [None]:
class NgramModel(object):
    """
    Count-based n-gram language model.

    Training sentences are tokenized and split into (context, target) pairs,
    where the context is the tuple of the n-1 tokens preceding the target.
    Text is generated by repeatedly sampling the next token from the targets
    that were observed after the current context.
    """

    def __init__(self, n):
        """
        :param n: n-gram size; each context holds the n-1 previous tokens
        """
        self.n = n

        # context tuple -> list of every target token seen after that context
        # (duplicates are kept, so the list length is the context's count)
        self.context = {}

        # (context, target) -> how many times that ngram has appeared so far
        self.ngram_counter = {}

    def update(self, sentence: str) -> None:
        """
        Updates Language Model with the ngrams of one sentence
        :param sentence: input text
        """
        n = self.n
        for ngram in get_ngrams(n, tokenize(sentence)):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :param context: tuple of the previous tokens
        :param token: candidate next token
        :return: count(context, token) / count(context), or 0.0 when the
            context or the (context, token) pair has never been seen
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence

        One uniform number is drawn and the candidates are walked in sorted
        order, accumulating their probabilities until the running total
        exceeds the drawn number.

        :param context: tuple of the previous tokens
        :return: the sampled token
        :raises KeyError: if the context was never seen during training
        """
        r = random.random()
        # Distinct candidates in a deterministic order; computing each
        # probability once is enough even if the token was seen many times.
        candidates = sorted(set(self.context[context]))
        if not candidates:
            raise KeyError(context)

        summ = 0
        for token in candidates:
            summ += self.prob(context, token)
            if summ > r:
                return token

        # Floating-point rounding can leave the accumulated total a hair
        # below 1.0, so a draw very close to 1.0 may never be exceeded.
        # Return the last candidate instead of falling through with None,
        # which would make ' '.join() in generate_text fail.
        return candidates[-1]

    def generate_text(self, token_count: int):
        """
        :param token_count: number of words to be produced
        :return: generated text, tokens joined by single spaces
        :raises KeyError: if generation reaches a context that was never seen
            during training (including the start context of an empty model)
        """
        n = self.n
        context_queue = (n - 1) * ['<START>']
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                if obj == '.':
                    # A full stop ends the sentence: start the next one from
                    # the all-'<START>' context.
                    context_queue = (n - 1) * ['<START>']
                else:
                    # Slide the window: drop the oldest token, add the new one.
                    context_queue.pop(0)
                    context_queue.append(obj)
        return ' '.join(result)


def create_ngram_model(n, path):
    """
    Build an NgramModel of size n from the text file at `path`.

    The whole file is read as raw text and cut into sentences at every '.';
    each sentence gets its full stop back before being fed to the model.
    NOTE(review): because the file is read as raw text, a CSV/TSV input
    contributes its header and every column, not only the review text.

    :param n: n-gram size passed to NgramModel
    :param path: path of the text file to train on
    :return: the trained NgramModel (left empty if the file contains no
        non-blank sentence)
    """
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()

    for sentence in text.split('.'):
        # Runs of full stops ("...") and a trailing '.' leave blank fragments.
        # Training on them would teach the model that a sentence can consist
        # of nothing but '.', so skip them.
        if not sentence.strip():
            continue
        # add back the fullstop
        m.update(sentence + '.')
    return m


In [None]:
if __name__ == "__main__":
    # Time how long it takes to build a trigram (n=3) model from the reviews file.
    start = time.time()
    m = create_ngram_model(3, 'new_human_fake_reviews.csv')

    print (f'Language Model creating time: {time.time() - start}')
    # NOTE(review): the timer is restarted here but never read again in this cell.
    start = time.time()
    # Seed the random module so the sampled text is repeatable for a given model.
    random.seed(7)
    print(f'{"="*50}\nGenerated text:')
    # Sample 20 tokens from the model and print them as one string.
    print(m.generate_text(20))
    print(f'{"="*50}')


Language Model creating time: 0.009264230728149414
Generated text:
He gave me a haircut that made me feel a sense of confidence that I don ' t get food


In [None]:
# Build one model per n-gram size from 1 to 49 and sample 50 tokens from each.
# The model is rebuilt from the file on every iteration.
# NOTE(review): `m` and `text` are rebound here, replacing the trigram model and
# the `text` column selected in earlier cells.
generated_text = []
for i in np.arange(1, 50):
  m = create_ngram_model(i, 'new_human_fake_reviews.csv')
  text = m.generate_text(50)
  generated_text.append(text)


In [None]:
# One row per generated sample, in the order of the n-gram sizes used above.
txt = pd.DataFrame(data = generated_text)
# Show full cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)
txt.head(5)

Unnamed: 0,0
0,"will "" there and Beckie . as than I local new complains it , ' Once over fire ate looked friend another such place a like i though ruined and with . everything she Everything had "" order sitting like tier ate place manicures lived from the was good on"
1,""" Sorry , we were crafted with more dirt roads . "" "" Tacos were crafted with only greens and ordered their teriyaki bentos . I got us seated almost right . Definitely a long line out beverages , but wow what the piece for Henry ! I was so"
2,"I showed my stylist a picture and they have a great experience , location was super difficult to find and parking was really worth the overpriced ice cream . Every order also comes with chips . Meat was a little too much hair than I would have sued them for"
3,"WILL NOT COME BACK "" I was really hoping I could show off my new do with school starting again but no . And they even charged tip when I don ' t wanna give more of my money to this establishment . it did not match the amazing scents"
4,""" No complains here , the burgers were great ! ! "" Terrible service ! I had to wait 30min until my order was taken and then even after my plates were cleared I waited for nearly an hour to get my bill . My wife and I shared their"


In [None]:
# Sanity check: expect one row per n-gram size and a single text column.
txt.shape

(49, 1)

N-Gram with train.tsv


In [None]:
# Read train.tsv as tab-separated data with no header row, so the columns get
# integer labels.
df_1 = pd.read_csv('train.tsv', delimiter='\t', header=None)
# Keep only column 3 and the first 500 rows.
# NOTE(review): the n-gram cells below pass 'train.tsv' to create_ngram_model,
# which reads the whole file itself, so this 500-row subset does not limit what
# the model is trained on.
df_1 = df_1[[3]][0:500]
df_1.head()

Unnamed: 0,3
0,The grilled yellow tail salmon here is out of this world! I have eaten here twice in last 8 weeks and the sushi is fresh. The miso soup is great. Everything I have tried has been delicious. The staff is fairly friendly. Looking forward to happy hour here in the near future.
1,"what can I say - we're CF regulars...just love the place. I guess we are lucky because we have never had to wait more than 15-20 minutes for 3 of us. We eat here at least twice a month and we always start with the avocado egg rolls - they are to die for and the dipping sauce is yummy! I usually get a Glam burger - the only time I eat red meat - and so far have not had one I did not like. I often take 1/2 home so I can save room for some cheesecake, an absolute must when at CF. We have never had a bad cheesecake and try a different one each time. This is the only hamburger that I have liked the next day - usually hamburger is the one food that is not a good take home food as it just does not taste good as a left over - but I have actually eaten it cold because I took a bite before heating it and just kept going...yes, weird I know. If nothing else, go there for some avocado egg rolls and then treat yourself to a cheesecake....I'm totally addicted! Absolute divine decadence! ;D"
2,"The only thing you can say critically about the Thai House is the location isn't a home run. But get past that, and get inside because the food is top shelf and the recepies are authentic making this a fave five spot for my family. Start with a Thai Iced Tea, try a spicy soup, and don't miss the curry - but beware of the heat index and order according to your ability to enjoy flame in your flavor! My personal favorite is tradtional Pad Thai - the perfect balance of peanut and chili spice with sticky noodles & chicken..."
3,"Brought a group to Metro for brunch, made the reservation a month ago, called three days ago to update and ask to be outside and people arrived and were told there was no reservation. Finally spoke with the manager and had that sorted out but when I asked for our tables outside and they couldn't accommodate. Having the bartender wait on a party of 18 was idiotic. Not enough menus, slow, horrible drinks, no silverware, soup was burnt-how do you burn soup??? Attempting to fix the situation with beignets was a nice attempt until we heard the server swearing about having to deal with us. I host events all over the valley and never have I experienced service as horrible as I did at Metro. They have recently changed management, unfortunately the great reviews I read when I made the reservation don't hold true anymore. Update 11/8/2010 Since my first review I received a $50 gift certificate from the management of Metro. I decided to go for dessert last Friday night with 5 of my friends. I knew I couldn't just go with one other person, it had to be a group to show what they could do. We arrived at 9:15 and were seated on the patio. I met the general manager and she was pleasant enough, I never saw her again. There was no one seated inside and the patio was half full. Jesse greeted us and was the most attentive and understanding server when two guests needed to return their drinks to the bar because they were not made properly - it was a specialty drink from their cocktail menu. We ordered beignets, apple bread pudding and triple chocolate delight. The beignets and apple bread pudding were good. The triple chocolate delight had absolutely no chocolate flavor at all. Jesse was by far an excellent server and if you go ask to be seated in his section."
4,Great food and even better beer. Staff are always friendly. My favorite microbrewery in the Valley.


In [None]:
#text_1 = df_1['text']

In [None]:
# ideally we would use some smart text tokenizer, but for simplicity use this one
def tokenize(text: str) -> List[str]:
    """
    Break a sentence into tokens; each ASCII punctuation character becomes
    a separate token.

    :param text: input sentence
    :return: list of tokens in reading order
    """
    # NOTE(review): this cell redefines tokenize; an earlier cell in this
    # notebook already defines a function with the same name and behavior.
    spaced = ''.join(
        ' ' + ch + ' ' if ch in string.punctuation else ch
        for ch in text
    )
    return spaced.split()

def get_ngrams(n: int, tokens: list) -> list:
    """
    Build the (context, target) pairs of a tokenized sentence.

    :param n: n-gram size
    :param tokens: tokenized sentence (left unmodified)
    :return: list with one (context, target) pair per token; context is the
        tuple of the n-1 tokens before the target, padded with '<START>'
        at the beginning of the sentence
    """
    # NOTE(review): this cell redefines get_ngrams; an earlier cell in this
    # notebook already defines a function with the same name and behavior.
    span = n - 1
    seq = span * ['<START>'] + tokens
    return [
        (tuple(seq[first + offset] for offset in range(span)), seq[first + span])
        for first in range(len(seq) - span)
    ]

In [None]:
class NgramModel(object):
    """
    Count-based n-gram language model.

    Training sentences are tokenized and split into (context, target) pairs,
    where the context is the tuple of the n-1 tokens preceding the target.
    Text is generated by repeatedly sampling the next token from the targets
    that were observed after the current context.

    NOTE(review): this cell redefines NgramModel; an earlier cell in this
    notebook already defines a class with the same name.
    """

    def __init__(self, n):
        """
        :param n: n-gram size; each context holds the n-1 previous tokens
        """
        self.n = n

        # context tuple -> list of every target token seen after that context
        # (duplicates are kept, so the list length is the context's count)
        self.context = {}

        # (context, target) -> how many times that ngram has appeared so far
        self.ngram_counter = {}

    def update(self, sentence: str) -> None:
        """
        Updates Language Model with the ngrams of one sentence
        :param sentence: input text
        """
        n = self.n
        for ngram in get_ngrams(n, tokenize(sentence)):
            self.ngram_counter[ngram] = self.ngram_counter.get(ngram, 0.0) + 1.0

            prev_words, target_word = ngram
            self.context.setdefault(prev_words, []).append(target_word)

    def prob(self, context, token):
        """
        Calculates probability of a candidate token to be generated given a context
        :param context: tuple of the previous tokens
        :param token: candidate next token
        :return: count(context, token) / count(context), or 0.0 when the
            context or the (context, token) pair has never been seen
        """
        try:
            count_of_token = self.ngram_counter[(context, token)]
            count_of_context = float(len(self.context[context]))
            result = count_of_token / count_of_context

        except KeyError:
            result = 0.0
        return result

    def random_token(self, context):
        """
        Given a context we "semi-randomly" select the next word to append in a sequence

        One uniform number is drawn and the candidates are walked in sorted
        order, accumulating their probabilities until the running total
        exceeds the drawn number.

        :param context: tuple of the previous tokens
        :return: the sampled token
        :raises KeyError: if the context was never seen during training
        """
        r = random.random()
        # Distinct candidates in a deterministic order; computing each
        # probability once is enough even if the token was seen many times.
        candidates = sorted(set(self.context[context]))
        if not candidates:
            raise KeyError(context)

        summ = 0
        for token in candidates:
            summ += self.prob(context, token)
            if summ > r:
                return token

        # Floating-point rounding can leave the accumulated total a hair
        # below 1.0, so a draw very close to 1.0 may never be exceeded.
        # Return the last candidate instead of falling through with None,
        # which would make ' '.join() in generate_text fail.
        return candidates[-1]

    def generate_text(self, token_count: int):
        """
        :param token_count: number of words to be produced
        :return: generated text, tokens joined by single spaces
        :raises KeyError: if generation reaches a context that was never seen
            during training (including the start context of an empty model)
        """
        n = self.n
        context_queue = (n - 1) * ['<START>']
        result = []
        for _ in range(token_count):
            obj = self.random_token(tuple(context_queue))
            result.append(obj)
            if n > 1:
                if obj == '.':
                    # A full stop ends the sentence: start the next one from
                    # the all-'<START>' context.
                    context_queue = (n - 1) * ['<START>']
                else:
                    # Slide the window: drop the oldest token, add the new one.
                    context_queue.pop(0)
                    context_queue.append(obj)
        return ' '.join(result)


def create_ngram_model(n, path):
    """
    Build an NgramModel of size n from the text file at `path`.

    The whole file is read as raw text and cut into sentences at every '.';
    each sentence gets its full stop back before being fed to the model.
    NOTE(review): because the file is read as raw text, a CSV/TSV input
    contributes its header and every column, not only the review text.
    NOTE(review): this cell redefines create_ngram_model; an earlier cell in
    this notebook already defines a function with the same name.

    :param n: n-gram size passed to NgramModel
    :param path: path of the text file to train on
    :return: the trained NgramModel (left empty if the file contains no
        non-blank sentence)
    """
    m = NgramModel(n)
    with open(path, 'r') as f:
        text = f.read()

    for sentence in text.split('.'):
        # Runs of full stops ("...") and a trailing '.' leave blank fragments.
        # Training on them would teach the model that a sentence can consist
        # of nothing but '.', so skip them.
        if not sentence.strip():
            continue
        # add back the fullstop
        m.update(sentence + '.')
    return m


In [None]:
if __name__ == "__main__":
    # Time how long it takes to build a trigram (n=3) model from train.tsv.
    start = time.time()
    m = create_ngram_model(3, 'train.tsv')

    print (f'Language Model creating time: {time.time() - start}')
    # NOTE(review): the timer is restarted here but never read again in this cell.
    start = time.time()
    # Seed the random module so the sampled text is repeatable for a given model.
    random.seed(7)
    print(f'{"="*50}\nGenerated text:')
    # Sample 20 tokens from the model and print them as one string.
    print(m.generate_text(20))
    print(f'{"="*50}')


Language Model creating time: 8.66172456741333
Generated text:
Delicious ! I ' m hard up and allow the blood , dealing with a cup of water before getting


In [None]:
# Build one model per n-gram size from 1 to 999 and sample 1000 tokens from each.
# The model is rebuilt from the file on every iteration.
# NOTE(review): the recorded output above shows the n=3 build on train.tsv taking
# about 8.7 s, so 999 rebuilds are likely to run for a very long time.
generated_text = []
for i in np.arange(1, 1000):
  m = create_ngram_model(i, 'train.tsv')
  text = m.generate_text(1000)
  generated_text.append(text)

In [None]:
# One row per generated sample, in the order of the n-gram sizes used above.
txt = pd.DataFrame(data = generated_text)
# Show full cell contents instead of truncating long strings.
pd.set_option('display.max_colwidth', None)
txt.head(5)

In [None]:
# Sanity check: expect one row per n-gram size and a single text column.
txt.shape