In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from spacy.tokenizer import Tokenizer
from nltk.util import ngrams

In [2]:
# Load the 'en_core_web_lg' spaCy pipeline once; later cells call it as
# nlp(text) to obtain sentence spans and token lemmas.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Read the reviews table; the path is relative to the kernel's working directory.
# Later cells read its 'reviews' column.
reviews = pd.read_csv('data/reviews.csv')

In [13]:
# Keep only the 'reviews' column, leaving out rows whose value is the literal
# string '[]', and renumber the surviving rows from 0.
# NOTE: this rebinds `reviews` from a DataFrame to a Series, so the cell cannot
# be re-run without first re-running the cell that loads the CSV.
reviews = (
    reviews.loc[reviews['reviews'] != '[]', 'reviews']
    .reset_index(drop=True)
)

In [14]:
def clean_reviews(reviews):
    """Split one raw review string into its paragraphs.

    Parameters
    ----------
    reviews : str
        A single raw review value. It is expected to be wrapped in square
        brackets (the rows equal to '[]' are filtered out before this is
        called), with paragraphs separated by the escaped text
        backslash-r backslash-n, twice over. These are literal backslash
        characters in the data, not real carriage returns or newlines.

    Returns
    -------
    list of str
        The paragraphs in order. Quote characters that surround the text
        inside the brackets are left in place.
    """
    # Remove the enclosing brackets once, from the whole string. Stripping each
    # paragraph separately would also eat a '[' or ']' that legitimately begins
    # or ends an inner paragraph (e.g. "[note] ..." or "... see [1]").
    body = reviews.strip('][')
    return body.split('\\r\\n\\r\\n')

In [16]:
# Split every raw review into its list of paragraphs (one inner list per review).
reviews_clean = list(map(clean_reviews, reviews))
reviews_clean

[["'(As I\\'m writing this review, Darth Vader\\'s theme music begins to build in my mind...)",
  'Well, it actually has a title, what the Darth Vader theme. And that title is "The Imperial March", composed by the great John Williams, whom, as many of you may already know, also composed the theme music for "Jaws" - that legendary score simply titled, "Main Title (Theme From Jaws)".',
  'Now, with that lil\\\' bit of trivia aside, let us procede with the fabled film currently under review: Star Wars. It had been at a drive-in theater in some small Illinois town or other where my mother, my older brother, and I had spent our weekly "Movie Date Night" watching this George Lucas directed cult masterpiece from our car in the parking lot. On the huge outdoor screen, the film appeared to be a silent one, but thanks to an old wire-attached speaker, we were able to hear both the character dialogue and soundtrack loud and clear. We even had ourselves a carful of vittles and snacks - walked back 

In [22]:
# Pick the second paragraph of the first review as a small worked example.
rev = reviews_clean[0][1]

In [23]:
# Run the spaCy pipeline on the example paragraph and collect the sentence
# spans it yields.
doc = nlp(rev)
sentences = list(doc.sents)
sentences

[Well, it actually has a title, what the Darth Vader theme.,
 And that title is "The Imperial March", composed by the great John Williams, whom, as many of you may already know, also composed the theme music for "Jaws" - that legendary score simply titled, "Main Title (Theme From Jaws)".]

In [24]:
# Lemmas of every token that is neither a stop word nor punctuation.
tokens_clean_lemma = [
    token.lemma_
    for token in doc
    if not token.is_stop and not token.is_punct
]
tokens_clean_lemma

['actually',
 'title',
 'Darth',
 'Vader',
 'theme',
 'title',
 'Imperial',
 'March',
 'compose',
 'great',
 'John',
 'Williams',
 'know',
 'compose',
 'theme',
 'music',
 'Jaws',
 'legendary',
 'score',
 'simply',
 'title',
 'main',
 'title',
 'theme',
 'Jaws']

In [25]:
def append_ngrams(tokens):
    """Return the tokens followed by their bigrams and trigrams.

    Parameters
    ----------
    tokens : iterable of str
        The unigram tokens, in order. Any iterable is accepted, including
        tuples and generators.

    Returns
    -------
    list of str
        A new list: the original tokens, then every run of two adjacent
        tokens joined by a single space, then every run of three.
    """
    # Materialise once: a generator would be exhausted by the first ngrams()
    # pass, and a non-list sequence cannot be concatenated with the lists below.
    tokens = list(tokens)
    bigrams = [' '.join(gram) for gram in ngrams(tokens, 2)]
    trigrams = [' '.join(gram) for gram in ngrams(tokens, 3)]
    return tokens + bigrams + trigrams

In [26]:
# Show the unigrams, bigrams and trigrams built from the example paragraph's lemmas.
append_ngrams(tokens_clean_lemma)

['actually',
 'title',
 'Darth',
 'Vader',
 'theme',
 'title',
 'Imperial',
 'March',
 'compose',
 'great',
 'John',
 'Williams',
 'know',
 'compose',
 'theme',
 'music',
 'Jaws',
 'legendary',
 'score',
 'simply',
 'title',
 'main',
 'title',
 'theme',
 'Jaws',
 'actually title',
 'title Darth',
 'Darth Vader',
 'Vader theme',
 'theme title',
 'title Imperial',
 'Imperial March',
 'March compose',
 'compose great',
 'great John',
 'John Williams',
 'Williams know',
 'know compose',
 'compose theme',
 'theme music',
 'music Jaws',
 'Jaws legendary',
 'legendary score',
 'score simply',
 'simply title',
 'title main',
 'main title',
 'title theme',
 'theme Jaws',
 'actually title Darth',
 'title Darth Vader',
 'Darth Vader theme',
 'Vader theme title',
 'theme title Imperial',
 'title Imperial March',
 'Imperial March compose',
 'March compose great',
 'compose great John',
 'great John Williams',
 'John Williams know',
 'Williams know compose',
 'know compose theme',
 'compose theme mu