## Importing All the Needed Libraries

In [2]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

## Loading the Data

In [4]:
# Read the tab-separated reviews file into a DataFrame.
reviews_data = pd.read_csv("data.tsv", delimiter="\t")

# Give the two columns short names: the review text and its label
# (presumably 1 = liked, 0 = not liked -- confirm against the data source).
reviews_data.columns = ["review", "liked"]

# Preview the first rows as the cell output.
reviews_data.head()

Unnamed: 0,review,liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


## Prepare the Data

In [9]:
def clean_review(review):
    """
    Clean a raw review so it can be fed to a bag-of-words model.

    Steps, in order:
    1. Replace every character that is not an ASCII letter with a space
    2. Convert the review to lower case
    3. Split the review into tokens
    4. Drop English stop words
    5. Stem the remaining tokens with the Porter stemmer

    Note: the NLTK tokenizer data and the "stopwords" corpus must be
    available locally (e.g. installed via nltk.download) for this to run.

    Args:
        review: the raw review text that will be cleaned
    Returns:
        the cleaned review: the stemmed, non-stop-word tokens joined by
        single spaces (an empty string if no token survives).
    """

    letters_only = re.sub("[^A-Za-z]", " ", review)
    tokens = word_tokenize(letters_only.lower())

    # Build the stop-word set once, outside the comprehension, so the
    # stop-word list is not reloaded and re-hashed for every single token.
    stop_words = set(stopwords.words("english"))
    stemmer = PorterStemmer()

    stems = [stemmer.stem(token) for token in tokens if token not in stop_words]
    return " ".join(stems)

In [14]:
# Sanity check on a single example: show the raw text of the review with
# index label 2, then the same review after cleaning.
review = reviews_data["review"][2]
print(review)

cleaned_review = clean_review(review)
print(cleaned_review)

Not tasty and the texture was just nasty.
tasti textur nasti


In [16]:
# Clean every review in the dataset. Iterating over the column's values
# (instead of looking each one up by integer label) is simpler and also
# works when the DataFrame index is not the default 0..n-1 range.
corpus = [clean_review(raw_review) for raw_review in reviews_data["review"]]

# Preview the first few cleaned reviews as the cell output.
corpus[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

## Extracting Features Using the Bag-of-Words Model

In [3]:
# Learn the vocabulary from the cleaned corpus, then encode every review as a
# row of token counts. .toarray() turns the sparse count matrix into a dense
# array. This cell needs `corpus` from the cleaning step above, so it must be
# run after that cell.
count_vectorizer = CountVectorizer()
count_vectorizer.fit(corpus)
features = count_vectorizer.transform(corpus).toarray()

NameError: name 'corpus' is not defined

0.0019181585677749361