In [1]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re

In [2]:
# Load the test reviews; later cells read the `website_name` and `text` columns.
x_test_df = pd.read_csv(filepath_or_buffer='data_reviews/x_test.csv')

In [3]:
# Special characters and punctuation
df = x_test_df.copy()
punc_list = list("?:!.,;()")

# Every substitution is a *literal* replacement with a single space.
# regex=False is passed explicitly because the default of Series.str.replace
# differs between pandas versions, and "?", ".", "(" and ")" are regex
# metacharacters that must not be interpreted as patterns.
# Order matters: "'s" is removed before lowercasing, so an upper-case "'S" is
# deliberately left alone, exactly as before.
text_1 = df["text"]
for literal in ["\n", '"', "'s"] + punc_list:
    text_1 = text_1.str.replace(literal, " ", regex=False)

#lowering cases
df["text_1"] = text_1.str.lower()
df.head()

Unnamed: 0,website_name,text,text_1
0,imdb,"Technically, the film is well made with impres...",technically the film is well made with impres...
1,yelp,!....THE OWNERS REALLY REALLY need to quit bei...,the owners really really need to quit bei...
2,amazon,what a disappointment,what a disappointment
3,imdb,The movie is terribly boring in places.,the movie is terribly boring in places
4,imdb,"One of the best mexican movies ever!, and one ...",one of the best mexican movies ever and one ...


In [4]:
#stemming and lemmatization
# Resources needed by this cell:
#   - 'wordnet' backs WordNetLemmatizer.
#   - the averaged-perceptron tagger backs nltk.pos_tag, which get_wordnet_pos
#     calls; without it a fresh environment fails with LookupError.
#     The resource is published under two ids depending on the NLTK release,
#     so both are requested; nltk.download reports an unknown id without
#     raising by default.
#   - 'punkt' is kept from the original cell.
for resource in ('punkt',
                 'wordnet',
                 'averaged_perceptron_tagger',
                 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for one word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects adjective / noun / verb / adverb. Any other
    tag falls back to noun. An empty string yields ``''`` so callers can skip
    the token.
    """
    # Empty tokens cannot be tagged; signal that with an empty string.
    if word == '':
        return ''

    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

wordnet_lemmatizer = WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each space-separated token of ``text``.

    Tokens for which ``get_wordnet_pos`` returns ``''`` (the empty strings
    produced by consecutive spaces) are dropped, so the result is joined with
    single spaces only.
    """
    lemmas = []
    for token in text.split(" "):
        pos = get_wordnet_pos(token)
        if pos != '':
            lemmas.append(wordnet_lemmatizer.lemmatize(token, pos=pos))
    return " ".join(lemmas)


# Iterate over the column values directly instead of df.loc[row]['text_1']:
# that avoids chained label indexing, does not build a full row Series per
# review, and no longer assumes the index is exactly 0..n-1.
lemmatized_text_list = [lemmatize_text(text) for text in df['text_1']]

df['text_2'] = lemmatized_text_list
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/irenechang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/irenechang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,website_name,text,text_1,text_2
0,imdb,"Technically, the film is well made with impres...",technically the film is well made with impres...,technically the film be well make with impress...
1,yelp,!....THE OWNERS REALLY REALLY need to quit bei...,the owners really really need to quit bei...,the owner really really need to quit be sooooo...
2,amazon,what a disappointment,what a disappointment,what a disappointment
3,imdb,The movie is terribly boring in places.,the movie is terribly boring in places,the movie be terribly boring in place
4,imdb,"One of the best mexican movies ever!, and one ...",one of the best mexican movies ever and one ...,one of the best mexican movie ever and one of ...


In [5]:
#stopwords
# nltk.download('stopwords')
# stop_words = list(stopwords.words('english'))
# re_stop_words = re.compile(r"\b(" + "|".join(stop_words) + ")\\W", re.I)
# def removeStopWords(sentence):
#     # removing stop words
#     global re_stop_words
#     return re_stop_words.sub(" ", sentence)
# df['text_3'] = df['text_2'].apply(removeStopWords)
# for stop_word in stop_words:
#     regex_stopword = r"\b" + stop_word + r"\b"
#     df['text_3'] = df['text_2'].str.replace(regex_stopword, '')

#remove numbers
pattern = r'[0-9]'
# Delete every ASCII digit from each lemmatized review (nothing is inserted in
# its place, so surrounding characters close up).
remove_number = [re.sub(pattern, '', lemmatized) for lemmatized in df["text_2"].tolist()]

df["text_3"] = remove_number
df

Unnamed: 0,website_name,text,text_1,text_2,text_3
0,imdb,"Technically, the film is well made with impres...",technically the film is well made with impres...,technically the film be well make with impress...,technically the film be well make with impress...
1,yelp,!....THE OWNERS REALLY REALLY need to quit bei...,the owners really really need to quit bei...,the owner really really need to quit be sooooo...,the owner really really need to quit be sooooo...
2,amazon,what a disappointment,what a disappointment,what a disappointment,what a disappointment
3,imdb,The movie is terribly boring in places.,the movie is terribly boring in places,the movie be terribly boring in place,the movie be terribly boring in place
4,imdb,"One of the best mexican movies ever!, and one ...",one of the best mexican movies ever and one ...,one of the best mexican movie ever and one of ...,one of the best mexican movie ever and one of ...
...,...,...,...,...,...
595,yelp,This is a great restaurant at the Mandalay Bay.,this is a great restaurant at the mandalay bay,this be a great restaurant at the mandalay bay,this be a great restaurant at the mandalay bay
596,yelp,I could care less... The interior is just beau...,i could care less the interior is just beau...,i could care less the interior be just beautiful,i could care less the interior be just beautiful
597,imdb,The only consistent thread holding the series ...,the only consistent thread holding the series ...,the only consistent thread hold the series tog...,the only consistent thread hold the series tog...
598,yelp,My side Greek salad with the Greek dressing wa...,my side greek salad with the greek dressing wa...,my side greek salad with the greek dress be so...,my side greek salad with the greek dress be so...


In [6]:
# Keep only the source site and the fully cleaned text, and expose the cleaned
# text under the plain column name 'text'.
list_columns = ["website_name", "text_3"]
df_clean = (
    df.copy()
    .loc[:, list_columns]
    .rename(columns={'text_3': 'text'})
)
df_clean.head()

Unnamed: 0,website_name,text
0,imdb,technically the film be well make with impress...
1,yelp,the owner really really need to quit be sooooo...
2,amazon,what a disappointment
3,imdb,the movie be terribly boring in place
4,imdb,one of the best mexican movie ever and one of ...


In [7]:
from spellchecker import SpellChecker

# pyspellchecker supports an edit distance of 1 or 2 only and silently falls
# back to 2 for any other value, so the former distance=4 was already running
# as 2. Spell it out to avoid suggesting a wider search than actually happens.
spell = SpellChecker(distance=2)

# Memoise corrections: the same words recur across reviews and looking up a
# correction for an unknown word is the expensive part of this cell.
correction_cache = {}


def correct_word(word):
    """Return the spell-checked form of ``word``, or ``word`` itself if none.

    Newer pyspellchecker releases return ``None`` from ``correction`` when no
    candidate exists; older ones return the input word. Falling back to the
    input keeps both behaviours identical and keeps ``' '.join`` from raising
    ``TypeError`` on ``None``.
    """
    if word not in correction_cache:
        correction_cache[word] = spell.correction(word) or word
    return correction_cache[word]


# Assign the whole column at once rather than writing through
# df_clean.loc[index, ...] with a positional counter, which only lines up with
# the rows when the index is exactly 0..n-1.
df_clean['text'] = [
    ' '.join(correct_word(word) for word in spell.split_words(txt))
    for txt in df_clean['text']
]

In [8]:
nltk.download('stopwords')
stop_words = [*stopwords.words('english')]
# Words taken OUT of the stop-word list, i.e. words that a stop-word filter
# built from `stop_words` would then leave in the text.
to_remove = ['against', 'into', 'above', 'below', 'up', 'down', 'on', 'off', 'again', 'few', 'more', 'most', 'no', 'not', 'only', 'same']
for word in to_remove:
    # index() raises ValueError if the word is not present, like list.remove().
    del stop_words[stop_words.index(word)]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/irenechang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Load the vocabulary file; the following cells use only its column names.
clean = pd.read_csv(filepath_or_buffer='important_words.csv')

In [12]:
# One row per cleaned review, one column per column of `clean`, all 0.0.
n_reviews = len(df_clean)
n_features = clean.shape[1]
zeros = np.zeros(shape=(n_reviews, n_features))
dummies = pd.DataFrame(data=zeros, columns=clean.columns)

In [13]:
# Mark, for every review, which vocabulary columns occur in its text.
# The column lookup is built once as a set; rebuilding
# dummies.columns.tolist() for every word of every review made each
# membership test a fresh linear scan over all columns.
vocabulary = set(dummies.columns)
for index, review_text in enumerate(df_clean.text):
    # Intersecting with the token set also skips repeated words, which would
    # only re-assign the same 1.
    for word in vocabulary.intersection(review_text.split(' ')):
        dummies.loc[index, word] = 1

In [14]:
# Persist the indicator matrix without the row index column.
dummies.to_csv(path_or_buf='test_data.csv', index=False)

In [17]:
# x_test_df.text.values