# Data Preprocessing

In [1]:
import nltk
import csv
import ssl ## workaround for my specific authorization issue
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

DATA_PATH = 'out.csv'
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv(DATA_PATH)
####### FOR TESTING ONLY ###########
#df = df.head(100)

####### DROP USELESS COLUMNS  ###########
df = df.drop('review_id', axis=1)
df = df.drop('business_id', axis=1)
df = df.drop('funny', axis=1)
df = df.drop('cool', axis=1)
df = df.drop('date', axis=1)

[nltk_data] Error loading stopwords: <urlopen error [Errno 61]
[nltk_data]     Connection refused>
[nltk_data] Error loading wordnet: <urlopen error [Errno 61]
[nltk_data]     Connection refused>


### NEGATION HANDLING

In [2]:
from nltk.util import pairwise
NEGATE = {
        "aint",
        "arent",
        "cannot",
        "cant",
        "couldnt",
        "darent",
        "didnt",
        "doesnt",
        "ain't",
        "aren't",
        "can't",
        "couldn't",
        "daren't",
        "didn't",
        "doesn't",
        "dont",
        "hadnt",
        "hasnt",
        "havent",
        "isnt",
        "mightnt",
        "mustnt",
        "neither",
        "don't",
        "hadn't",
        "hasn't",
        "haven't",
        "isn't",
        "mightn't",
        "mustn't",
        "neednt",
        "needn't",
        "never",
        "none",
        "nope",
        "nor",
        "not",
        "nothing",
        "nowhere",
        "oughtnt",
        "shant",
        "shouldnt",
        "uhuh",
        "wasnt",
        "werent",
        "oughtn't",
        "shan't",
        "shouldn't",
        "uh-uh",
        "wasn't",
        "weren't",
        "without",
        "wont",
        "wouldnt",
        "won't",
        "wouldn't",
        "rarely",
        "seldom",
        "despite",
    }

def negated(word, include_nt=True):
    """
    Determine if input contains negation words
    """
    neg_words = NEGATE
    if word.lower() in neg_words:
        return True
    if include_nt:
        if "n't" in word.lower():
            return True
    return False


def toNegate(stri):
    tt = stri.split(" ")
    for i in range(len(tt)-1):
        tt[i]=tt[i].lower()
        if negated(tt[i]):
            tt[i+1] = '!' + tt[i+1]
    return tt

def join(stri):
    tt = ""
    for item in stri:
        tt += item + " "
    return tt

df['text_negated'] = df["text"].apply(toNegate)
df['text_negated'] = df["text_negated"].apply(join)
df = df.drop('text', axis=1) ## SAVE SPACE BY DROPPING THIS COLUMN.

In [3]:
####### TOKENIZATION  #########
df['tokenized_text'] = df['text_negated'].apply(word_tokenize)
####### STOPWORDS REMOVAL  ###########
stop = stopwords.words('english')
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [item for item in x if item not in stop])
####### PUNCTUATION REMOVAL ###########
import string
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [item for item in x if item not in string.punctuation])
####### STEMMING  ###########
stemmer = PorterStemmer()
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [stemmer.stem(y) for y in x])
####### Lemmatizating  ###########
lmtzr = WordNetLemmatizer()
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [lmtzr.lemmatize(y) for y in x])
######## LISTS JOIN AS SINGLE STRING ############
df['tokenized_text_string'] = [' '.join(map(str, l)) for l in df['tokenized_text']]
##0# POLARITY ###
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
df['compound'] = [analyzer.polarity_scores(x)['compound'] for x in df['tokenized_text_string']]
df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['tokenized_text_string']]
df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['tokenized_text_string']]
df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['tokenized_text_string']]
##########################
from rake_nltk import Rake
def star_revised_logic(x):
    if x == 3:
        return 0
    elif x>3:
        return 1
    else:
        return -1
r = Rake()
r.extract_keywords_from_text(df['tokenized_text_string'][0])
structure = r.get_ranked_phrases()[0:10]
df['star_revised'] = df['stars'].apply(lambda x: star_revised_logic(x))
##########################

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/diwang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# word2vec

### Params -
1. size: The number of dimensions of the embeddings and the default is 100.
2. window: The maximum distance between a target word and words around the target word. The default window is 5.
3. min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
4. workers: The number of partitions during training and the default workers is 3.
5. sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.


### Getting word2vec model

In [4]:
from gensim.models import Word2Vec
import time

size = 100
window = 3
min_count = 1
workers = 3
sg = 0

word2vec_model_file = 'word2vec' + str(size) + '.model'
start_time = time.time()
stemmed_tokens = df['tokenized_text_string'].str.split()
w2v_model = Word2Vec(stemmed_tokens, min_count=min_count, size=size, workers = workers, window=window, sg=sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 61.65305757522583


### Reload word2vec model

In [3]:
import numpy as np

sg_w2v_model = Word2Vec.load(word2vec_model_file)

print("Index of the word 'food': ")
print(sg_w2v_model.wv.vocab["food"].index)
print(len(sg_w2v_model.wv.vocab))
print("Length of the vector generated for a word")
print(len(sg_w2v_model['food']))
print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model[token] for token in stemmed_tokens[0]], axis=0))

Index of the word 'food': 
3
126515
Length of the vector generated for a word
100
Print the length after taking average of all word vectors in a sentence:
[-0.22678614 -0.2897635  -0.32459044  0.15575048 -0.38304508 -0.1541775
  0.36523107  0.06684726  0.12628984  0.00530223  0.5027002   0.3054193
  0.01242023  0.02354489 -0.14487053 -0.1689697  -0.1280526  -0.2797667
  0.46458685 -0.10119892  0.14930712  0.14399776 -0.40607435  0.03560648
  0.03004741  0.0279792  -0.17510611 -0.28204522  0.07279837 -0.02034456
  0.20961498 -0.1175883  -0.02026132  0.02836462  0.12384608  0.06297066
 -0.38680604 -0.09468395  0.02825443 -0.15541245  0.14202408  0.09927452
  0.38493225 -0.38259593  0.30376422 -0.06327409 -0.1925044   0.22711995
  0.19274372  0.19146255  0.08326176  0.23516697 -0.31119418  0.02592788
 -0.43836656 -0.16568975 -0.23031999 -0.15469138 -0.09247862 -0.17795675
  0.07299511  0.06540577  0.18265754  0.49318033 -0.18133762  0.3161753
 -0.12969163 -0.14465114  0.31130642  0.101414

  print(len(sg_w2v_model['food']))
  print(np.mean([sg_w2v_model[token] for token in stemmed_tokens[0]], axis=0))


### Vectorize all verses

In [4]:
def vecc(stem):
    return np.mean([sg_w2v_model[token] for token in stem], axis=0)
df['word2vec'] = df["tokenized_text"].apply(vecc)

  return np.mean([sg_w2v_model[token] for token in stem], axis=0)


In [5]:
compression_opts = dict(method='zip', archive_name='out.csv')
df.to_csv('0426.zip', index=False, compression=compression_opts)