# Data Preprocessing

In [1]:
import nltk
import csv
import ssl ## workaround for my specific authorization issue
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

DATA_PATH = 'out.csv'
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv(DATA_PATH)
####### FOR TESTING ONLY ###########
#df = df.head(100)

####### DROP USELESS COLUMNS  ###########
df = df.drop('review_id', axis=1)
df = df.drop('business_id', axis=1)
df = df.drop('funny', axis=1)
df = df.drop('cool', axis=1)
df = df.drop('date', axis=1)
####### LOWERCASE  ###########
df['text-lowered'] = df['text'].str.lower()
####### TOKENIZATION  ###########
df['tokenized_text'] = df['text-lowered'].apply(word_tokenize)
df = df.drop('text-lowered', axis=1)
####### STOPWORDS REMOVAL  ###########
stop = stopwords.words('english')
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [item for item in x if item not in stop])
####### PUNCTUATION REMOVAL ###########
import string
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [item for item in x if item not in string.punctuation])
####### STEMMING  ###########
stemmer = PorterStemmer()
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [stemmer.stem(y) for y in x])
####### Lemmatizating  ###########
lmtzr = WordNetLemmatizer()
df['tokenized_text'] = df['tokenized_text'].apply(lambda x: [lmtzr.lemmatize(y) for y in x])
######## LISTS JOIN AS SINGLE STRING ############
df['tokenized_text_string'] = [' '.join(map(str, l)) for l in df['tokenized_text']]
### POLARITY ###
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
df['compound'] = [analyzer.polarity_scores(x)['compound'] for x in df['tokenized_text_string']]
df['neg'] = [analyzer.polarity_scores(x)['neg'] for x in df['tokenized_text_string']]
df['neu'] = [analyzer.polarity_scores(x)['neu'] for x in df['tokenized_text_string']]
df['pos'] = [analyzer.polarity_scores(x)['pos'] for x in df['tokenized_text_string']]
##########################
from rake_nltk import Rake
def star_revised_logic(x):
    if x == 3:
        return 0
    elif x>3:
        return 1
    else:
        return -1
r = Rake()
r.extract_keywords_from_text(df['tokenized_text_string'][0])
structure = r.get_ranked_phrases()[0:10]
df['star_revised'] = df['stars'].apply(lambda x: star_revised_logic(x))
##########################
compression_opts = dict(method='zip', archive_name='out.csv')
df.to_csv('0425.zip', index=False, compression=compression_opts)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/diwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/diwang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/diwang/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# word2vec

### Params -
1. size: The number of dimensions of the embeddings and the default is 100.
2. window: The maximum distance between a target word and words around the target word. The default window is 5.
3. min_count: The minimum count of words to consider when training the model; words with occurrence less than this count will be ignored. The default for min_count is 5.
4. workers: The number of partitions during training and the default workers is 3.
5. sg: The training algorithm, either CBOW(0) or skip gram(1). The default training algorithm is CBOW.


### Getting word2vec model

In [2]:
from gensim.models import Word2Vec
import time

size = 100
window = 3
min_count = 1
workers = 3
sg = 1

word2vec_model_file = 'word2vec' + str(size) + '.model'
start_time = time.time()
stemmed_tokens = df['tokenized_text_string'].str.split()
w2v_model = Word2Vec(stemmed_tokens, min_count=min_count, size=size, workers = workers, window=window, sg=sg)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 0.5043501853942871


### Reload word2vec model

In [3]:
import numpy as np

sg_w2v_model = Word2Vec.load(word2vec_model_file)

print("Index of the word 'food': ")
print(sg_w2v_model.wv.vocab["food"].index)
print(len(sg_w2v_model.wv.vocab))
print("Length of the vector generated for a word")
print(len(sg_w2v_model['food']))
print("Print the length after taking average of all word vectors in a sentence:")
print(np.mean([sg_w2v_model[token] for token in stemmed_tokens[0]], axis=0))

Index of the word 'food': 
4
2023
Length of the vector generated for a word
100
Print the length after taking average of all word vectors in a sentence:
[ 7.9359282e-03 -1.6347413e-03 -1.0476384e-03  7.6740729e-03
 -3.3623604e-03 -1.6317959e-03 -5.1636659e-03 -5.0062621e-03
  1.0712271e-03  5.4935734e-03  2.3654650e-03 -2.3675319e-03
 -3.0726084e-04  2.0927154e-03 -1.8430916e-03  8.2158030e-04
 -5.8790823e-03  7.4607111e-04  5.2704909e-03 -5.5219396e-03
 -2.9401504e-03  6.0420199e-03  1.4385673e-03  1.1921864e-03
  2.4162787e-03 -3.7967751e-03 -5.2548652e-03 -3.3770683e-03
 -6.5771939e-04 -5.7264920e-03 -1.4074090e-03 -7.5838887e-03
 -4.1027218e-03 -8.3763682e-04  1.4167560e-03 -3.1605226e-03
  5.7131723e-03 -1.5577336e-03  3.5226780e-03 -2.0072930e-03
 -3.7661747e-03  2.1296570e-03  9.6288045e-07  2.5317471e-03
 -2.7224957e-03 -3.6933930e-03 -1.2942123e-03  3.2137595e-03
 -1.8522376e-03 -4.3493593e-03 -4.3271121e-04  9.6427696e-03
 -1.3334079e-03  4.2522806e-03 -1.9908906e-03  3.52140

  print(len(sg_w2v_model['food']))
  print(np.mean([sg_w2v_model[token] for token in stemmed_tokens[0]], axis=0))


### Vectorize all verses

In [5]:
def vecc(stem):
    return np.mean([sg_w2v_model[token] for token in stem], axis=0)
df['word2vec'] = df["tokenized_text"].apply(vecc)

  return np.mean([sg_w2v_model[token] for token in stem], axis=0)
