In [1]:
import pyspark as ps
from pyspark import SparkConf, SparkContext
from __future__ import unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import json
import gzip
import spacy
%matplotlib inline
np.random.seed(32113)
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from spacy.en import English
from sklearn.grid_search import GridSearchCV
import string
import data_prep_for_test_run as dp 
parser = English()
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as pys_fun
import math



In [2]:
hc = ps.HiveContext(sc)
sql = ps.SQLContext(sc)

In [3]:
df = spark.read.csv('sparkprocess1.csv', header=True, inferSchema=True)

In [30]:
print((df.count(), len(df.columns)))

(4739, 111)


#### NOTES:
I notice something. I realize that there are 25 samples where reviewText is null(!?).  
I was not able to run TFIDF for this reason (or collect tokenized data)...  

In [4]:
df.filter(df.reviewText.isNotNull()).count()

4714

In [5]:
df.filter(df.reviewText.isNull()).count()

25

In [6]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----------+-------+-------------------+--------------------+------------------+-----------+----+-----------+------------+-----------------+-------------+-------------+-------------+----------------+---------+--------+---------------+---+------------------+------------+--------+----+------------------+-------------------+------------------+-----------------------+----------+-----------+----------+-----------+------------+-----------+-----------+-----+---------------+-------------+-----------+----------+----------+--------+--------+-------------+------------+--------+-------------------+---+--------+--------------+------------+-----+------------------------+----+-------+----------+--------+--------+---------+-------+------------+--------+--------------------+------------------+--------------+----+--------------+-------+--------+----------+----------+---------+--------------------+-----------------+-------+------------+-----------+-----+--------+----------+--------------+------------+---

In [7]:
df = df.na.drop(subset=["reviewText"])

In [8]:
df.filter(df.reviewText.isNull()).count()

0

In [12]:
df.schema.names

['reviewText',
 'overall',
 'helpful_total_votes',
 'num_of_helpful_votes',
 'helpful_percentage',
 'Text_length',
 'asin',
 'rank_values',
 'num_category',
 'Screen Protectors',
 'PlayStation 4',
 'PlayStation 3',
 'PlayStation 2',
 'Game Boy Advance',
 'Joysticks',
 'GameCube',
 'Commodore Amiga',
 '3DO',
 'Cases & Protectors',
 'More Systems',
 'Sony PSP',
 'LIVE',
 'Sega Master System',
 'Fitness Accessories',
 'Subscription Cards',
 'Points & Currency Cards',
 'Atari 2600',
 'Controllers',
 'Light Guns',
 'Nintendo 64',
 'Nintendo 3DS',
 'Linux Games',
 'Accessories',
 'Skins',
 'Steering Wheels',
 'Racing Wheels',
 'PlayStation',
 'Networking',
 'Atari 7800',
 'Xbox 360',
 'Chargers',
 'Kids & Family',
 'ColecoVision',
 'Gamepads',
 'Digital Games & DLC',
 'PC',
 'Consoles',
 'Game Boy Color',
 'Nintendo NES',
 'Drums',
 'MMO & Free-to-Play Games',
 'Xbox',
 'Sega CD',
 'Faceplates',
 'Game Boy',
 'Adapters',
 'Batteries',
 'Fire TV',
 'Sega Genesis',
 'Speakers',
 'Batteries & C

In [15]:
column222 = [u'reviewerID', u'asin', u'reviewerName', u'reviewText', u'overall',
       u'summary', u'helpful_total_review', u'num_of_helpful_review',
       u'helpful_percent', u'text_length', u'price', u'rank_values',
       u'num category', u'Sony PSP', u'PlayStation', u'LIVE', u'PlayStation 3',
       u'PlayStation 2', u'Game Boy Advance', u'Xbox 360', u'Joysticks',
       u'GameCube', u'PC Game Downloads', u'Chargers', u'Kids & Family',
       u'Remotes', u'Memory', u'Gamepads', u'Networking',
       u'Digital Games & DLC', u'Nintendo DS', u'Cases & Protectors',
       u'More Systems', u'PlayStation Vita', u'Adapters', u'Digital Games',
       u'Mac', u'PC', u'Consoles', u'Game Boy Color', u'Dance Mats',
       u'Nintendo NES', u'Drums', u'Interactive Gaming Figures', u'Xbox One',
       u'Screen Protectors', u'Mac Game Downloads', u'Downloadable Content',
       u'Fitness Accessories', u'MMO & Free-to-Play Games',
       u'Subscription Cards', u'Points & Currency Cards', u'Flight Controls',
       u'Currency Cards', u'Xbox', u'Controllers', u'Cables & Adapters',
       u'Games', u'Batteries & Chargers', u'Game Boy', u'Light Guns',
       u'Nintendo 64', u'PlayStation 4', u'Super Nintendo', u'Guitars',
       u'Wii U', u'Nintendo 3DS', u'Steering Wheels', u'Headsets',
       u'Accessories', u'Wii', u'Skins', u'Hardware', u'Linux Games',
       u'Batteries', u'Gaming Mice', u'Sega Genesis', u'Accessory Kits',
       u'Fire TV', u'Cables', u'Gaming Keyboards', u'Casual Games',
       u'Racing Wheels', u'Software', u'Video Games']

In [17]:
[co for co in df.schema.names if co not in column222]

['helpful_total_votes',
 'num_of_helpful_votes',
 'helpful_percentage',
 'Text_length',
 'num_category',
 'Commodore Amiga',
 '3DO',
 'Sega Master System',
 'Atari 2600',
 'Atari 7800',
 'ColecoVision',
 'Sega CD',
 'Faceplates',
 'Speakers',
 'Sega Game Gear',
 'PDAs',
 'Sega Dreamcast',
 'Atari 5200',
 'Keyboards',
 'Commodore 64',
 'Microphones',
 'Atari Lynx',
 'Intellivision',
 'NEOGEO Pocket',
 'TurboGrafx 16',
 'Sensor Bars',
 'Atari Jaguar',
 'Sega Saturn',
 'id',
 'software',
 'Video_Games',
 'below20',
 'below50',
 'below100',
 'below300',
 'price_unknown']

## Tokenizer and Stopword in Spark
Here, I am applying Tokenizer and stopword to my review contents.  
I realize that Spark tokenizer does not lemmatize/tokenize word accurately unlike SpaCy that I used previously.  
So I played around with it and decided to do following process:
1. tokenize review text.  
2. use stopwordremover to remove custom stopword I created with NLTK, SKlearn Stopwords.  
3. run stopwordremover 1 more time. This time, run it with the spark stopword.  
  
I also realize that running tokenizer does not take good care of special symbols (!, ., ?, etc) so I repeated the same procedure with RegexTokenizer with Pattern set as \\W+  
this should get rid of all special character that is in review contents.  

In [55]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [56]:
#ran both tokenizer and regextokenizer
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
tokenized = tokenizer.transform(df)

#the regextokenizer here takes out any non-word character from a token.
#for instance, Spoiler! --> spoiler
regtokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W+")
reg = regtokenizer.transform(df)

In [57]:
tokenized.select('words').show()

+--------------------+
|               words|
+--------------------+
|[this, does, not,...|
|[my, son, has, be...|
|[unfortunately, i...|
|[animal, crossing...|
|[got, this, game,...|
|[i'm, going, to, ...|
|[its, fun, and, i...|
|[the, all, new, s...|
|[the, starter, bu...|
|[atgames, is, a, ...|
|[i've, had, grand...|
|[**update:, many,...|
|[no, spoilers!, r...|
|[as, you'll, prob...|
|[i've, been, an, ...|
|[when, call, of, ...|
|[it's, amazing, w...|
|[imagine, you, ha...|
|[in, 2005,, f.e.a...|
|[i, know, that, a...|
+--------------------+
only showing top 20 rows



In [58]:
#my STOPLIST from the original project. mix NLTK and SKlearn english stopwords.
STOPLIST = list(set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + \
                list(ENGLISH_STOP_WORDS))) +\
                 " ".join(string.punctuation).split(" ") + \
                 ["-----", "---", "...", "..", "....", "", " ", "\n", "\n\n"]

In [59]:
stopwordremove = StopWordsRemover(inputCol="words", outputCol="tokenized_filt" ,stopWords=STOPLIST)

In [60]:
#applying customized stopwordlist to tokenized words
tokenized2 = stopwordremove.transform(tokenized)
reg2 = stopwordremove.transform(reg)

In [61]:
tokenized2.select("tokenized_filt").show()

+--------------------+
|      tokenized_filt|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, c...|
|[animal, crossing...|
|[got, game, day, ...|
|[i'm, going, pref...|
|[fun, better, sub...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[i've, grand, sla...|
|[**update:, revie...|
|[spoilers!, read,...|
|[you'll, probably...|
|[i've, xbox, 360,...|
|[duty, arrived, p...|
|[it's, amazing, s...|
|[imagine, wonderf...|
|[2005,, f.e.a.r.,...|
|[know, good, deal...|
+--------------------+
only showing top 20 rows



you see that custom stopword is not working for words like i'm,it's,i've.  
So I decided to run another stopwordremover.  
This time, I am applying spark stopword by not specifying any custom stopwordlist.  

In [62]:
remove = StopWordsRemover(inputCol="tokenized_filt", outputCol="tokenized")
tokenized3 = remove.transform(tokenized2)
reg3 = remove.transform(reg2)

In [64]:
#Comparison between normal tokenizer and regex tokenizer results.
tokenized3.select('tokenized').show()
reg3.select('tokenized').show()

+--------------------+
|           tokenized|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, r...|
|[animal, crossing...|
|[got, game, day, ...|
|[going, preface, ...|
|[fun, better, sub...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[grand, slam, ten...|
|[**update:, revie...|
|[spoilers!, read,...|
|[probably, know,,...|
|[xbox, 360, owner...|
|[duty, arrived, p...|
|[amazing, sony, t...|
|[imagine, wonderf...|
|[2005,, f.e.a.r.,...|
|[know, good, deal...|
+--------------------+
only showing top 20 rows

+--------------------+
|           tokenized|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, r...|
|[animal, crossing...|
|[got, game, day, ...|
|[going, preface, ...|
|[fun, better, sub...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[grand, slam, ten...|
|[update, reviewer...|
|[spoilers, read, ...|
|[probably, know, ...|
|[xbox, 360, owner...|
|[duty, 

In [65]:
reg3=reg3.drop('words','tokenized_filt')

### Tokenizer result
I would say that Regextokenizer is doing it's job the way I wanted.  
Although the process of making tokenizer is super fast and easy, I wish lemmatizer and tokenizer were smarter like the one in SpaCy.  
But again, I can not complain about the speed of the process :-)

## TFIDF MATRIX


In [66]:
from pyspark.ml.feature import HashingTF, IDF

In [71]:
hashingTF = HashingTF(inputCol="tokenized", outputCol="TTT", numFeatures=10000)
featurizedData = hashingTF.transform(reg3)

In [72]:
featurizedData.select('TTT').take(1)

[Row(TTT=SparseVector(10000, {1: 1.0, 253: 1.0, 404: 4.0, 677: 1.0, 721: 1.0, 1020: 1.0, 1274: 1.0, 1561: 1.0, 1742: 1.0, 1821: 1.0, 1964: 2.0, 2133: 2.0, 2157: 1.0, 2188: 1.0, 2587: 1.0, 2722: 1.0, 2766: 1.0, 2801: 1.0, 3051: 1.0, 3516: 1.0, 3525: 1.0, 3765: 1.0, 3913: 1.0, 4040: 1.0, 4099: 1.0, 4127: 1.0, 4227: 2.0, 4260: 2.0, 4467: 1.0, 4489: 1.0, 5015: 1.0, 5086: 2.0, 5089: 1.0, 5141: 1.0, 5483: 1.0, 5666: 2.0, 6252: 1.0, 6364: 1.0, 6537: 1.0, 6581: 1.0, 6692: 3.0, 7040: 1.0, 7477: 1.0, 7527: 2.0, 7550: 1.0, 7610: 2.0, 7705: 1.0, 7712: 1.0, 7779: 1.0, 7788: 1.0, 8117: 1.0, 8151: 2.0, 8567: 1.0, 8831: 1.0, 9161: 1.0, 9451: 1.0, 9527: 1.0, 9567: 1.0, 9623: 1.0, 9845: 1.0, 9977: 2.0}))]

In [73]:
idf = IDF(minDocFreq=2, inputCol="TTT", outputCol="TFIDF_features")
idfModel = idf.fit(featurizedData)
TFIDF_model = idfModel.transform(featurizedData)

In [76]:
TFIDF_mat = TFIDF_model.select("TFIDF_features").collect()

[Row(TFIDF_features=SparseVector(10000, {1: 2.5053, 253: 2.3763, 404: 13.6346, 677: 3.2328, 721: 1.6244, 1020: 4.3642, 1274: 4.8209, 1561: 2.9572, 1742: 2.2028, 1821: 1.6831, 1964: 3.2359, 2133: 4.1534, 2157: 3.9587, 2188: 4.3154, 2587: 4.3314, 2722: 3.9476, 2766: 4.21, 2801: 2.0733, 3051: 1.9262, 3516: 4.8209, 3525: 1.099, 3765: 2.8273, 3913: 3.8141, 4040: 5.8194, 4099: 2.0385, 4127: 3.9699, 4227: 7.0195, 4260: 2.1182, 4467: 3.5607, 4489: 1.2876, 5015: 6.0606, 5086: 8.1781, 5089: 2.05, 5141: 2.5724, 5483: 4.0159, 5666: 4.0901, 6252: 3.1404, 6364: 2.0533, 6537: 1.4738, 6581: 3.74, 6692: 12.63, 7040: 2.4949, 7477: 4.5873, 7527: 6.0848, 7550: 2.7091, 7610: 9.5392, 7705: 6.2613, 7712: 1.1513, 7779: 1.3634, 7788: 3.3465, 8117: 3.3526, 8151: 4.6115, 8567: 2.5316, 8831: 3.0469, 9161: 3.4681, 9451: 3.2824, 9527: 1.5568, 9567: 3.5385, 9623: 5.9736, 9845: 3.7672, 9977: 8.7283})),
 Row(TFIDF_features=SparseVector(10000, {76: 2.8309, 103: 3.6963, 114: 4.9928, 221: 2.31, 241: 3.7764, 362: 2.6296, 

In [34]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = sqlContext.createDataFrame([
    (0, "Hi I heard about Spark"),
    (0, "I wish Java could use case classes"),
    (1, "Logistic regression models are neat")
], ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
for features_label in rescaledData.select("features", "label").take(3):
    print(features_label)

Row(features=SparseVector(20, {0: 0.6931, 5: 0.6931, 9: 0.2877, 17: 1.3863}), label=0)
Row(features=SparseVector(20, {2: 0.6931, 7: 0.6931, 9: 0.863, 13: 0.2877, 15: 0.2877}), label=0)
Row(features=SparseVector(20, {4: 0.6931, 6: 0.6931, 13: 0.2877, 15: 0.2877, 18: 0.6931}), label=1)


In [91]:
featurizedData.select("rawFeatures").collect()

[Row(rawFeatures=SparseVector(20, {0: 1.0, 5: 1.0, 9: 1.0, 17: 2.0})),
 Row(rawFeatures=SparseVector(20, {2: 1.0, 7: 1.0, 9: 3.0, 13: 1.0, 15: 1.0})),
 Row(rawFeatures=SparseVector(20, {4: 1.0, 6: 1.0, 13: 1.0, 15: 1.0, 18: 1.0}))]