In [1]:
import pyspark as ps
from pyspark import SparkConf, SparkContext
from __future__ import unicode_literals
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import json
import gzip
import spacy
%matplotlib inline
np.random.seed(32113)
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from nltk.corpus import stopwords
from spacy.en import English
from sklearn.grid_search import GridSearchCV
import string
import data_prep_for_test_run as dp 
parser = English()
from pyspark.sql.types import *
from pyspark.sql.functions import monotonically_increasing_id
from pyspark.sql.functions import isnan, when, count, col
import pyspark.sql.functions as pys_fun
from pyspark.mllib.recommendation import ALS,Rating
import math



In [2]:
hc = ps.HiveContext(sc)
sql = ps.SQLContext(sc)

### Load Data

In [3]:
df = spark.read.csv('sparkprocess1.csv', header=True, inferSchema=True)

In [4]:
print((df.count(), len(df.columns)))

(4234, 111)


#### NOTES:
I notice something. I realize that there are 25 samples where reviewText is null(!?).  
I was not able to run TFIDF for this reason (or collect tokenized data)...  
This happens whenever I ran data cleaning and store my data as csv file.  
There seems to be some issue whenever I save or load my data.  
Will investigate with other data format and see this issue persist.  

#### Removing Null values

In [5]:
df.filter(df.reviewText.isNotNull()).count()

4209

In [6]:
df.filter(df.reviewText.isNull()).count()

25

In [7]:
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----------+-------+-------------------+--------------------+------------------+-----------+----+-----------+------------+-----------------+-------------+-------------+-------------+----------------+---------+--------+---------------+---+------------------+------------+--------+----+------------------+-------------------+------------------+-----------------------+----------+-----------+----------+-----------+------------+-----------+-----------+-----+---------------+-------------+-----------+----------+----------+--------+--------+-------------+------------+--------+-------------------+---+--------+--------------+------------+-----+------------------------+----+-------+----------+--------+--------+---------+-------+------------+--------+--------------------+------------------+--------------+----+--------------+-------+--------+----------+----------+---------+--------------------+-----------------+-------+------------+-----------+-----+--------+----------+--------------+------------+---

In [8]:
df = df.na.drop(subset=["reviewText"])

In [9]:
df.filter(df.reviewText.isNull()).count()

0

### Columns that only exist in the pyspark ver. of my data
For some reason, I have more category features that were not in my original model.  
I double checked the code but I was not able to find what caused this.  

In [10]:
df.schema.names

['reviewText',
 'overall',
 'helpful_total_votes',
 'num_of_helpful_votes',
 'helpful_percentage',
 'Text_length',
 'asin',
 'rank_values',
 'num_category',
 'Screen Protectors',
 'PlayStation 4',
 'PlayStation 3',
 'PlayStation 2',
 'Game Boy Advance',
 'Joysticks',
 'GameCube',
 'Commodore Amiga',
 '3DO',
 'Cases & Protectors',
 'More Systems',
 'Sony PSP',
 'LIVE',
 'Sega Master System',
 'Fitness Accessories',
 'Subscription Cards',
 'Points & Currency Cards',
 'Atari 2600',
 'Controllers',
 'Light Guns',
 'Nintendo 64',
 'Nintendo 3DS',
 'Linux Games',
 'Accessories',
 'Skins',
 'Steering Wheels',
 'Racing Wheels',
 'PlayStation',
 'Networking',
 'Atari 7800',
 'Xbox 360',
 'Chargers',
 'Kids & Family',
 'ColecoVision',
 'Gamepads',
 'Digital Games & DLC',
 'PC',
 'Consoles',
 'Game Boy Color',
 'Nintendo NES',
 'Drums',
 'MMO & Free-to-Play Games',
 'Xbox',
 'Sega CD',
 'Faceplates',
 'Game Boy',
 'Adapters',
 'Batteries',
 'Fire TV',
 'Sega Genesis',
 'Speakers',
 'Batteries & C

In [12]:
columns_from_orig_ver = [u'reviewerID', u'asin', u'reviewerName', u'reviewText', u'overall',
       u'summary', u'helpful_total_review', u'num_of_helpful_review',
       u'helpful_percent', u'text_length', u'price', u'rank_values',
       u'num category', u'Sony PSP', u'PlayStation', u'LIVE', u'PlayStation 3',
       u'PlayStation 2', u'Game Boy Advance', u'Xbox 360', u'Joysticks',
       u'GameCube', u'PC Game Downloads', u'Chargers', u'Kids & Family',
       u'Remotes', u'Memory', u'Gamepads', u'Networking',
       u'Digital Games & DLC', u'Nintendo DS', u'Cases & Protectors',
       u'More Systems', u'PlayStation Vita', u'Adapters', u'Digital Games',
       u'Mac', u'PC', u'Consoles', u'Game Boy Color', u'Dance Mats',
       u'Nintendo NES', u'Drums', u'Interactive Gaming Figures', u'Xbox One',
       u'Screen Protectors', u'Mac Game Downloads', u'Downloadable Content',
       u'Fitness Accessories', u'MMO & Free-to-Play Games',
       u'Subscription Cards', u'Points & Currency Cards', u'Flight Controls',
       u'Currency Cards', u'Xbox', u'Controllers', u'Cables & Adapters',
       u'Games', u'Batteries & Chargers', u'Game Boy', u'Light Guns',
       u'Nintendo 64', u'PlayStation 4', u'Super Nintendo', u'Guitars',
       u'Wii U', u'Nintendo 3DS', u'Steering Wheels', u'Headsets',
       u'Accessories', u'Wii', u'Skins', u'Hardware', u'Linux Games',
       u'Batteries', u'Gaming Mice', u'Sega Genesis', u'Accessory Kits',
       u'Fire TV', u'Cables', u'Gaming Keyboards', u'Casual Games',
       u'Racing Wheels', u'Software', u'Video Games']

In [14]:
#columns that only exist in the pyspark data after data cleaning.
[co for co in df.schema.names if co not in columns_from_orig_ver]

['helpful_total_votes',
 'num_of_helpful_votes',
 'helpful_percentage',
 'Text_length',
 'num_category',
 'Commodore Amiga',
 '3DO',
 'Sega Master System',
 'Atari 2600',
 'Atari 7800',
 'ColecoVision',
 'Sega CD',
 'Faceplates',
 'Speakers',
 'Sega Game Gear',
 'PDAs',
 'Sega Dreamcast',
 'Atari 5200',
 'Keyboards',
 'Commodore 64',
 'Microphones',
 'Atari Lynx',
 'Intellivision',
 'NEOGEO Pocket',
 'TurboGrafx 16',
 'Sensor Bars',
 'Atari Jaguar',
 'Sega Saturn',
 'id',
 'software',
 'Video_Games',
 'below20',
 'below50',
 'below100',
 'below300',
 'price_unknown']

## Tokenizer and Stopword in Spark
Here, I am applying Tokenizer and stopword to my review contents.  
I realize that Spark tokenizer does not lemmatize/tokenize word accurately unlike SpaCy that I used previously.  
So I played around with it and decided to do following process:
1. tokenize review text.  
2. use stopwordremover to remove custom stopword I created with NLTK, SKlearn Stopwords.  
3. run stopwordremover 1 more time. This time, run it with the spark stopword.  
  
I also realize that running tokenizer does not take good care of special symbols (!, ., ?, etc) so I repeated the same procedure with RegexTokenizer with Pattern set as \\W+  
this should get rid of all special character that is in review contents.  

In [15]:
from pyspark.ml.feature import Tokenizer,RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

In [16]:
#ran both tokenizer and regextokenizer
tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
tokenized = tokenizer.transform(df)

#the regextokenizer here takes out any non-word character from a token.
#for instance, Spoiler! --> spoiler
regtokenizer = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W+")
reg = regtokenizer.transform(df)

In [17]:
tokenized.select('words').show()

+--------------------+
|               words|
+--------------------+
|[this, does, not,...|
|[my, son, has, be...|
|[unfortunately, i...|
|[animal, crossing...|
|[got, this, game,...|
|[i'm, going, to, ...|
|[the, all, new, s...|
|[the, starter, bu...|
|[atgames, is, a, ...|
|[i've, had, grand...|
|[**update:, many,...|
|[no, spoilers!, r...|
|[as, you'll, prob...|
|[i've, been, an, ...|
|[when, call, of, ...|
|[it's, amazing, w...|
|[imagine, you, ha...|
|[in, 2005,, f.e.a...|
|[while, the, past...|
|[we, got, our, ha...|
+--------------------+
only showing top 20 rows



In [18]:
#my STOPLIST from the original project. mix NLTK and SKlearn english stopwords.
STOPLIST = list(set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + \
                list(ENGLISH_STOP_WORDS))) +\
                 " ".join(string.punctuation).split(" ") + \
                 ["-----", "---", "...", "..", "....", "", " ", "\n", "\n\n"]

In [19]:
stopwordremove = StopWordsRemover(inputCol="words", outputCol="tokenized_filt" ,stopWords=STOPLIST)

In [20]:
#applying customized stopwordlist to tokenized words
tokenized2 = stopwordremove.transform(tokenized)
reg2 = stopwordremove.transform(reg)

In [21]:
tokenized2.select("tokenized_filt").show()

+--------------------+
|      tokenized_filt|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, c...|
|[animal, crossing...|
|[got, game, day, ...|
|[i'm, going, pref...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[i've, grand, sla...|
|[**update:, revie...|
|[spoilers!, read,...|
|[you'll, probably...|
|[i've, xbox, 360,...|
|[duty, arrived, p...|
|[it's, amazing, s...|
|[imagine, wonderf...|
|[2005,, f.e.a.r.,...|
|[past, year,, we'...|
|[got, hands, xbox...|
+--------------------+
only showing top 20 rows



you see that custom stopword is not working for words like i'm,it's,i've.  
So I decided to run another stopwordremover.  
This time, I am applying spark stopword by not specifying any custom stopwordlist.  

In [22]:
remove = StopWordsRemover(inputCol="tokenized_filt", outputCol="tokenized")
tokenized3 = remove.transform(tokenized2)
reg3 = remove.transform(reg2)

In [23]:
#Comparison between normal tokenizer and regex tokenizer results.
tokenized3.select('tokenized').show()
reg3.select('tokenized').show()

+--------------------+
|           tokenized|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, r...|
|[animal, crossing...|
|[got, game, day, ...|
|[going, preface, ...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[grand, slam, ten...|
|[**update:, revie...|
|[spoilers!, read,...|
|[probably, know,,...|
|[xbox, 360, owner...|
|[duty, arrived, p...|
|[amazing, sony, t...|
|[imagine, wonderf...|
|[2005,, f.e.a.r.,...|
|[past, year,, rea...|
|[got, hands, xbox...|
+--------------------+
only showing top 20 rows

+--------------------+
|           tokenized|
+--------------------+
|[come, usb, hook,...|
|[son, eagerly, an...|
|[unfortunately, r...|
|[animal, crossing...|
|[got, game, day, ...|
|[going, preface, ...|
|[new, slimmer, ps...|
|[starter, bundle,...|
|[atgames, chinese...|
|[grand, slam, ten...|
|[update, reviewer...|
|[spoilers, read, ...|
|[probably, know, ...|
|[xbox, 360, owner...|
|[duty, arrived, p...|
|[amazin

In [24]:
reg3=reg3.drop('words','tokenized_filt')

### Tokenizer result
I would say that Regextokenizer is doing it's job the way I wanted.  
Although the process of making tokenizer is super fast and easy, I wish lemmatizer and tokenizer were smarter like the one in SpaCy.  
But again, I can not complain about how simply it is to run tokenizer process :-)

## TFIDF MATRIX

In [25]:
from pyspark.ml.feature import HashingTF, IDF

In [26]:
#run TF first.
hashingTF = HashingTF(inputCol="tokenized", outputCol="TTT", numFeatures=10)
featurizedData = hashingTF.transform(reg3)

In [27]:
featurizedData.select('TTT').take(1)

[Row(TTT=SparseVector(10, {0: 8.0, 1: 13.0, 2: 7.0, 3: 6.0, 4: 8.0, 5: 5.0, 6: 6.0, 7: 17.0, 8: 2.0, 9: 4.0}))]

In [28]:
#now apply IDF to the TF feature created above.
idf = IDF(minDocFreq=2, inputCol="TTT", outputCol="TFIDF_features")
idfModel = idf.fit(featurizedData)
TFIDF_model = idfModel.transform(featurizedData)

TFIDF_model is the dataframe which contains TFIDF terms.  
However, unlike, TFIDF matrix from SKlearn, the spark TFIDF terms are stored in a sparse vector.  
So the TFIDF_features are consists of many sparse vectors (each sample has a vectors).   
I will use this list of sparse vectors to run NMF.  

## NMF
The hardest part about running NMF in Spark is that, It requires specific data format to run NMF.  
The NMF hidden layers can be calculated from this function called ALS from mllib.recommendation class.  
the input data format needs to be Rating matrix and so I first need to convert my sparse vectors to Rating matrix.  

In [29]:
# I am cheating here. Since creating an id column is such a pain in my butt, I'm making my cust_id in pandas dataframe.
# once I finish adding my custom id column in pandas, I convert that dataframe into spark/hc.dataframe.  
t2 = TFIDF_model.toPandas()

In [88]:
features = t2
features['new_id']=features.index
features['cust_id'] = features['new_id']
features = features.drop(['new_id','tokenized','TTT'],axis = 1)

In [89]:
features.head(5)

Unnamed: 0,reviewText,overall,helpful_total_votes,num_of_helpful_votes,helpful_percentage,Text_length,asin,rank_values,num_category,Screen Protectors,...,id,software,Video_Games,below20,below50,below100,below300,price_unknown,TFIDF_features,cust_id
0,This does not come with a USB hook up. I'm pe...,5.0,274,265,0.97,1053,B001FBHT34,131442,1,0.0,...,25769803779,0,1,1,0,0,0,0,"(0.290237392645, 0.740153459094, 0.34419198738...",0
1,My son has been eagerly anticipating this game...,2.0,133,127,0.95,2678,B000EXW52O,21032,1,0.0,...,154618822659,0,1,0,0,1,0,0,"(0.435356088968, 1.36643715525, 1.671789653, 0...",1
2,Unfortunately I can't review the product too m...,1.0,799,497,0.62,4713,B00BGA9WK2,13,4,0.0,...,274877907016,0,1,0,1,0,0,0,"(1.05211054834, 1.65111156259, 1.671789653, 0....",2
3,Animal Crossing was always a sleeper hit on th...,5.0,716,683,0.95,4694,B0009Z3MQK,1769,2,0.0,...,292057776131,0,1,0,1,0,0,0,"(1.56002598547, 1.19563251084, 1.37676794953, ...",3
4,Got this game the first day it came out and ha...,5.0,205,16,0.08,93,B000FKBCX4,368,2,0.0,...,326417514596,0,1,0,1,0,0,0,"(0.108839022242, 0.0, 0.0, 0.0, 0.0, 0.0311197...",4


In [90]:
id_df = hc.createDataFrame(features)

In [91]:
id_df.show()

+--------------------+-------+-------------------+--------------------+------------------+-----------+----------+-----------+------------+-----------------+-------------+-------------+-------------+----------------+---------+--------+---------------+---+------------------+------------+--------+----+------------------+-------------------+------------------+-----------------------+----------+-----------+----------+-----------+------------+-----------+-----------+-----+---------------+-------------+-----------+----------+----------+--------+--------+-------------+------------+--------+-------------------+---+--------+--------------+------------+-----+------------------------+----+-------+----------+--------+--------+---------+-------+------------+--------+--------------------+------------------+--------------+----+--------------+-------+--------+----------+----------+---------+--------------------+-----------------+-------+------------+-----------+-----+--------+----------+--------------+

### NMF problem:
Making a list of sparse vector into a rating matrix is not difficult.  
However, once you convert it, the rating matrix could contain many values that can not be stored in a laptop CPU.  
Since I am having problem with memory allocation, I only used 10 features for TFIDF and only take first 1000 samples from my model to run NMF.  
  
Although this will produce different results than the original version, I am going to use this subset of data and keep going (so that I can at least code my later process).  

In [93]:
TFIDF_rdd = id_df.select('TFIDF_features','cust_id').take(1000)
TFIDF_model2 = hc.createDataFrame(TFIDF_rdd)

In [94]:
rdd_id = TFIDF_model2.rdd.map(lambda x: [x.cust_id]*len(x.TFIDF_features.indices)).flatMap(lambda x: x)

In [95]:
rdd_indices = TFIDF_model2.rdd.map(lambda x: x.TFIDF_features.indices).flatMap(lambda x: x)

In [96]:
rdd_tfidf = TFIDF_model2.rdd.map(lambda x: x.TFIDF_features.values).flatMap(lambda x: x)

In [97]:
rd_zip1 = rdd_id.zip(rdd_indices)
rdd_zipped = rd_zip1.zip(rdd_tfidf).map(lambda x: (str(x[0][0]),int(x[0][1]),float(x[1])))

In [98]:
rdd_zipped.take(2)

[('0', 0, 0.29023739264518944), ('0', 1, 0.740153459093575)]

In [99]:
df_for_NMF = hc.createDataFrame(rdd_zipped)
df_for_NMF = df_for_NMF.select(col('_1').alias("USER_ID").cast(IntegerType()), col('_2').alias("tfidf_index").cast(IntegerType()), col('_3').alias("tfidf_term"))
df_for_NMF = df_for_NMF.withColumn("tfidf_ROUND", pys_fun.format_number(df_for_NMF.tfidf_term, 3).cast(FloatType()))
df_for_NMF = df_for_NMF.drop('tfidf_term')

In [100]:
df_for_NMF.printSchema()

root
 |-- USER_ID: integer (nullable = true)
 |-- tfidf_index: integer (nullable = true)
 |-- tfidf_ROUND: float (nullable = true)



MY dataframe is now ready to run ALS.  

In [42]:
print(rdd_id.count())
print(rdd_indices.count())
print(rdd_tfidf.count())

9704
9704
9704


So let me summarize what is happening with my data:
1. has the first 1000 data samples from the TFIDF_model.  
2. in df_for_NMF, it has 3 features, USER_ID,tfidf_index,tfidf_ROUND.    
    - USER_ID = data sample id (integer)  
    - tfidf_index = names of each tfidf terms (integer)  
    - tifdf_ROUND = tfidf values of the tfidf_index for that USER_ID. rounded to 2 decimal digits.  (float)

In [104]:
ratings = df_for_NMF.rdd.map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
rank = 15
numIterations = 15
#als = ALS(rank=8, maxIter=5, userCol="USER_ID", itemCol="tfidf_index", ratingCol="tfidf_ROUND", nonnegative=True)

In [105]:
#my model runs 10 iterations of ALS calculation and has 8 hidden features.  
model = ALS.train(ratings,rank,numIterations,nonnegative=True)

In [106]:
NMF_results = model.userFeatures()

In [107]:
NMF_results.take(5)

[(0,
  array('d', [0.16416122019290924, 0.0, 0.0, 0.0, 0.0, 0.0055698673240840435, 0.0, 0.4366821050643921, 0.47314947843551636, 0.0, 0.0, 0.0, 0.0, 0.2683221399784088, 0.0])),
 (4,
  array('d', [0.0, 0.0, 0.0, 0.0, 0.06606338918209076, 0.013389339670538902, 0.011553904972970486, 0.004726927727460861, 0.0691143274307251, 0.00015500385779887438, 0.0, 0.0, 0.0, 0.04424767568707466, 0.0])),
 (8,
  array('d', [0.7441716194152832, 0.0, 0.0, 0.0, 0.6991147398948669, 0.9363747835159302, 0.8712197542190552, 0.30249133706092834, 0.9294428825378418, 1.175255537033081, 0.8778303861618042, 0.0, 0.0, 0.6912622451782227, 0.0])),
 (12,
  array('d', [0.11978136003017426, 0.0, 0.0, 0.0, 0.12147659808397293, 0.11968875676393509, 0.05484406650066376, 0.1616278737783432, 0.0, 0.2573058605194092, 0.0, 0.0, 0.0, 0.0, 0.0])),
 (16,
  array('d', [0.02931974083185196, 0.0, 0.0, 0.0, 0.09359288960695267, 0.21137385070323944, 0.0, 0.1696062833070755, 0.0, 0.07691962271928787, 0.11201909929513931, 0.0, 0.0, 0.017

YAY! I got something here.

In [114]:
NMF_res = NMF_results.values().collect()

In [115]:
np_NMF = np.array(NMF_res)
pd_NMF = pd.DataFrame(np_NMF)

In [123]:
Pd_NMF = pd_NMF.rename(columns ={0:"NMF1", 1:"NMF2", 2:"NMF3", 3:"NMF4", 4:"NMF5", 5:"NMF6", 6:"NMF7",\
                                 7:"NMF8", 8:"NMF9", 9:"NMF10", 10:"NMF11", 11:"NMF12", 12:"NMF13",\
                                 13:"NMF14", 14:"NMF15"})

In [124]:
Pd_NMF['cust_id'] = Pd_NMF.index
#I'm repeating this step so that spark recognize cust_id as a feature and not as an index feature.  
pd_NMF['cust_id2'] = Pd_NMF['cust_id']

In [125]:
pd_NMF.head(10)

Unnamed: 0,NMF1,NMF2,NMF3,NMF4,NMF5,NMF6,NMF7,NMF8,NMF9,NMF10,NMF11,NMF12,NMF13,NMF14,NMF15,cust_id2
0,0.164161,0.0,0.0,0.0,0.0,0.00557,0.0,0.436682,0.473149,0.0,0.0,0.0,0.0,0.268322,0.0,0
1,0.0,0.0,0.0,0.0,0.066063,0.013389,0.011554,0.004727,0.069114,0.000155,0.0,0.0,0.0,0.044248,0.0,1
2,0.744172,0.0,0.0,0.0,0.699115,0.936375,0.87122,0.302491,0.929443,1.175256,0.87783,0.0,0.0,0.691262,0.0,2
3,0.119781,0.0,0.0,0.0,0.121477,0.119689,0.054844,0.161628,0.0,0.257306,0.0,0.0,0.0,0.0,0.0,3
4,0.02932,0.0,0.0,0.0,0.093593,0.211374,0.0,0.169606,0.0,0.07692,0.112019,0.0,0.0,0.01738,0.0,4
5,0.384303,0.0,0.0,0.0,0.369571,0.0,0.65986,2.320368,0.584154,0.9812,1.414535,0.0,0.0,1.074128,0.0,5
6,0.071392,0.0,0.0,0.0,0.0831,0.36564,0.482205,0.184634,0.308133,0.046077,0.214599,0.0,0.0,0.132512,0.0,6
7,4.625066,0.0,0.0,0.0,2.495781,3.440292,2.642541,3.441575,1.732401,0.694688,0.558181,0.0,0.0,1.489183,0.0,7
8,0.160408,0.0,0.0,0.0,0.043664,0.156508,0.161556,0.199418,0.073464,0.038703,0.031808,0.0,0.0,0.198606,0.0,8
9,0.772353,0.0,0.0,0.0,0.961693,1.878223,1.430986,0.927186,1.845948,0.786515,0.923379,0.0,0.0,1.553925,0.0,9


In [126]:
#making a schema for sp_NMF
schema = StructType([
        StructField('NMF1', FloatType(), True),
        StructField('NMF2', FloatType(), True),
        StructField('NMF3', FloatType(), True),
        StructField('NMF4', FloatType(), True),
        StructField('NMF5', FloatType(), True),
        StructField('NMF6', FloatType(), True),
        StructField('NMF7', FloatType(), True),
        StructField('NMF8', FloatType(), True),
        StructField('NMF9', FloatType(), True),
        StructField('NMF10', FloatType(), True),
        StructField('NMF11', FloatType(), True),
        StructField('NMF12', FloatType(), True),
        StructField('NMF13', FloatType(), True),
        StructField('NMF14', FloatType(), True),
        StructField('NMF15', FloatType(), True),
        StructField('cust_id', IntegerType(), True)])

In [127]:
sp_NMF =hc.createDataFrame(pd_NMF,schema)

In [128]:
sp_NMF.show()

+-----------+----+----+----+-----------+------------+-----------+------------+-----------+------------+-----------+-----+-----+-----------+-----+-------+
|       NMF1|NMF2|NMF3|NMF4|       NMF5|        NMF6|       NMF7|        NMF8|       NMF9|       NMF10|      NMF11|NMF12|NMF13|      NMF14|NMF15|cust_id|
+-----------+----+----+----+-----------+------------+-----------+------------+-----------+------------+-----------+-----+-----+-----------+-----+-------+
| 0.16416122| 0.0| 0.0| 0.0|        0.0|0.0055698673|        0.0|   0.4366821| 0.47314948|         0.0|        0.0|  0.0|  0.0| 0.26832214|  0.0|      0|
|        0.0| 0.0| 0.0| 0.0| 0.06606339|  0.01338934|0.011553905|0.0047269277| 0.06911433|1.5500386E-4|        0.0|  0.0|  0.0|0.044247676|  0.0|      1|
|  0.7441716| 0.0| 0.0| 0.0| 0.69911474|   0.9363748| 0.87121975|  0.30249134|  0.9294429|   1.1752555|  0.8778304|  0.0|  0.0| 0.69126225|  0.0|      2|
| 0.11978136| 0.0| 0.0| 0.0|  0.1214766|  0.11968876|0.054844067|  0.1616278

In [130]:
sp_NMF.registerTempTable('NMF')
id_df.registerTempTable('TFIDF')

In [159]:
query = '''SELECT TFIDF.*,NMF.NMF1,NMF.NMF2,NMF.NMF3,NMF.NMF4,NMF.NMF5,NMF.NMF6,NMF.NMF7,NMF.NMF8,
            NMF.NMF9,NMF.NMF10,NMF.NMF11,NMF.NMF12,NMF.NMF13,NMF.NMF14,NMF.NMF15
            FROM TFIDF JOIN NMF ON TFIDF.cust_id = NMF.cust_id ORDER BY TFIDF.cust_id'''
df_for_RF = hc.sql(query)

In [160]:
df_for_RF.show()

+--------------------+-------+-------------------+--------------------+------------------+-----------+----------+-----------+------------+-----------------+-------------+-------------+-------------+----------------+---------+--------+---------------+---+------------------+------------+--------+----+------------------+-------------------+------------------+-----------------------+----------+-----------+----------+-----------+------------+-----------+-----------+-----+---------------+-------------+-----------+----------+----------+--------+--------+-------------+------------+--------+-------------------+---+--------+--------------+------------+-----+------------------------+----+-------+----------+--------+--------+---------+-------+------------+--------+--------------------+------------------+--------------+----+--------------+-------+--------+----------+----------+---------+--------------------+-----------------+-------+------------+-----------+-----+--------+----------+--------------+

In [136]:
print "total number of datapoints: {}".format(df_for_RF.count())
print "total number of features: {}".format(len(df_for_RF.columns))

total number of datapoints: 1000
total number of features: 128


In [157]:
#renaming columns. no space!!

In [163]:
# removing Video Games feature. It's actually a duplicate column of Video_Games column
df_for_RF = df_for_RF.drop('Video Games')
df_for_RF = df_for_RF.toDF(*(c.replace(' ', '_') for c in df_for_RF.columns))

## SAVING File
Saving my file as a parquet file

In [162]:
df_for_RF.write.save("RF_ready.parquet")

**Checking if I'm not losing data**

In [2]:
test= spark.read.load("RF_ready.parquet")

In [3]:
print "total number of datapoints: {}".format(test.count())
print "total number of features: {}".format(len(test.columns))

total number of datapoints: 1000
total number of features: 127


Size of the df is correct. Now I just need to check if there's any nulls.

In [12]:
test=test.drop('TFIDF_features')

In [13]:
test.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in test.columns]).show()

+----------+-------+-------------------+--------------------+------------------+-----------+----+-----------+------------+-----------------+-------------+-------------+-------------+----------------+---------+--------+---------------+---+------------------+------------+--------+----+------------------+-------------------+------------------+-----------------------+----------+-----------+----------+-----------+------------+-----------+-----------+-----+---------------+-------------+-----------+----------+----------+--------+--------+-------------+------------+--------+-------------------+---+--------+--------------+------------+-----+------------------------+----+-------+----------+--------+--------+---------+-------+------------+--------+--------------------+------------------+--------------+----+--------------+-------+--------+----------+----------+---------+--------------------+-----------------+-------+------------+-----------+-----+--------+----------+--------------+------------+---

Note: 
- TFIDF_features are sparse vector feature that cause error when I run the previous line.  
- There is no null. I should use parquet file to export and import files.

----------------