## Upload Review Data to Azure Blob Storage using AzCopy

Create a batch file and execute:
    
```
rem Upload the raw review file to the blob container with AzCopy.
rem The storage account key is read from the AZURE_STORAGE_KEY environment
rem variable so that it is never stored in the notebook. Any key that was
rem previously committed here should be treated as exposed and rotated.
cd "C:\Program Files (x86)\Microsoft SDKs\Azure\AzCopy"
AzCopy /Source:C:\_ilia_share\amazon_prod_reviews_clean\raw /Dest:https://ikcentralusstore.blob.core.windows.net/amazonrev /DestKey:%AZURE_STORAGE_KEY% /Pattern:"aggressive_dedup.json"
pause
```

## Load Review Data (from Blob)

In [1]:
# Storage locations
# Blob container (WASB URI) that holds the uploaded review data
blob = "wasb://amazonrev@ikcentralusstore.blob.core.windows.net"
# Full path of the review JSON file inside that container
json_dta = "{0}/{1}".format(blob, "aggressive_dedup.json")

Creating SparkContext as 'sc'


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
38,application_1469453428769_0016,pyspark,idle,Link,Link,✔


Creating HiveContext as 'sqlContext'
SparkContext and HiveContext created. Executing user code ...


In [2]:
# Read the review JSON from blob storage into a Spark DataFrame
review_reader = sqlContext.read
jsonFile = review_reader.json(json_dta)
# Register it as the temp table "reviews" so SQL queries can refer to it
jsonFile.registerTempTable("reviews")

# Quick sanity check: the object type, then the first five rows
print(type(jsonFile))  # <class 'pyspark.sql.dataframe.DataFrame'>
jsonFile.show(5)

# TODO: also load the IMDB data at some point
# ...

<class 'pyspark.sql.dataframe.DataFrame'>
+----------+-------+-------+--------------------+-----------+--------------------+---------------+--------------------+--------------+
|      asin|helpful|overall|          reviewText| reviewTime|          reviewerID|   reviewerName|             summary|unixReviewTime|
+----------+-------+-------+--------------------+-----------+--------------------+---------------+--------------------+--------------+
|B003UYU16G| [0, 0]|    5.0|It is and does ex...|11 21, 2012|A00000262KYZUE4J5...| Steven N Elich|Does what it's su...|    1353456000|
|B005FYPK9C| [0, 0]|    5.0|I was sketchy at ...| 01 8, 2013|A000008615DZQRRI9...|      mj waldon|           great buy|    1357603200|
|B000VEBG9Y| [0, 0]|    3.0|Very mobile produ...|03 24, 2014|A00000922W28P2OCH...|Gabriel Merrill|Great product but...|    1395619200|
|B001EJMS6K| [0, 0]|    4.0|Easy to use a mob...|03 24, 2014|A00000922W28P2OCH...|Gabriel Merrill|Great inexpensive...|    1395619200|
|B003XJCNVO| 

## Examine some of the reviews

In [3]:
%%sql
SELECT
    overall,
    reviewText
FROM reviews
LIMIT 10

In [4]:
%%sql
SELECT overall, COUNT(overall) AS freq
FROM reviews
GROUP BY overall
ORDER BY freq DESC

In [5]:
# Build a (label, sentences) DataFrame from the reviews table so the class
# balance can be inspected: ratings below 3 become 'low', above 3 become
# 'high', and everything else becomes 'mid'.
label_query = (
    "SELECT "
    "CASE WHEN overall < 3 THEN 'low' "
    "WHEN overall > 3 THEN 'high' ELSE 'mid' END as label, "
    "reviewText as sentences "
    "FROM reviews"
)
reviews = sqlContext.sql(label_query)

# Number of reviews per label
tally = reviews.groupBy("label").count()
tally.show()

+-----+--------+
|label|   count|
+-----+--------+
|  mid| 7039272|
|  low|10963811|
| high|64453794|
+-----+--------+

In [6]:
# Print the first 20 raw reviews, numbered from 1, to judge how clean the
# text is (there seems to be a lot of HTML formatting)
for position, review in enumerate(reviews.take(20), 1):
    print("%d. %s" % (position, review['sentences']))

1. It is and does exactly what the description said it would be and would do. Couldn't be happier with it.
2. I was sketchy at first about these but once you wear them for a couple hours they break in they fit good on my board an have little wear from skating in them. They are a little heavy but won't get eaten up as bad by your grip tape like poser dc shoes.
3. Very mobile product. Efficient. Easy to use; however product needs a varmint guard. Critters are able to gorge themselves without a guard.
4. Easy to use a mobile. If you're taller than 4ft, be ready to tuck your legs behind you as you hang and pull.
5. Love this feeder. Heavy duty & capacity. Best feature is the large varmint guard. Definitely use a small lock or securing device on the battery housing latch. I gave 4 stars because several bolts were missing. Check contents b4 beginning.
6. Solid, stable mount. Holds iPhone with phone protector well. I have not however used the dash mount part of this product (only windshield).

In [7]:
# Some very basic cleaning
from pyspark.sql.functions import UserDefinedFunction, col
from pyspark.sql.types import StringType, BooleanType 
from bs4 import BeautifulSoup

def cleanerHTML(line):
    """Strip HTML markup from a review and normalise it for tokenisation.

    Steps: parse with BeautifulSoup (lxml parser) and keep only the text,
    lower-case it, surround each punctuation character with spaces so that
    it becomes its own whitespace-separated token, then collapse every run
    of whitespace (including line breaks) into a single space.

    line: raw review text; may be None when the source row has no text.
    Returns the cleaned string, or "" when `line` is None.
    """
    # A missing review reaches the UDF as None; return an empty string
    # instead of letting the parser raise and fail the whole job.
    if line is None:
        return ""
    # html formatting
    html_clean = BeautifulSoup(line, "lxml").get_text().lower()
    # pad punctuation
    punctuation = "\"'.,?!:;(){}[]/"
    for char in punctuation:
        html_clean = html_clean.replace(char, ' ' + char + ' ')
    # remove any double spaces, line-breaks, etc.
    return " ".join(html_clean.split())

# Wrap the cleaning function as a Spark UDF that returns a string column
cleaner = UserDefinedFunction(cleanerHTML, StringType())
# Keep the label untouched and replace each review with its cleaned text
cleaned_text = cleaner(reviews['sentences']).alias('sentences')
cleanedReviews = reviews.select(reviews['label'], cleaned_text)

def longEnough(line, min_chars=10):
    """Return True when `line` has more than `min_chars` characters.

    line: cleaned review text, or None for a missing review.
    min_chars: length a review must exceed to be kept (default 10).

    None is treated as too short, so a missing review is filtered out
    rather than raising inside the UDF.
    """
    return line is not None and len(line) > min_chars

# Boolean UDF used as a row filter: keep only reviews that are long enough
minlength = UserDefinedFunction(longEnough, BooleanType())
cleanedReviews = cleanedReviews.filter(minlength(cleanedReviews['sentences']))

# Row counts noted by the author (around 13,000 bad reviews are lost):
#   82,456,877
#   82,443,303

In [8]:
# Same 20-review listing as before, now on the cleaned text (a bit cleaner ...)
for position, review in enumerate(cleanedReviews.take(20), 1):
    print("%d. %s" % (position, review['sentences']))

1. it is and does exactly what the description said it would be and would do . couldn ' t be happier with it .
2. i was sketchy at first about these but once you wear them for a couple hours they break in they fit good on my board an have little wear from skating in them . they are a little heavy but won ' t get eaten up as bad by your grip tape like poser dc shoes .
3. very mobile product . efficient . easy to use ; however product needs a varmint guard . critters are able to gorge themselves without a guard .
4. easy to use a mobile . if you ' re taller than 4ft , be ready to tuck your legs behind you as you hang and pull .
5. love this feeder . heavy duty & capacity . best feature is the large varmint guard . definitely use a small lock or securing device on the battery housing latch . i gave 4 stars because several bolts were missing . check contents b4 beginning .
6. solid , stable mount . holds iphone with phone protector well . i have not however used the dash mount part of this

In [9]:
# Preview the first rows of the cleaned, labelled reviews
cleanedReviews.show()

+-----+--------------------+
|label|           sentences|
+-----+--------------------+
| high|it is and does ex...|
| high|i was sketchy at ...|
|  mid|very mobile produ...|
| high|easy to use a mob...|
| high|love this feeder ...|
| high|solid , stable mo...|
| high|i bought this pep...|
| high|beautiful photos ...|
|  low|my idea of colora...|
|  low|no matter what we...|
|  low|i do not suggest ...|
|  low|useless - all you...|
| high|this book is real...|
| high|it is not a stick...|
| high|love the size and...|
| high|its very colorful...|
| high|the condition of ...|
| high|only negative . t...|
| high|bought this for m...|
| high|this book is a gr...|
+-----+--------------------+
only showing top 20 rows

In [10]:
# Balance the two classes: keep every 'low' review and cap the 'high'
# reviews at the same number of rows ('mid' reviews are not selected)
neg_rev = cleanedReviews.filter("label = 'low'")
n_negative = neg_rev.count()
pos_rev = cleanedReviews.filter("label = 'high'").limit(n_negative)

In [11]:
# Number of 'high' reviews kept after capping them with limit()
pos_rev.count()

10961702

In [12]:
# Stack the positive and negative reviews into a single DataFrame
allData = pos_rev.unionAll(neg_rev)
# Total number of rows across both classes
total_rows = allData.count()
print(total_rows)

21923404

In [13]:
#allDataLoc = blob + "/cleaned_equal_classes.json"
#allData.write.format("json").mode("overwrite").save(allDataLoc)

## Load Clean Data

In [14]:
#allDataLoc = blob + "/cleaned_equal_classes.json"
#allData = sqlContext.read.json(allDataLoc)

#data_count = allData.count()
#print(data_count)

In [15]:
#sub_sample = 1000000
#sub_sample_ratio = float(sub_sample)/float(data_count)

#print(sub_sample_ratio)
#print(type(allData))

In [16]:
# sub_sample -> sample(boolean withReplacement, double fraction, long seed)
#allData = allData.sample(False, sub_sample_ratio, 12345)

# Split into training and test sets (50% / 50%).
# A fixed seed keeps the random assignment repeatable across runs for the
# same input rows; without it every run produces a different split.
trainingData, testData = allData.randomSplit([0.5, 0.5], seed=12345)

In [17]:
# Preview the first rows of the training split
trainingData.show()

+-----+--------------------+
|label|           sentences|
+-----+--------------------+
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! v...|
| high|! ! ! ! ! ! this ...|
| high|! ! ! ! ! ! very ...|
| high|! ! ! ! ! wow . ....|
| high|! ! ! ! ## @@ ? ?...|
| high|! ! ! ! aaliyah 4...|
| high|! ! ! ! this game...|
| high|! ! ! ! this is t...|
| high|! ! ! ! to god al...|
| high|! ! ! ' s most re...|
| high|! ! ! a question ...|
| high|! ! ! fan-freakin...|
| high|! ! ! if you are ...|
| high|! ! ! masterpiece...|
| high|! ! ! spelt healt...|
| high|! ! ! spoiler ale...|
| high|! ! ! the best ac...|
| high|! ! ! this is an ...|
+-----+--------------------+
only showing top 20 rows

In [18]:
# Preview the first rows of the test split
testData.show()

+-----+--------------------+
|label|           sentences|
+-----+--------------------+
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! ! !...|
| high|! ! ! ! ! ! ! *no...|
| high|! ! ! ! ! ! ! lov...|
| high|! ! ! ! ! ! gets ...|
| high|! ! ! ! ! ! this ...|
| high|! ! ! ! ! ! very ...|
| high|! ! ! ! ! fabulou...|
| high|! ! ! ! ! wwwooow...|
| high|! ! ! ! great fun...|
| high|! ! ! ! great pro...|
| high|! ! ! ! like it b...|
| high|! ! ! ! spoilers ...|
| high|! ! ! ! this came...|
| high|! ! ! ! this is t...|
| high|! ! ! a question ...|
+-----+--------------------+
only showing top 20 rows

## 1. TFIDF

In [19]:
""" Pipeline for feature selection and classification
Using:

https://spark.apache.org/docs/1.5.2/ml-features.html
https://spark.apache.org/docs/1.6.1/api/python/pyspark.sql.html
http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionModel
http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html#sec:querydocweighting

Attempting to replicate: 

class sklearn.feature_extraction.text.TfidfVectorizer(input='content', encoding='utf-8',
decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, 
tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\b\w\w+\b', 
ngram_range=(1, 3), max_df=1.0, min_df=1, max_features=40000, vocabulary=None, 
binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True, 
smooth_idf=True, sublinear_tf=True)

I think only sublinear_tf and ngram_range need to be modified

(https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/feature_extraction/text.py)
if self.sublinear_tf:
    np.log(X.data, X.data)
    X.data += 1
            
"""

from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram, StringIndexer
from pyspark.sql.functions import col, udf
from itertools import chain
from pyspark.sql.types import ArrayType, StringType
from pyspark.mllib.linalg import Vectors, VectorUDT
import numpy as np

# Size of the hashed feature space passed to HashingTF below; same value as
# max_features=40000 in the sklearn TfidfVectorizer settings quoted above
numfeat = 40000

########################
# 1. Feature-extraction
########################

def concat(type):
    """Build a UDF that merges several array columns into one array column.

    Used to reproduce sklearn's ngram_range=(1,3) by joining the word,
    2-gram and 3-gram columns of a row into a single list.

    type: Spark SQL data type of the array elements.
    Returns a UDF whose return type is ArrayType(type).
    """
    def concat_(*args):
        # Append the elements of every input array, in argument order
        merged = []
        for column_values in args:
            merged.extend(column_values)
        return merged

    element_array = ArrayType(type)
    return udf(concat_, element_array)

# Array-of-strings flavour of the concat UDF, used to merge n-gram columns
concat_string_arrays = concat(StringType())


def _sublinear_tf(sv):
    """Replace each stored value tf of a sparse vector with log(tf) + 1."""
    scaled_values = np.log(sv.values) + 1
    index_to_value = dict(zip(sv.indices, scaled_values))
    return Vectors.sparse(sv.size, index_to_value)


# UDF that applies the sub-linear tf scaling to a sparse-vector column
vector_udf = udf(_sublinear_tf, VectorUDT())

# Stage definitions (column wiring only; they are fitted / applied below)
# label string -> numeric index
indexer = StringIndexer(inputCol="label", outputCol="sentiment_idx")
# review text -> list of words
tokenizer = Tokenizer(inputCol="sentences", outputCol="words")
# words -> 2-grams and 3-grams
biGram = NGram(n=2, inputCol="words", outputCol="2gram")
triGram = NGram(n=3, inputCol="words", outputCol="3gram")
# merged n-grams -> hashed term-frequency vector with numfeat buckets
hashingtf = HashingTF(
    numFeatures=numfeat, inputCol="ngrams", outputCol="rawFeatures")
# sub-linearly scaled tf -> tf-idf features
idf = IDF(inputCol="logRawFeatures", outputCol="features")

#######
# Train
#######
# Learn the label -> index mapping from the training split and apply it
indexerModel = indexer.fit(trainingData)
trainingDataIx = indexerModel.transform(trainingData)
tokenized_train = tokenizer.transform(trainingDataIx)

# Add the 2-gram and 3-gram columns, then merge words + 2-grams + 3-grams
biGram_train = biGram.transform(tokenized_train)
triGram_train = triGram.transform(biGram_train)
train_ngram_cols = [col(name) for name in ("words", "2gram", "3gram")]
ngrammed_train = triGram_train.withColumn(
    "ngrams", concat_string_arrays(*train_ngram_cols))

# Hash the n-grams into term-frequency vectors, then apply sub-linear scaling
hashed_train = hashingtf.transform(ngrammed_train)
sublintf_train = hashed_train.withColumn(
    'logRawFeatures', vector_udf(hashed_train['rawFeatures']))

# Fit IDF on the training vectors and produce the final feature column
idfModel = idf.fit(sublintf_train)
idf_train = idfModel.transform(sublintf_train)
idf_train.first()

Row(label=u'high', sentences=u'! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! ! the emotions were so strong , if the characters laughed , you laughed , if they smiled , you smiled , if they cried , you cried . . . some moments were just so heartbreaking . . .', sentiment_idx=1.0, words=[u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!', u'!

### tfidf variants:

![alt text](http://nlp.stanford.edu/IR-book/html/htmledition/img462.png "TFs")

In [None]:
"""
###########################
# Example: Apply sub-linear
###########################
from pyspark.mllib.linalg import Vectors, VectorUDT
import numpy as np

testy = hashed_train.first()['rawFeatures']

print(type(testy))
print(type(testy.values))
print(testy)

vector_udf = udf(lambda sv: Vectors.sparse(sv.size, dict(zip(sv.indices, np.log(sv.values) + 10))), VectorUDT())
sublintf_train = hashed_train.withColumn('sublintf', vector_udf(hashed_train.rawFeatures))

testy2 = sublintf_train.first()['sublintf']
print(testy2)
"""

"\n###########################\n# Example: Apply sub-linear\n###########################\nfrom pyspark.mllib.linalg import Vectors, VectorUDT\nimport numpy as np\n\ntesty = hashed_train.first()['rawFeatures']\n\nprint(type(testy))\nprint(type(testy.values))\nprint(testy)\n\nvector_udf = udf(lambda sv: Vectors.sparse(sv.size, dict(zip(sv.indices, np.log(sv.values) + 10))), VectorUDT())\nsublintf_train = hashed_train.withColumn('sublintf', vector_udf(hashed_train.rawFeatures))\n\ntesty2 = sublintf_train.first()['sublintf']\nprint(testy2)\n"

In [None]:
######
# Test
######
# Apply the already-fitted label indexer, then tokenise the test split
testDataIx = indexerModel.transform(testData)
tokenized_test = tokenizer.transform(testDataIx)

# Add the 2-gram and 3-gram columns, then merge words + 2-grams + 3-grams
biGram_test = biGram.transform(tokenized_test)
triGram_test = triGram.transform(biGram_test)
test_ngram_cols = [col(name) for name in ("words", "2gram", "3gram")]
ngrammed_test = triGram_test.withColumn(
    "ngrams", concat_string_arrays(*test_ngram_cols))

# Hash, apply sub-linear tf scaling, then weight with the existing IDF model
hashed_test = hashingtf.transform(ngrammed_test)
sublintf_test = hashed_test.withColumn(
    'logRawFeatures', vector_udf(hashed_test['rawFeatures']))
idf_test = idfModel.transform(sublintf_test)

idf_test.first()

In [None]:
from pyspark.ml.classification import LogisticRegression, GBTClassifier

# 2A. Classifier (Logistic Regression)
# Fit on the training features, then score the test features
classi = LogisticRegression(featuresCol="features", labelCol="sentiment_idx")
tfidfModel = classi.fit(idf_train)
pred = tfidfModel.transform(idf_test)

# 3. Examine
# Accuracy = correct / (correct + incorrect), reported as a percentage
numSuccesses = pred.where("""(prediction = sentiment_idx)""").count()
numFailures = pred.where("""(prediction != sentiment_idx)""").count()
numInspections = numSuccesses + numFailures
acc = float(numSuccesses) / float(numInspections) * 100
print("%.2f success rate" % acc)

"""
# Sub-sample: 1 mill total
Standard: 76.77 success rate
With ngrams(1,3): 88.17 success rate
With ngrams + sublineartf: 88.32 success rate
# Full data: 89.05 success rate
"""

In [None]:
# 3. Evaluation
# Show the predicted and the actual label index side by side
pred.select('prediction', 'sentiment_idx').show()