In [47]:
""" Pipeline for feature selection and classification
Using:

https://spark.apache.org/docs/1.5.2/ml-features.html
https://spark.apache.org/docs/1.6.1/api/python/pyspark.sql.html
http://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.classification.LogisticRegressionModel
http://nlp.stanford.edu/IR-book/html/htmledition/document-and-query-weighting-schemes-1.html#sec:querydocweighting

Attempting to replicate: 

class sklearn.feature_extraction.text.TfidfVectorizer(input='content', encoding='utf-8',
decode_error='strict', strip_accents=None, lowercase=True, preprocessor=None, 
tokenizer=None, analyzer='word', stop_words=None, token_pattern='(?u)\b\w\w+\b', 
ngram_range=(1, 3), max_df=1.0, min_df=1, max_features=40000, vocabulary=None, 
binary=False, dtype=<class 'numpy.int64'>, norm='l2', use_idf=True, 
smooth_idf=True, sublinear_tf=True)

I think only sublinear_tf and ngram_range need to be modified

"""
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram, StringIndexer
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.sql.functions import col, udf
from itertools import chain
from pyspark.sql.types import ArrayType, StringType
import numpy as np

# Size of the hashed feature space handed to HashingTF (numFeatures);
# chosen to match max_features=40000 in the sklearn TfidfVectorizer above.
numfeat = 40000

# 1. Feature-extraction
def concat(type):
    """Build a Spark UDF that flattens several array columns into one array.

    Used to merge the unigram, bigram and trigram columns so the combined
    column reproduces sklearn's ngram_range=(1, 3).

    `type` is the element type of the arrays (e.g. StringType()); the
    returned UDF is declared to produce ArrayType(type).
    """
    def concat_(*args):
        # Each positional argument is one column's array for the current row;
        # append their elements in argument order.
        merged = []
        for column_values in args:
            merged.extend(column_values)
        return merged
    return udf(concat_, ArrayType(type))
# UDF specialised for arrays of strings (token / n-gram columns).
concat_string_arrays = concat(StringType())

# Map the `sentiment` label column to numeric indices.
indexer = StringIndexer(
    inputCol="sentiment",
    outputCol="sentiment_idx")
# Split each entry of `sentences` into word tokens.
tokenizer = Tokenizer(
    inputCol="sentences",
    outputCol="words")
# Bigrams and trigrams are both derived from the unigram `words` column.
biGram = NGram(
    n=2,
    inputCol="words",
    outputCol="2gram")
triGram = NGram(
    n=3,
    inputCol="words",
    outputCol="3gram")
# Hash the combined `ngrams` column into a fixed-size term-frequency vector.
hashingtf = HashingTF(
    inputCol="ngrams",
    outputCol="rawFeatures",
    numFeatures=numfeat)
# Re-weight the raw term frequencies by inverse document frequency.
idf = IDF(
    inputCol="rawFeatures",
    outputCol="features")

"""
# Apply sub-linear tf scaling!!!
Replace tf with 1 + log(tf) like so:

        if self.sublinear_tf:
            np.log(X.data, X.data)
            X.data += 1

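# Something like (pseudo-code only: np.log cannot be applied to a Spark vector column directly, so this would need a UDF over the vector's values):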
sub_lin_tf = hashed_train.withColumn('lograwFeatures', np.log(hashed_train.rawFeatures)+1)
"""

#######
# Train
#######
# Fit the label indexer on the training data, then add the indexed label.
indexerModel = indexer.fit(trainingData)
trainingDataIx = indexerModel.transform(trainingData)

# Tokenise, then add the 2-gram and 3-gram columns on top of the tokens.
tokenized_train = tokenizer.transform(trainingDataIx)
biGram_train = biGram.transform(tokenized_train)
triGram_train = triGram.transform(biGram_train)

# Merge unigrams, bigrams and trigrams into a single "ngrams" column.
ngrammed_train = triGram_train.withColumn(
    "ngrams",
    concat_string_arrays(col("words"), col("2gram"), col("3gram")))

# Hashed term frequencies for the combined n-grams.
hashed_train = hashingtf.transform(ngrammed_train)

# Learn the IDF weights from the training term frequencies and apply them.
idfModel = idf.fit(hashed_train)
idf_train = idfModel.transform(hashed_train)

In [60]:
# Apply sub-linear?

#idf_train.first()['rawFeatures']

#sub_lin_tf = hashed_train.withColumn('lograwFeatures', hashed_train.rawFeatures+1)

SparseVector(40000, {277: 1.0, 1056: 1.0, 1128: 1.0, 2165: 1.0, 3370: 1.0, 3371: 1.0, 3500: 1.0, 3707: 1.0, 4051: 1.0, 4486: 1.0, 4668: 1.0, 4846: 1.0, 5627: 1.0, 5646: 1.0, 6272: 1.0, 6944: 1.0, 7183: 1.0, 8088: 1.0, 9207: 1.0, 9541: 1.0, 9581: 1.0, 10564: 1.0, 11966: 1.0, 12365: 1.0, 12620: 1.0, 12709: 1.0, 13250: 1.0, 14081: 1.0, 15914: 1.0, 16727: 2.0, 16890: 1.0, 16897: 1.0, 16956: 1.0, 20127: 1.0, 20883: 1.0, 21925: 1.0, 22837: 1.0, 23051: 1.0, 23444: 1.0, 23895: 1.0, 23967: 1.0, 24280: 1.0, 24738: 1.0, 25310: 1.0, 25969: 1.0, 26033: 1.0, 26302: 1.0, 27086: 1.0, 29182: 1.0, 29509: 1.0, 29593: 1.0, 31576: 1.0, 32107: 1.0, 32586: 1.0, 34076: 1.0, 34380: 1.0, 34466: 1.0, 34619: 1.0, 35240: 1.0, 35615: 1.0, 36103: 2.0, 36715: 1.0, 37023: 1.0, 37075: 1.0, 37368: 1.0, 38027: 1.0, 38092: 1.0, 38621: 1.0, 39070: 1.0, 39352: 1.0})

In [39]:
######
# Test
######
# Reuse the label indexer fitted on the training data.
testDataIx = indexerModel.transform(testData)

# Same tokenise -> 2-gram -> 3-gram sequence as used for the training set.
tokenized_test = tokenizer.transform(testDataIx)
biGram_test = biGram.transform(tokenized_test)
triGram_test = triGram.transform(biGram_test)

# Merge unigrams, bigrams and trigrams into a single "ngrams" column.
ngrammed_test = triGram_test.withColumn(
    "ngrams",
    concat_string_arrays(col("words"), col("2gram"), col("3gram")))

# Hash to term frequencies, then apply the IDF model fitted on the training set.
hashed_test = hashingtf.transform(ngrammed_test)
idf_test = idfModel.transform(hashed_test)

In [41]:
# 2A. Classifier (Logistic Regression)
# Fit on the TF-IDF training features, then score the test set.
classi = LogisticRegression(
    featuresCol="features",
    labelCol="sentiment_idx")
tfidfModel = classi.fit(idf_train)
pred = tfidfModel.transform(idf_test)

# 3. Examine
# Accuracy (%) = rows where prediction matches the label, divided by
# matching + non-matching rows.
numSuccesses = pred.where("""(prediction = sentiment_idx)""").count()
numInspections = (
    numSuccesses
    + pred.where("""(prediction != sentiment_idx)""").count()
)
acc = 100 * (float(numSuccesses) / float(numInspections))
# Recorded runs: 76.77 with unigrams only, 88.17 with ngrams(1,3).
print("%.2f success rate" % acc)

"""
Standard: 76.77 success rate
With ngrams(1,3): 88.17 success rate
With ngrams + sublineartf: ?
"""

88.17 success rate
'\nStandard: 76.77 success rate\nWith ngrams: ?\nWith ngrams + sublineartf: ?\n'

In [None]:
# 2C. Classifier (GBTClassifier)
# NOTE: pyspark.ml's GBTClassifier has no `numClasses` parameter (it handles
# binary labels only), so passing numClasses=2 raised a TypeError before any
# training happened. The argument is dropped; all other settings are defaults.
classi = GBTClassifier(labelCol="sentiment_idx", featuresCol="features")
tfidfModel = classi.fit(idf_train)
pred = tfidfModel.transform(idf_test)

# 3. Examine
# Accuracy (%) = rows where prediction matches the label, divided by
# matching + non-matching rows.
numSuccesses = pred.where("""(prediction = sentiment_idx)""").count()
numInspections = numSuccesses + pred.where("""(prediction != sentiment_idx)""").count()
acc = (float(numSuccesses) / float(numInspections)) * 100
print("%.2f success rate" % acc) # ? success rate (not yet recorded)

In [61]:
# 3. Evaluation
# Show predicted vs. indexed true label side by side for the first rows of
# whichever classifier cell last assigned `pred`.
pred.select("prediction", "sentiment_idx").show()

+----------+-------------+
|prediction|sentiment_idx|
+----------+-------------+
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       0.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       0.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
|       1.0|          1.0|
+----------+-------------+
only showing top 20 rows