In [41]:
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('big-data').master("local[8]").config("spark.driver.memory", "8g").config('spark.sql.execution.arrow.enabled', 'true').getOrCreate()

First read the generated fake reviews and try to get a sense of how long the average review is.

In [103]:
tokenized_fake_reviews = spark.read.text('fake-reviews/generated.txt')

In [106]:
from pyspark.sql.functions import col, length, mean

tokenized_fake_reviews.agg(mean(length(tokenized_fake_reviews.value))).show()

+------------------+
|avg(length(value))|
+------------------+
|100.48283333333333|
+------------------+



Now read some real reviews

In [118]:
real_reviews = spark.read.json('yelp-dataset/yelp_academic_dataset_review.json').sample(False, 0.05, seed=42)

We want to limit reviews to around 100 characters. This is obviously not 100% exact because the generated reviews are tokenized and the yelp review data set is not.

In [119]:
from pyspark.sql.functions import col, length

real_reviews = real_reviews.where(length(real_reviews.text) <= 100).limit(6000)

Remove new-lines

In [120]:
from pyspark.sql.functions import regexp_replace
real_reviews = real_reviews.withColumn('text', regexp_replace(real_reviews.text, "[\\r\\n]", " "))

In [121]:
real_reviews = real_reviews.withColumn('text', regexp_replace(real_reviews.text, "[^\x00-\x7F]+", " ")) 

In [122]:
real_reviews.count()

6000

In [123]:
real_reviews.show()

+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|         business_id|cool|               date|funny|           review_id|stars|                text|useful|             user_id|
+--------------------+----+-------------------+-----+--------------------+-----+--------------------+------+--------------------+
|pDoDY-cDLyeKQgDx5...|   0|2012-04-28 09:28:54|    0|iS0ANobtW1ZfBdu36...|  5.0|Great food. Try t...|     0|0MKwtAFrhNslL-h5E...|
|ybbcg01-j7tKJ_oLE...|   0|2015-04-01 15:20:43|    0|Sc0nxPCLfPt7KHdN3...|  2.0|Very average Toro...|     0|GqGlMZegcGp5GKM6N...|
|YzMUZjUMcgI-NSGu4...|   0|2014-07-13 15:06:28|    0|TJjrhzZxMfmMjlGrw...|  4.0|Food was amazon, ...|     0|xvACCLMLVs1p4Q4va...|
|EAs61Wm1O6tLjCs8t...|   0|2015-03-18 21:06:16|    0|LbRonpdNBwxWT4Pou...|  4.0|I recommend there...|     0|pcr9Gj69fZtU5hX6O...|
|nrahyQyopCtajDqUt...|   0|2018-04-04 02:16:24|    0|5gKt71TDpn0LyQHlf...|  5.0|I have eat

Save for tokenization step

In [124]:
real_reviews.select(real_reviews.text).write.format('text').save('fake-reviews/real_reviews.txt')

As noted in the fake_review generation notebook, the regular tokenizers are not good enough, and we have used Stanford's CoreNLP package for tokenization when training our fake review generator model.

In [125]:
!cat fake-reviews/real_reviews.txt/part-*.txt | CLASSPATH=~/dev/stanford-parser-full-2018-10-17/stanford-parser.jar java edu.stanford.nlp.process.PTBTokenizer -lowerCase -preserveLines > fake-reviews/tokenized_real_reviews.txt

PTBTokenizer tokenized 111869 tokens at 691655.64 tokens per second.


In [126]:
tokenized_real_reviews = spark.read.text('fake-reviews/tokenized_real_reviews.txt')

In [127]:
tokenized_real_reviews.count()

6000

In [128]:
from pyspark.sql.functions import lit
tokenized_real_reviews = tokenized_real_reviews.withColumn('label', lit(0))

In [129]:
tokenized_fake_reviews = tokenized_fake_reviews.withColumn('label', lit(1))

In [130]:
reviews = tokenized_fake_reviews.union(tokenized_real_reviews)

In [131]:
train, val = reviews.randomSplit([0.9, 0.1], seed=42)

In [132]:
train.sample(False, 0.15, seed=42).orderBy(rand()).show()
val.sample(False, 0.15, seed=42).orderBy(rand()).show()

+--------------------+-----+
|               value|label|
+--------------------+-----+
|this place is a j...|    1|
|this place is n't...|    0|
|this company is t...|    1|
|excellent sushi !...|    0|
|i love this place...|    1|
|this place is ok ...|    1|
|tuesdays half of ...|    0|
|i ordered a mediu...|    0|
|pizza my dear has...|    0|
|they fixed my iph...|    1|
|do n't stay here ...|    1|
|they are the best...|    0|
|very efficient , ...|    0|
|called and no ans...|    0|
|i love this place...|    1|
|i had a great exp...|    1|
|the staff at chan...|    1|
|this mexican is i...|    0|
|the rooms are nic...|    1|
|some of the worst...|    0|
+--------------------+-----+
only showing top 20 rows

+--------------------+-----+
|               value|label|
+--------------------+-----+
|trust me when i s...|    0|
|i had a huge meat...|    0|
|love this place !...|    1|
|i love this place...|    1|
|if you can ignore...|    0|
|delicious food -l...|    0|
|we had a great t

In [133]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression


tokenizer = Tokenizer(inputCol="value", outputCol="tokens")
tf = HashingTF(numFeatures=2**16, inputCol="tokens", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=4)
pipeline = Pipeline(stages=[tokenizer, tf, idf])

idf_model = pipeline.fit(train)
train_df = idf_model.transform(train)
val_df = idf_model.transform(val)



In [134]:
lr = LogisticRegression(maxIter=100, labelCol='label')
lr_model = lr.fit(train_df)


In [135]:
predict_test = lr_model.transform(val_df)

Area under Curve / ROC:

In [136]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predict_test)

0.9781689747598837

So we have a good Area under Curve / ROC. This is very dangerous however since in real life fake and real reviews are imbalanced. There are fewer fake reviews than real reviews and we should thus use the f1-score or Area Under Curve / PR instead.

In [142]:
evaluator.evaluate(predict_test, {evaluator.metricName: "areaUnderPR"})

0.9748998646699216

In [144]:
val.count()

1197

If we create an imbalanced test set, the Area under the curve for precision-recall and the f1-score drops significantly few a 'few fake reviews' / 'a lot of real reviews' scenario.

In [205]:
def f1_score(df):
    tp = df.where((df.label == 1) & (df.prediction == 1)).count()
    tn = df.where((df.label == 0) & (df.prediction == 0)).count()
    fp = df.where((df.label == 0) & (df.prediction == 1.0)).count()
    fn = df.where((df.label == 1) & (df.prediction == 0)).count()
    p = tp / (tp + fp)
    r = tp / (tp + fn)
    return 2 * (p * r) / (p + r)

In [211]:
imbalanced_val = val.where(val.label==1).sample(0.5).union(val.where(val.label==0))
predict_imbalanced = lr_model.transform(idf_model.transform(imbalanced_val))
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predict_imbalanced))
print(evaluator.evaluate(predict_imbalanced, {evaluator.metricName: "areaUnderPR"}))
print(f1_score(predict_imbalanced))

0.9779574970484071
0.9521065292036168
0.8944723618090452


In [210]:
imbalanced_val = val.where(val.label==0).sample(0.5).union(val.where(val.label==1))
predict_imbalanced = lr_model.transform(idf_model.transform(imbalanced_val))
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predict_imbalanced))
print(evaluator.evaluate(predict_imbalanced, {evaluator.metricName: "areaUnderPR"}))
print(f1_score(predict_imbalanced))

0.9775343340082756
0.985803527150516
0.9567430025445293


Unfortunately, this test set isn't that large. We can try loading another 10k fake reviews which were created from context inputs the trained model has never seen. We can then try to achieve a 20:80 ratio of fake to real reviews (bear in mind we have no way of knowing whether the Yelp data set does not contain any fake reviews)

In [213]:
real_reviews = spark.read.json('yelp-dataset/yelp_academic_dataset_review.json').sample(False, 0.4, seed=42)
real_reviews = real_reviews.where(length(real_reviews.text) <= 100).limit(40000)
real_reviews = real_reviews.withColumn('text', regexp_replace(real_reviews.text, "[\\r\\n]", " "))
real_reviews = real_reviews.withColumn('text', regexp_replace(real_reviews.text, "[^\x00-\x7F]+", " "))
real_reviews.count()

40000

In [214]:
real_reviews.select(real_reviews.text).write.format('text').save('fake-reviews/real_reviews_40k.txt')
!cat fake-reviews/real_reviews_40k.txt/part-*.txt | CLASSPATH=~/dev/stanford-parser-full-2018-10-17/stanford-parser.jar java edu.stanford.nlp.process.PTBTokenizer -lowerCase -preserveLines > fake-reviews/tokenized_real_reviews_40k.txt
tokenized_real_reviews = spark.read.text('fake-reviews/tokenized_real_reviews_40k.txt')

PTBTokenizer tokenized 748407 tokens at 1647591.67 tokens per second.


In [216]:
tokenized_fake_reviews = spark.read.text('fake-reviews/generated_10k.txt')

In [217]:
tokenized_real_reviews = tokenized_real_reviews.withColumn('label', lit(0))
tokenized_fake_reviews = tokenized_fake_reviews.withColumn('label', lit(1))
test_reviews = tokenized_fake_reviews.union(tokenized_real_reviews)

In [219]:
test_reviews.count()

50000

In [218]:
predict_test = lr_model.transform(idf_model.transform(test_reviews))
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print(evaluator.evaluate(predict_test))
print(evaluator.evaluate(predict_test, {evaluator.metricName: "areaUnderPR"}))
print(f1_score(predict_test))

0.9749909325000622
0.8945515982757801
0.8344529963963163


So at first glance, the F1-Score isn't all that bad, we can also look at the averaged F1-Score and print the underlying statistics.

In [237]:
def f1_score_(df, p=1, n=0):
    tp = df.where((df.label == p) & (df.prediction == p)).count()
    tn = df.where((df.label == n) & (df.prediction == n)).count()
    fp = df.where((df.label == n) & (df.prediction == p)).count()
    fn = df.where((df.label == p) & (df.prediction == n)).count()
    print('tp {}, fp {}, tn {}, fn {}'.format(tp, fp, tn, fn))
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print('precision', precision)
    print('recall', recall)
    return 2 * (precision * recall) / (precision + recall)

In [238]:
print('avg f1', (f1_score_(predict_test, 1, 0) + f1_score_(predict_test, 0, 1))/2)

tp 9378, fp 3099, tn 36901, fn 622
precision 0.7516229862947824
recall 0.9378
tp 36901, fp 622, tn 9378, fn 3099
precision 0.983423500253178
recall 0.922525
avg f1 0.8932271689668332


Looking at the precision and recall, the question is whether we could live with these high misclassification rates in a production environment. What is more important: preventing as many fake reviews as possible, or letting through some fake reviews in order to not annoy users?