In [None]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

import os
import pyspark
import sparknlp
import pandas as pd
import string, re

from pymongo import MongoClient

from sparknlp.base import *
from sparknlp.annotator import *
from pyspark.ml import Pipeline, Transformer

from pyspark.sql.functions import udf, col, when
from pyspark.sql.types import ArrayType, StringType

import nltk
# Make sure the NLTK stop-word corpus is available locally before importing it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stop-word list; handed to StopWordsCleaner when the annotators are built.
eng_stopwords = stopwords.words('english')
# ASCII punctuation extended with typographic quotes, dash, ellipsis and double-exclamation characters.
punctuation = string.punctuation+'“’”...—…‼‘'

# Start the Spark NLP session. gpu=True asks for the GPU-enabled setup —
# NOTE(review): confirm the runtime actually has a GPU attached.
spark = sparknlp.start(gpu=True)

--2022-12-14 05:35:04--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2022-12-14 05:35:04--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2022-12-14 05:35:04--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Fetch Data from MongoDB

In [None]:
# The connection string (including user name and password) is read from the
# environment so that credentials are never stored in the notebook itself.
mongo_uri = os.environ.get("MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to the MongoDB connection string "
        "(e.g. mongodb://<user>:<password>@<host>:27017/) before running this cell."
    )

client = MongoClient(mongo_uri)
proj_db = client.project

# One collection per dataset split.
constraint_train = proj_db.constraint_train
constraint_val = proj_db.constraint_val
constraint_test = proj_db.constraint_test

In [None]:
def _collection_to_pandas(collection):
    """Read every document of a MongoDB collection into a pandas frame, minus the `_id` column."""
    documents = list(collection.find())
    return pd.DataFrame(documents).drop(['_id'], axis=1)


# Pull each split into pandas first ...
train_pandasDF = _collection_to_pandas(constraint_train)
val_pandasDF = _collection_to_pandas(constraint_val)
test_pandasDF = _collection_to_pandas(constraint_test)

# ... then hand the frames over to Spark.
trainData = spark.createDataFrame(train_pandasDF)
valData = spark.createDataFrame(val_pandasDF)
testData = spark.createDataFrame(test_pandasDF)

## Preprocessing

In [None]:
class CustomTransformer(Transformer, DefaultParamsReadable, DefaultParamsWritable):
    """Common base class for the hand-written DataFrame stages used in this notebook."""

    # Lazy workaround: a transformer is expected to expose these param
    # bookkeeping attributes, so provide class-level fallbacks for them.
    _params = {}
    _paramMap = {}
    _defaultParamMap = {}

    def __init__(self):
        super().__init__()

class NullDropper(CustomTransformer):
  """Pipeline stage that removes rows containing null values.

  Args:
    cols: optional list of column names to inspect. When None (the default)
      a null in any column drops the row.
  """
  def __init__(self, cols=None):
    super(NullDropper, self).__init__()
    self.cols = cols

  def _transform(self, data):
    # Honour the configured columns; subset=None keeps the original
    # "check every column" behaviour.
    return data.dropna(subset=self.cols)

class LabelEncoder(CustomTransformer):
  """Pipeline stage that rewrites the `label` column as a number.

  Rows whose label equals "real" become 0.0; every other row becomes 1.0.
  The `cols` argument is stored but not consulted by `_transform`.
  """
  def __init__(self, cols=None):
    super().__init__()
    self.cols = cols

  def _transform(self, data):
    is_real = col("label") == "real"
    encoded_label = when(is_real, 0.0).otherwise(1.0)
    return data.withColumn("label", encoded_label)

class Cleaner(CustomTransformer):
  """Pipeline stage that copies `tweet` into a new `text` column with links removed.

  Args:
    cols: stored for parity with the other custom stages; not consulted by
      `_transform`, which always reads `tweet` and writes `text`.
  """
  def __init__(self, cols=None):
    # Initialise the Transformer base like the sibling stages do, so the
    # instance gets its own uid and param maps.
    super(Cleaner, self).__init__()
    self.cols = cols

  def _transform(self, data):
    def filter_out_urls(words):
      """Drop space-separated tokens that start with an http(s) scheme."""
      # A null tweet would otherwise fail inside the UDF with an AttributeError.
      if words is None:
        return None
      # Plain-http links are removed as well as https ones.
      kept = [word for word in words.split(" ")
              if not word.startswith(("http:", "https:"))]
      return " ".join(kept)

    udf_filter_urls = udf(filter_out_urls, StringType())
    return data.withColumn("text", udf_filter_urls(col("tweet")))

In [None]:
# Custom DataFrame-level stages.
nullDroper = NullDropper()
labelEncoder = LabelEncoder()
cleaner = Cleaner()

# Spark NLP annotators; each one consumes the output column of the previous one.
documentAssembler = (
    DocumentAssembler()
    .setInputCol('text')
    .setOutputCol('document')
)

tokenizer = (
    Tokenizer()
    .setInputCols(["document"])
    .setOutputCol("token")
)

# Raw string: the pattern contains regex escapes (\w, \d, \s) that are not valid
# Python string escapes and would otherwise trigger an invalid-escape warning.
normalizer = (
    Normalizer()
    .setInputCols("token")
    .setOutputCol("normalized")
    .setLowercase(True)
    .setCleanupPatterns([r"[^\w\d\s]"])
)

stemmer = (
    Stemmer()
    .setInputCols(["normalized"])
    .setOutputCol("stemmed")
)

stopwordsCleaner = (
    StopWordsCleaner()
    .setInputCols(['stemmed'])
    .setOutputCol('stopremoved')
    .setCaseSensitive(False)
    .setStopWords(eng_stopwords)
)

# Joins the cleaned tokens back into one space-separated string column.
finisher = (
    Finisher()
    .setInputCols("stopremoved")
    .setOutputCols("sentence")
    .setOutputAsArray(False)
    .setAnnotationSplitSymbol(" ")
)

In [None]:
# Stages run in list order: row-level cleanup first, then the Spark NLP annotators.
preprocess_stages = [
    nullDroper,
    labelEncoder,
    cleaner,
    documentAssembler,
    tokenizer,
    normalizer,
    stemmer,
    stopwordsCleaner,
]
preprocessPipeline = Pipeline(stages=preprocess_stages)

# Fit on the training split only, then apply the fitted model to both splits.
preprocessModel = preprocessPipeline.fit(trainData)

train = preprocessModel.transform(trainData)
val = preprocessModel.transform(valData)

In [None]:
# Preview a handful of rows with bounded column width; the full annotation
# structs are far too wide to print untruncated.
train.select("stopremoved").show(5, truncate=120)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Training & Evaluation

In [None]:
class F1BinaryEvaluator():
    """F1 score of one class of a binary classifier, computed from DataFrame row counts.

    Args:
        predCol: name of the column holding the predicted class.
        labelCol: name of the column holding the true class.
        metricLabel: the class value treated as "positive". Numbers are compared
            as numeric literals; strings are compared as quoted SQL string literals.
    """

    def __init__(self, predCol="prediction", labelCol="label", metricLabel=1.0):
        self.labelCol = labelCol
        self.predCol = predCol
        self.metricLabel = metricLabel

    def isLargerBetter(self):
        """Return True: a higher F1 score means a better model."""
        return True

    def _metric_literal(self):
        """Render metricLabel as a literal usable inside a SQL filter expression."""
        if isinstance(self.metricLabel, str):
            # Quote string labels; unquoted they would be parsed as column names.
            escaped = self.metricLabel.replace("\\", "\\\\").replace("'", "\\'")
            return "'" + escaped + "'"
        return str(self.metricLabel)

    def evaluate(self, dataframe):
        """Return F1 = tp / (tp + 0.5 * (fp + fn)) for the positive class.

        Returns 0.0 when there are no true positives, false positives or false
        negatives at all (the score is undefined), instead of raising
        ZeroDivisionError.
        """
        target = self._metric_literal()
        label, pred = self.labelCol, self.predCol

        tp = dataframe.filter(f"{label} = {target} and {pred} = {target}").count()
        fp = dataframe.filter(f"{label} != {target} and {pred} = {target}").count()
        fn = dataframe.filter(f"{label} = {target} and {pred} != {target}").count()

        denominator = tp + (.5 * (fn + fp))
        if denominator == 0:
            return 0.0
        return tp / denominator

### With Glove 100D
Best validation F1 score: **85.31%**

In [None]:
# Token-level word embeddings; pretrained() is called on the class so no throwaway
# annotator instance is created. With no arguments it loads the default model.
glove_embeddings = (
    WordEmbeddingsModel.pretrained()
    .setInputCols(["document", 'stopremoved'])
    .setOutputCol("embeddings")
    .setCaseSensitive(False)
)

# Average the token vectors into one vector per document.
embeddingsSentence = (
    SentenceEmbeddings()
    .setInputCols(["document", "embeddings"])
    .setOutputCol("sentence_embeddings")
    .setPoolingStrategy("AVERAGE")
)

# Classifier trained on the pooled embeddings; training logs are written under 'logs'.
docClassifier = (
    ClassifierDLApproach()
    .setInputCols("sentence_embeddings")
    .setOutputCol("category")
    .setLabelColumn("label")
    .setBatchSize(32)
    .setMaxEpochs(200)
    .setLr(5e-3)
    .setDropout(0.5)
    .setEnableOutputLogs(True)
    .setOutputLogsPath('logs')
)

glove_100d download started this may take some time.
Approximate size to download 145.3 MB
[OK!]


In [None]:
# Embedding lookup -> pooling -> classifier, fitted on the preprocessed training split.
glove_stages = [glove_embeddings, embeddingsSentence, docClassifier]
trainPipeline = Pipeline(stages=glove_stages)

model = trainPipeline.fit(train)

In [None]:
# The log file name embeds a per-run id, so locate the most recently written
# ClassifierDLApproach log instead of hard-coding one run's file name.
log_dir = "/content/logs"
classifier_logs = [
    os.path.join(log_dir, name)
    for name in os.listdir(log_dir)
    if name.startswith("ClassifierDLApproach_") and name.endswith(".log")
]
if not classifier_logs:
    raise FileNotFoundError(f"no ClassifierDLApproach_*.log file found in {log_dir}")

with open(max(classifier_logs, key=os.path.getmtime), "r") as log_file:
    print(log_file.read())

Training started - epochs: 200 - learning_rate: 0.005 - batch_size: 32 - training_examples: 6420 - classes: 2
Epoch 0/200 - 0.62s - loss: 100.23262 - acc: 0.79815626 - batches: 201
Epoch 1/200 - 0.45s - loss: 94.01349 - acc: 0.8574687 - batches: 201
Epoch 2/200 - 0.45s - loss: 89.5342 - acc: 0.8759062 - batches: 201
Epoch 3/200 - 0.46s - loss: 86.65842 - acc: 0.8859062 - batches: 201
Epoch 4/200 - 0.45s - loss: 86.37572 - acc: 0.89325 - batches: 201
Epoch 5/200 - 0.44s - loss: 82.83719 - acc: 0.8990312 - batches: 201
Epoch 6/200 - 0.45s - loss: 81.03305 - acc: 0.9043437 - batches: 201
Epoch 7/200 - 0.43s - loss: 79.79601 - acc: 0.90731245 - batches: 201
Epoch 8/200 - 0.45s - loss: 79.5243 - acc: 0.91012496 - batches: 201
Epoch 9/200 - 0.47s - loss: 78.89721 - acc: 0.91231245 - batches: 201
Epoch 10/200 - 0.47s - loss: 77.48657 - acc: 0.9149687 - batches: 201
Epoch 11/200 - 0.45s - loss: 76.67535 - acc: 0.916375 - batches: 201
Epoch 12/200 - 0.46s - loss: 76.38347 - acc: 0.91709375 - ba

#### Evaluation

In [None]:
# Score the preprocessed validation split and show true labels next to predicted classes.
result = model.transform(val)
label_vs_prediction = result.select("label", "category.result")
label_vs_prediction.show(truncate=False)

+-----+------+
|label|result|
+-----+------+
|1.0  |[1.0] |
|1.0  |[0.0] |
|0.0  |[0.0] |
|0.0  |[0.0] |
|0.0  |[0.0] |
|0.0  |[0.0] |
|1.0  |[1.0] |
|1.0  |[1.0] |
|0.0  |[1.0] |
|1.0  |[1.0] |
|1.0  |[1.0] |
|0.0  |[1.0] |
|1.0  |[1.0] |
|1.0  |[1.0] |
|0.0  |[0.0] |
|1.0  |[1.0] |
|1.0  |[1.0] |
|0.0  |[0.0] |
|0.0  |[0.0] |
|0.0  |[0.0] |
+-----+------+
only showing top 20 rows



In [None]:
# `category.result` is an array; its first element is used as the predicted class.
scored_glove = (
    result
    .select("label", "category.result")
    .withColumn("prediction", col("result")[0])
)

f1_evaluator = F1BinaryEvaluator()
f1_evaluator.evaluate(scored_glove)

0.8531400966183574

### With Universal Sentence Encoder
Best validation F1 score: **91.89%**

In [None]:
# The raw text to classify lives in the `tweet` column.
document = (
    DocumentAssembler()
    .setInputCol("tweet")
    .setOutputCol("document")
)

# A sentence detector could be added before this stage to train on, and get
# predictions for, each sentence separately.
use = (
    UniversalSentenceEncoder.pretrained("tfhub_use_lg", "en")
    .setInputCols("document")
    .setOutputCol("sentence_embeddings")
)

# The target classes are read from the `label` column; predictions are written to `category`.
classsifierdl = (
    ClassifierDLApproach()
    .setInputCols(["sentence_embeddings"])
    .setOutputCol("category")
    .setLabelColumn("label")
    .setMaxEpochs(10)
    .setLr(0.001)
    .setBatchSize(32)
    .setEnableOutputLogs(True)
    .setOutputLogsPath('logs')
)

tfhub_use_lg download started this may take some time.
Approximate size to download 753.3 MB
[OK!]


In [None]:
# NOTE: this rebinds `trainPipeline` and `model`, replacing the GloVe pipeline/model objects.
trainPipeline = Pipeline(stages = [
    # The nullDroper and labelEncoder stages are disabled here, so the pipeline is
    # fit on trainData exactly as loaded (the label column is not re-encoded).
    document,
    use,
    classsifierdl
])

model = trainPipeline.fit(trainData)

In [None]:
# Overwrite any model saved by a previous run so the cell can be re-executed safely.
model.write().overwrite().save("model")

In [None]:
import shutil
# Zip the contents of the saved "model" directory. make_archive appends the
# ".zip" extension itself, so the base name "model" yields "model.zip".
shutil.make_archive("model", 'zip', "model")

'/content/model.zip.zip'

In [None]:
from google.colab import files
# make_archive("model", 'zip', ...) writes "model.zip"; download that file
# (there is no "model.zip.zip" unless the archive base name already ends in ".zip").
files.download("model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# The log file name embeds a per-run id, so locate the most recently written
# ClassifierDLApproach log instead of hard-coding one run's file name.
log_dir = "/content/logs"
classifier_logs = [
    os.path.join(log_dir, name)
    for name in os.listdir(log_dir)
    if name.startswith("ClassifierDLApproach_") and name.endswith(".log")
]
if not classifier_logs:
    raise FileNotFoundError(f"no ClassifierDLApproach_*.log file found in {log_dir}")

with open(max(classifier_logs, key=os.path.getmtime), "r") as log_file:
    print(log_file.read())

Training started - epochs: 200 - learning_rate: 0.001 - batch_size: 32 - training_examples: 6420 - classes: 2
Epoch 0/200 - 0.62s - loss: 89.49148 - acc: 0.8646562 - batches: 201
Epoch 1/200 - 0.46s - loss: 80.39503 - acc: 0.9135625 - batches: 201
Epoch 2/200 - 0.45s - loss: 79.05554 - acc: 0.9227812 - batches: 201
Epoch 3/200 - 0.46s - loss: 78.23068 - acc: 0.92809373 - batches: 201
Epoch 4/200 - 0.45s - loss: 77.692825 - acc: 0.931625 - batches: 201
Epoch 5/200 - 0.46s - loss: 77.269775 - acc: 0.9335 - batches: 201
Epoch 6/200 - 0.46s - loss: 76.92266 - acc: 0.93553126 - batches: 201
Epoch 7/200 - 0.46s - loss: 76.63774 - acc: 0.937875 - batches: 201
Epoch 8/200 - 0.46s - loss: 76.40783 - acc: 0.9385 - batches: 201
Epoch 9/200 - 0.45s - loss: 76.21756 - acc: 0.93990624 - batches: 201
Epoch 10/200 - 0.47s - loss: 76.05511 - acc: 0.9413125 - batches: 201
Epoch 11/200 - 0.44s - loss: 75.91445 - acc: 0.94256246 - batches: 201
Epoch 12/200 - 0.45s - loss: 75.79178 - acc: 0.9431875 - batch

#### Evaluation

In [None]:
# Score the raw validation split and show true labels next to predicted classes.
result = model.transform(valData)
label_vs_prediction = result.select("label", "category.result")
label_vs_prediction.show(truncate=False)

+-----+------+
|label|result|
+-----+------+
|fake |[fake]|
|fake |[fake]|
|real |[real]|
|real |[real]|
|real |[real]|
|real |[real]|
|fake |[fake]|
|fake |[fake]|
|real |[real]|
|fake |[fake]|
|fake |[fake]|
|real |[real]|
|fake |[fake]|
|fake |[fake]|
|real |[real]|
|fake |[fake]|
|fake |[fake]|
|real |[real]|
|real |[real]|
|real |[real]|
+-----+------+
only showing top 20 rows



In [None]:
# This model is fit on the raw string labels ("real"/"fake"), so both the label and the
# predicted class are strings. Comparing them with the evaluator's default numeric
# metricLabel (1.0) matches no rows and ends in a division by zero. Map both columns to
# the same encoding used for the GloVe model first: "real" -> 0.0, anything else -> 1.0.
scored_use = (
    result
    .select("label", "category.result")
    .withColumn("prediction", when(col("result")[0] == "real", 0.0).otherwise(1.0))
    .withColumn("label", when(col("label") == "real", 0.0).otherwise(1.0))
)

f1_evaluator = F1BinaryEvaluator()
f1_evaluator.evaluate(scored_use)

ZeroDivisionError: ignored