In [1]:
# !pip3 install unidecode
# !pip3 install scikit-learn==0.23.2 scipy==1.5.2
# !wget https://f000.backblazeb2.com/file/malay-dataset/emotion/emotion-twitter-lexicon.json

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json
import random
import re
from unidecode import unidecode
from sklearn.model_selection import train_test_split

In [3]:
# Load the emotion lexicon downloaded in the first cell. JSON text is UTF-8,
# so the encoding is stated explicitly instead of relying on the platform
# default (which is not UTF-8 on every OS/locale).
with open('emotion-twitter-lexicon.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [4]:
# Label names in the lexicon's key order; a label's position in this list is
# used as its integer class id further down.
labels = list(data)
labels

['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

In [5]:
def cleaning(string):
    """
    Normalise a raw tweet and split it into lowercase tokens.

    Steps, in order:

    1. Transliterate the text to ASCII with ``unidecode``.
    2. Rewrite the literal ``(dot)`` as ``.``.
    3. If the first ``<a ...>`` tag has an ``href`` among its attributes,
       remove that attribute text (every literal occurrence of it), which
       leaves a bare ``<a>``.
    4. Replace anything shaped like ``scheme://host/path`` with a space.
    5. Surround each of ``. , / *`` with spaces so they become own tokens.
    6. Lowercase, split on whitespace and drop tokens starting with ``@``.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    list of str
        The cleaned tokens (possibly empty).
    """
    string = unidecode(string)
    string = re.sub(r'\(dot\)', '.', string)

    # Strip the attributes of the first anchor tag. The attribute text is
    # removed literally: feeding it to re.sub as a pattern breaks on URLs,
    # because characters such as '?', '+' or '(' are regex syntax (the match
    # silently fails, or re.error is raised). The same `<a ` pattern is used
    # for both the check and the removal so an earlier tag like `<abbr>`
    # cannot be picked up by mistake.
    anchor = re.search(r'<a (.*?)>', string)
    if anchor is not None and 'href' in anchor.group(1):
        string = string.replace(' ' + anchor.group(1), '')

    string = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', ' ', string
    )

    chars = '.,/*'
    for c in chars:
        string = string.replace(c, f' {c} ')

    # str.split() with no argument already collapses whitespace runs and
    # ignores leading/trailing whitespace, and never yields empty tokens.
    tokens = string.lower().split()
    return [w for w in tokens if not w.startswith('@')]

In [6]:
# Build a class-balanced list of (label_id, tokens) pairs.
# A dedicated, seeded RNG makes the drawn sample identical on every
# "Restart & Run All" without touching the global `random` state.
SAMPLES_PER_LABEL = 10000
sampler = random.Random(42)  # arbitrary fixed seed, for reproducibility only

emotion = []
for k, v in data.items():
    label_id = labels.index(k)  # looked up once per label, not once per tweet
    # Cap at the number of available texts: random.sample raises ValueError
    # when asked for more items than the population holds.
    sample = sampler.sample(v, min(SAMPLES_PER_LABEL, len(v)))
    emotion.extend((label_id, cleaning(t)) for t in sample)

In [8]:
# Hold out 10% of the samples for evaluation. A fixed random_state makes the
# split, and therefore the scores reported below, reproducible across runs.
train_emotion, test_emotion = train_test_split(
    emotion, test_size=0.1, random_state=42
)
len(train_emotion), len(test_emotion)

(54000, 6000)

In [9]:
# Reuse the active Spark session if there is one, otherwise start a new one
# under the application name 'nlp'.
sess = (
    SparkSession.builder
    .appName('nlp')
    .getOrCreate()
)

22/02/10 14:50:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [10]:
# Turn the (label_id, tokens) tuples into Spark DataFrames with the column
# names the pipelines below read: 'label' and 'words'.
train_df = sess.createDataFrame(train_emotion, schema=['label', 'words'])
test_df = sess.createDataFrame(test_emotion, schema=['label', 'words'])

In [11]:
# Token counts -> TF-IDF weighting -> logistic-regression classifier.
pipeline = Pipeline(
    stages=[
        CountVectorizer(outputCol='tf', inputCol='words'),
        IDF(outputCol='tfidf', inputCol='tf'),
        LogisticRegression(regParam=1.0, featuresCol='tfidf'),
    ]
)

In [12]:
# Fit every pipeline stage on the training split.
model = pipeline.fit(dataset=train_df)

22/02/10 14:50:56 WARN TaskSetManager: Stage 0 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:51:06 WARN TaskSetManager: Stage 4 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:51:11 WARN DAGScheduler: Broadcasting large task binary with size 1452.7 KiB
22/02/10 14:51:11 WARN TaskSetManager: Stage 5 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:51:15 WARN DAGScheduler: Broadcasting large task binary with size 1454.1 KiB
22/02/10 14:51:15 WARN TaskSetManager: Stage 6 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:51:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/02/10 14:51:18 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/02/10 14:51:19 WARN DAGScheduler: Broadcasti

22/02/10 14:51:35 WARN DAGScheduler: Broadcasting large task binary with size 1454.1 KiB
22/02/10 14:51:35 WARN TaskSetManager: Stage 39 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:51:35 WARN DAGScheduler: Broadcasting large task binary with size 1454.1 KiB
22/02/10 14:51:35 WARN TaskSetManager: Stage 40 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.


In [14]:
# Score the held-out split. The evaluator's defaults are spelled out so it is
# clear that the number shown is the F1 score, not accuracy.
prediction = model.transform(dataset=test_df)
MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='f1',
).evaluate(prediction)

22/02/10 14:52:09 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
                                                                                

0.8918511148081394

In [15]:
# Same TF-IDF features as before, this time feeding a multinomial Naive Bayes
# classifier. Note: this rebinds `pipeline`.
pipeline = Pipeline(
    stages=[
        CountVectorizer(outputCol='tf', inputCol='words'),
        IDF(outputCol='tfidf', inputCol='tf'),
        NaiveBayes(
            featuresCol='tfidf',
            modelType='multinomial',
            smoothing=1.0,
        ),
    ]
)

In [16]:
# Fit the current pipeline on the training split; this rebinds `model`.
model = pipeline.fit(dataset=train_df)

22/02/10 14:52:36 WARN TaskSetManager: Stage 43 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:52:40 WARN TaskSetManager: Stage 47 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:52:44 WARN DAGScheduler: Broadcasting large task binary with size 1456.6 KiB
22/02/10 14:52:44 WARN TaskSetManager: Stage 48 contains a task of very large size (4399 KiB). The maximum recommended task size is 1000 KiB.
22/02/10 14:52:47 WARN DAGScheduler: Broadcasting large task binary with size 1453.5 KiB
                                                                                

In [29]:
# Score the held-out split with the current `model`. The evaluator's defaults
# are spelled out so it is clear that the number shown is the F1 score.
prediction = model.transform(dataset=test_df)
MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='f1',
).evaluate(prediction)

22/02/10 14:54:59 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
                                                                                

0.7710925990576287

In [31]:
# Persist the fitted pipeline to ./model. A plain `model.save('model')` fails
# when the directory already exists, so overwrite it explicitly; that keeps
# this cell re-runnable without a manual `rm -rf model` beforehand.
model.write().overwrite().save('model')

22/02/10 14:56:06 WARN TaskSetManager: Stage 86 contains a task of very large size (2672 KiB). The maximum recommended task size is 1000 KiB.
                                                                                

In [32]:
from pyspark.ml.pipeline import PipelineModel

In [33]:
# Read the fitted pipeline back from the ./model directory written above.
model_load = PipelineModel.load(path='model')

In [34]:
# Score the held-out split with the reloaded pipeline (F1 score, the
# evaluator's default metric, stated explicitly).
prediction = model_load.transform(dataset=test_df)
MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='label',
    metricName='f1',
).evaluate(prediction)

22/02/10 14:56:28 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
                                                                                

0.7710925990576287