# PySpark NLP Tutorial

Upload this Jupyter notebook to Google Drive, then open it with Google Colab.

In [None]:
# install the dependencies:
%env spark_version=2.4.4
%env hadoop_version=2.7

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-${spark_version}/spark-${spark_version}-bin-hadoop${hadoop_version}.tgz
!tar xf spark-${spark_version}-bin-hadoop${hadoop_version}.tgz

!python --version

In [None]:
# Point the process environment at the JDK and at the Spark distribution
# unpacked into the current working directory.
import os

current_directory = os.getcwd()

os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Version strings come from the process environment (spark_version / hadoop_version).
spark_version = os.environ["spark_version"]
hadoop_version = os.environ["hadoop_version"]
os.environ["SPARK_HOME"] = "{}/spark-{}-bin-hadoop{}".format(
    current_directory,
    spark_version,
    hadoop_version,
)


In [None]:
# install findspark pyspark
!pip install findspark
!pip install pyspark

In [None]:
# Locate Spark and start a local session.
import findspark
findspark.init() # os.environ["SPARK_HOME"]
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Application name and a local master; the setters mutate `conf` in place.
conf = SparkConf()
conf.setAppName('pyspark-nlp-app')
conf.setMaster("local[*]")

# Reuse an existing session if there is one, otherwise create it from `conf`.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()
spark

In [None]:
# Download the train and test files into the current working directory.
import requests

url = "https://github.com/liuhoward/teaching/raw/master/big_data/smsspam/"
train_file = "SMSSpamCollection.train"
test_file = "SMSSpamCollection.test"

for file_name in (train_file, test_file):
    r = requests.get(url + file_name)
    # Fail loudly on an HTTP error instead of saving an error page as data.
    r.raise_for_status()
    # The context manager closes the file handle as soon as the write is done.
    with open(file_name, 'wb') as out_file:
        out_file.write(r.content)


In [None]:
# load train data

# Import only the schema types that are used instead of a wildcard import,
# so it is clear where each name comes from.
from pyspark.sql.types import StringType, StructField, StructType

# define schema: two string columns, "category" first and "text" second
data_schema = StructType([
    StructField("category", StringType()),
    StructField("text", StringType())])

# read the tab-separated train file with the explicit schema
# (header=None leaves the reader's default header handling in place)
train_data = spark.read.csv(train_file, schema=data_schema, sep='\t', header=None)
print(type(train_data))

In [None]:
# Inspect the loaded training data.
train_data.printSchema()           # column names and types
train_data.show(5, truncate=True)  # first 5 rows, long cells truncated
train_data.count()                 # number of examples (displayed as the cell result)


In [None]:
# Lower-case the "text" column; "category" is passed through unchanged.

from pyspark.sql.functions import lower, col

# Keep the column name "text" for the lower-cased values.
lowered_text = lower(col('text')).alias('text')
lower_train = train_data.select('category', lowered_text)

lower_train.show(5, truncate=80)

In [None]:
# Tokenize: turn the "text" column into a "words" column.

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf

# Alternative tokenizer, left disabled:
#tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
tokenizer = Tokenizer(inputCol="text", outputCol="words")

# Apply the tokenizer, then keep only the label and the token column.
tokenized_all_columns = tokenizer.transform(lower_train)
tokenized_train = tokenized_all_columns.select("category", "words")

tokenized_train.show(5, truncate=80)

In [None]:
# Remove stop words from the "words" column, producing "filtered".

from pyspark.ml.feature import StopWordsRemover

# No explicit stop-word list is passed, so the remover's defaults apply.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

without_stopwords = remover.transform(tokenized_train)
filtered_train = without_stopwords.select("category", "filtered")

filtered_train.show(5, truncate=80)

In [None]:
# Convert the filtered tokens into a count-vector "features" column.

from pyspark.ml.feature import CountVectorizer

# Vocabulary is fitted on the training corpus with vocabSize=5000 and minDF=2.0.
vectorizer = CountVectorizer(
    inputCol="filtered",
    outputCol="features",
    vocabSize=5000,
    minDF=2.0,
)
cv_model = vectorizer.fit(filtered_train)

# Vectorize the training set and keep only the label and the feature vector.
vectorized_train = cv_model.transform(filtered_train)
train_feature = vectorized_train.select("category", "features")

train_feature.show(5, truncate=80)

In [None]:
# Convert the string "category" column into a numeric "label" column.

from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="category", outputCol="label")
# The category-to-index mapping is learned from the training data.
index_model = indexer.fit(train_feature)

indexed_train = index_model.transform(train_feature)
train_xy = indexed_train.select("features", "label")

train_xy.show(5, truncate=80)


In [None]:
# Preprocess the test data with the same steps as the training data, reusing
# the tokenizer, stop-word remover, vocabulary and label index built above.

test_data = spark.read.csv(test_file, schema=data_schema, sep='\t', header=None)

# lower case
lowered_test_text = lower(col('text')).alias('text')
lower_test = test_data.select('category', lowered_test_text)

# tokenize
tokenized_test = (
    tokenizer.transform(lower_test)
    .select("category", "words")
)

# remove stopwords
filtered_test = (
    remover.transform(tokenized_test)
    .select("category", "filtered")
)

# count vectors from the vocabulary fitted on the training set
test_feature = (
    cv_model.transform(filtered_test)
    .select("category", "features")
)

# label index fitted on the training set
test_xy = (
    index_model.transform(test_feature)
    .select("features", "label")
)

test_xy.show(5)


In [None]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression with default hyper-parameters; the column names are
# spelled out here and match the estimator's defaults.
cls = LogisticRegression(featuresCol="features", labelCol="label")

# Fit on the vectorized, indexed training set.
lrModel = cls.fit(train_xy)

In [None]:
# Score the test set with the fitted model and preview the first rows.
predictions = lrModel.transform(test_xy)
predictions.show(5, truncate=20)

In [None]:
# Evaluate the area under the ROC curve on the test predictions.

from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")

# The metric is chosen through a param map passed at evaluation time.
auc_params = {evaluator.metricName: "areaUnderROC"}
evaluator.evaluate(predictions, auc_params)

In [None]:
# Evaluate accuracy, plus precision / recall / F1 for label 1.0.

from pyspark.mllib.evaluation import MulticlassMetrics

# MulticlassMetrics takes an RDD, so go from the two selected columns to .rdd.
prediction_label_df = predictions.select("prediction", "label")
predictionAndLabels = prediction_label_df.rdd
metrics = MulticlassMetrics(predictionAndLabels)

print(f"accuracy: {metrics.accuracy}")
print(f"precision: {metrics.precision(1.0)}")
print(f"recall: {metrics.recall(1.0)}")
# second argument is beta
print(f"f1 score: {metrics.fMeasure(1.0, 1.0)}")

In [None]:
# Count training examples per label to show the class imbalance.

label_counts_spark = train_xy.groupby('label').count()
# Collect the small result into a pandas DataFrame for display.
train_label_count = label_counts_spark.toPandas()
train_label_count

In [None]:
# add weight to handle imbalanced classes

# Look the counts up by label value, not by row position: the row order of a
# Spark groupby result is not guaranteed, so positional rows 0 and 1 do not
# necessarily hold labels 0 and 1, and the ratio could come out inverted.
count_by_label = train_label_count.set_index('label')['count']

# Number of label-0 examples per label-1 example, as a plain Python float.
ratio = float(count_by_label.loc[0.0] / count_by_label.loc[1.0])

ratio

In [None]:
from pyspark.sql.functions import when

def weight_balance(labels):
    """Build a per-row weight expression from a label column.

    Rows whose label equals 1 get the notebook-level ``ratio`` as their
    weight; every other row gets a weight of 1.

    Args:
        labels: the label column to test.

    Returns:
        A column expression holding the weight for each row.
    """
    is_label_one = (labels == 1)
    weight_column = when(is_label_one, ratio)
    return weight_column.otherwise(1)

# Attach a 'weights' column derived from each row's label.
label_column = col('label')
train_xy_weight = train_xy.withColumn('weights', weight_balance(label_column))

train_xy_weight.show(5)

In [None]:
# Refit logistic regression, this time reading per-row weights from the
# 'weights' column. Feature and label column names match the defaults.
cls = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    weightCol="weights",
)
lrModel = cls.fit(train_xy_weight)

# Score the same test set as before and preview the first rows.
predictions = lrModel.transform(test_xy)
predictions.show(5, truncate=20)

In [None]:
# Compute accuracy, plus precision / recall / F1 for label 1.0, on the
# current `predictions`.
prediction_label_df = predictions.select("prediction", "label")
predictionAndLabels = prediction_label_df.rdd
metrics = MulticlassMetrics(predictionAndLabels)

print(f"accuracy: {metrics.accuracy}")
print(f"precision: {metrics.precision(1.0)}")
print(f"recall: {metrics.recall(1.0)}")
# second argument is beta
print(f"f1 score: {metrics.fMeasure(1.0, 1.0)}")

Exercise: try `RegexTokenizer` instead of `Tokenizer` in the tokenize step.

In [None]:
# stop: shut down the Spark session created earlier in this notebook
spark.stop()