# Spark - How to Use Machine Learning Pipeline

In [4]:
from pyspark import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.ml import Pipeline
import pyspark.sql.types as tp
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.ml.classification import LogisticRegression

### Spark Initialization

In [2]:
# Initialize Spark. getOrCreate() reuses an already-running session, so this
# cell can be re-run without restarting the kernel (constructing SparkContext
# directly raises "Cannot run multiple SparkContexts at once" the second time).
spark = (
    SparkSession.builder
    .master("local[2]")  # local[n] means n threads can be used
    .appName("spark streaming twitter sentiment")
    .getOrCreate()
)
# Expose the underlying context under the original name for any code that uses `sc`.
sc = spark.sparkContext

### Load Data

In [5]:
# Read the CSV using its header row for column names, then cast the two
# numeric columns from their as-read type to integers.
df = spark.read.csv('twitter_sentiment.csv', header=True)
for int_col in ("label", "id"):
    df = df.withColumn(int_col, df[int_col].cast(tp.IntegerType()))

# Quick sanity check: first rows and the resulting schema.
df.show(5)
df.printSchema()

+---+-----+--------------------+
| id|label|               tweet|
+---+-----+--------------------+
|  1|    0| @user when a fat...|
|  2|    0|@user @user thank...|
|  3|    0|  bihday your maj...|
|  4|    0|#model   i love u...|
|  5|    0| factsguide: soci...|
+---+-----+--------------------+
only showing top 5 rows

root
 |-- id: integer (nullable = true)
 |-- label: integer (nullable = true)
 |-- tweet: string (nullable = true)



In [6]:
print(df.count())

# Split both halves on the same `id` threshold so they are disjoint by
# construction. The previous `df.limit(...)` for the training set carries no
# ordering guarantee, so it could in principle pick rows that also appear in
# the test set. Assumes ids are unique and start at 1 -- TODO confirm.
TRAIN_MAX_ID = 31000

training_df = df.filter(df['id'] <= TRAIN_MAX_ID)
print(training_df.count())

testing_df = df.filter(df['id'] > TRAIN_MAX_ID)
print(testing_df.count())
# Show the id range covered by the test split as a sanity check.
print(testing_df.groupBy().min('id').collect(), testing_df.groupBy().max('id').collect())

31962
31000
962
[Row(min(id)=31001)] [Row(max(id)=31962)]


### Build the Pipeline, Fit the Model

In [7]:
# Seed for the stochastic Word2Vec stage, pinned so the fitted model is
# reproducible across kernel restarts instead of depending on the library default.
WORD2VEC_SEED = 42

# stage 1: tokenize the tweet text, splitting on non-word characters
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# stage 2: remove the stop words from the token list
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# stage 3: embed the remaining words into a vector of size 100
stage_3 = Word2Vec(
    inputCol='filtered_words',
    outputCol='vector',
    vectorSize=100,
    seed=WORD2VEC_SEED,
)
# stage 4: logistic regression on the word vector, predicting `label`
model_lg = LogisticRegression(featuresCol='vector', labelCol='label')

In [8]:
# Chain the stages in execution order:
# tokenize -> drop stop words -> embed -> classify
pipeline_stages = [stage_1, stage_2, stage_3, model_lg]
pipeline = Pipeline(stages=pipeline_stages)

# Fit every stage on the training split; the result is the fitted pipeline
# model used for prediction.
pipelineFit = pipeline.fit(training_df)

### Predict Results

In [9]:
# Run the held-out tweets through the fitted pipeline, then keep only the
# columns needed to compare predictions against the true labels.
scored_df = pipelineFit.transform(testing_df)
test_df = scored_df.select('tweet', 'prediction', 'label')

test_df.show()

+--------------------+----------+-----+
|               tweet|prediction|label|
+--------------------+----------+-----+
|the excitement of...|       0.0|    0|
|as a huge @user f...|       0.0|    0|
|immoality   #wave...|       0.0|    0|
|#repost from @use...|       0.0|    0|
| @user nice discu...|       0.0|    0|
|love them !! #che...|       0.0|    0|
|this so of shit i...|       0.0|    0|
|#travel#girl#russ...|       0.0|    0|
| @user when a for...|       0.0|    0|
|first day as a 26...|       0.0|    0|
|well, @user is fo...|       0.0|    0|
|my first comic bo...|       0.0|    0|
|spray painting ad...|       0.0|    0|
|can #lighttherapy...|       0.0|    0|
|.@user fyi since ...|       0.0|    0|
|it's really happy...|       0.0|    0|
|even when i don't...|       0.0|    0|
|@user stands watc...|       0.0|    0|
|friday feeling   ...|       0.0|    0|
| @user just 1 mor...|       0.0|    0|
+--------------------+----------+-----+
only showing top 20 rows



## Notes

Using a machine learning pipeline definitely looks simple and organized, but there are some limitations:
* It can only process Spark DataFrames.
* When I tried to apply this pipeline to Spark Streaming data, it always gave me errors related to the data schema. Since streaming data arrives as RDDs, you have to convert each batch to a DataFrame, which requires a schema, and various hard-to-understand schema issues can appear during that conversion.

## Reference
* https://www.analyticsvidhya.com/blog/2019/12/streaming-data-pyspark-machine-learning-model/