In [1]:
import os
import re
import urllib.request
import numpy as np
import pandas as pd
from zipfile import ZipFile
from bs4 import BeautifulSoup

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, rand, when, col
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import CountVectorizer, RegexTokenizer


spark = SparkSession.builder.appName("MyApp")\
    .config("spark.jars.packages", "com.microsoft.ml.spark:mmlspark_2.11:1.0.0-rc1")\
    .config('spark.executor.memory', '8g')\
    .getOrCreate()

from mmlspark.vw import VowpalWabbitClassifier
from mmlspark.train import ComputeModelStatistics
from pyspark.mllib.evaluation import BinaryClassificationMetrics
import matplotlib.pyplot as plt

%matplotlib inline

In [3]:
TRAIN_DATA_PATH = "/home/haitien/Desktop/TwitterSentimentAnalysis_BigData20191/data/training.1600000.processed.noemoticon.csv"
TEST_DATA_PATH = "/home/haitien/Desktop/TwitterSentimentAnalysis_BigData20191/data/testdata.manual.2009.06.14.csv"

In [4]:

# Folder for storing the downloaded data
DATA_FOLDER = "data"
# Data column names
COL_NAMES = ["label", "id", "date", "query_string", "user", "text"]
# Text encoding type of the data
ENCODING = "iso-8859-1"

In [None]:
df_train = pd.read_csv(TRAIN_DATA_PATH, 
                       header=None, names=COL_NAMES, encoding=ENCODING)


In [None]:
df_train = spark.createDataFrame(df_train, verifySchema=False)

In [None]:
df_train.limit(10).toPandas()

In [None]:
print("Number of training samples: ", df_train.count())

In [None]:
df_train = df_train.orderBy(rand()) \
                   .limit(100000) \
                   .withColumn("label", when(col("label") > 0, 1.0).otherwise(0.0)) \
                   .select(["label", "text"])

In [None]:
# Specify featurizers
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words")

# from pyspark.ml.feature import Tokenizer, HashingTF
# tokenizer = Tokenizer(inputCol="text", outputCol="words")
# hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")

count_vectorizer = CountVectorizer(inputCol="words",
                                   outputCol="features")

# Define VW classification model
args = "--loss_function=logistic --quiet --holdout_off"
vw_model = VowpalWabbitClassifier(featuresCol="features", 
                                  labelCol="label", 
                                  args=args, 
                                  numPasses=1)

# Create a pipeline
vw_pipeline = Pipeline(stages=[tokenizer, count_vectorizer, vw_model])

In [None]:
vw_trained = vw_pipeline.fit(df_train)

In [6]:
df_test = pd.read_csv(TEST_DATA_PATH, 
                       header=None, names=COL_NAMES, encoding=ENCODING)
df_test = spark.createDataFrame(df_test, verifySchema=False)

In [2]:
from pyspark.ml import PipelineModel
model4 = PipelineModel.read().load("saved_model/model4")

In [7]:
predictions = model4.transform(df_test)

In [9]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
evaluator.evaluate(predictions)

0.8382878363869971