In [1]:
from src.spark import Spark
import src.tweet_volume as funcs
import matplotlib.pyplot as plt
from pyspark.sql import functions as fs
import os 
import pandas as pd
from src.plotting import double_plot
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import src.nlp.clean as clean
from pyspark.sql.types import StringType, IntegerType, StructField, StructType

In [2]:
# Build the project's Spark wrapper and take the session object from it;
# `sess` is the handle the later cells use to read data.
# NOTE(review): 'load' and 'local' look like the application name and the
# master URL -- confirm against src.spark.Spark.
spark = Spark('load', 'local')
sess = spark.session()

In [4]:
# Location of the raw tweet JSON files. The default is the original
# machine-specific path; set TWEET_DATA_GLOB to run the notebook on another
# machine without editing this cell.
tweet_glob = os.environ.get('TWEET_DATA_GLOB') or '/cs/home/ls99/PycharmProjects/Spark/data/*.json'

# Read every matching file with the project's tweet schema, then apply the
# project's timestamp-parsing step to the loaded frame.
df = funcs.load_dataframe(sess, tweet_glob, funcs.schema)
df2 = funcs.parse_timestamp(df)

In [5]:
# Keep tweets whose lower-cased text contains "eth".
# The previous filter OR-ed three patterns (%ether%, %eth%, %ethereum%), but
# anything containing "ether" or "ethereum" already contains "eth", so a single
# pattern selects exactly the same rows. The column is also taken from `df2`,
# the frame actually being filtered, instead of from its parent `df`.
# NOTE(review): a bare substring match also keeps words such as "method",
# "together" or "something"; tighten the pattern if only Ethereum mentions
# are wanted.
eth_df = df2.filter(fs.lower(df2['text']).like("%eth%"))


In [6]:

# Run the project's tweet cleaner over the "text" column of the filtered tweets.
eth_clean = clean.clean_tweets(eth_df, "text")

# Split into train / test / eval at 98% / 1% / 1%.
# The weights used to be [0.98, 0.1, 0.1]; they sum to 1.18 and Spark
# normalises weights that do not sum to 1, so the real split was roughly
# 83% / 8.5% / 8.5%. A fixed seed makes the split repeatable between runs.
eth_train, eth_test, eth_eval = eth_clean.randomSplit([0.98, 0.01, 0.01], seed=42)

In [7]:
# Show the text of the first row of the training split as a quick sanity check.
eth_train.head().text

u' #ethereum #eth'

In [3]:
# Explicit schema for the labelled sentiment CSV: an integer `target` column
# (used as the class label further down) and the tweet `text`. Both nullable.
sent_schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("text", StringType(), True)
])

# Location of the labelled training data. The default is the original
# machine-specific path; set SENTIMENT_TRAIN_CSV to point the notebook at a
# copy elsewhere without editing this cell.
train_csv = os.environ.get('SENTIMENT_TRAIN_CSV') or "/cs/unique/ls99-kf39-cs5052/train/train.csv"

# The file has a header row; the schema above is applied instead of inferring one.
train_df = sess.read.csv(train_csv, header=True, schema=sent_schema)


In [4]:

# Show the first labelled row to confirm the CSV was parsed with the schema.
train_df.head()

Row(target=0, text=u'                     is so sad for my APL friend.............')

In [5]:
# Run the project's tweet cleaner over the "text" column of the labelled data.
# NOTE(review): judging by the recorded outputs in this notebook, cleaning
# lower-cases the text and strips surrounding whitespace and trailing
# punctuation -- confirm against src.nlp.clean.clean_tweets.
train_clean = clean.clean_tweets(train_df, "text")


u'is so sad for my apl friend'

In [6]:
# Split the cleaned, labelled sentiment data into train / test / eval at
# 98% / 1% / 1%.
# The weights used to be [0.98, 0.1, 0.1]; they sum to 1.18 and Spark
# normalises weights that do not sum to 1, so the real split was roughly
# 83% / 8.5% / 8.5%. A fixed seed makes the split, and therefore the reported
# metrics, repeatable between runs.
# NOTE: despite the `eth_` prefix these variables now hold labelled sentiment
# data; they replace the Ethereum-tweet splits assigned earlier in the notebook.
eth_train, eth_test, eth_eval = train_clean.randomSplit([0.98, 0.01, 0.01], seed=42)

In [7]:
# Feature stages: split text into words, hash the words into a 2**12-wide
# term-frequency vector, re-weight with IDF (terms must appear in at least
# 5 documents), and index the `target` column into a `label` column.
tokeniser = Tokenizer(outputCol="words", inputCol="text")
hashtf = HashingTF(inputCol="words", outputCol="tf", numFeatures=2**12)
idf = IDF(minDocFreq=5, inputCol="tf", outputCol="features")
string_index = StringIndexer(outputCol="label", inputCol="target")
pipeline = Pipeline(
    stages=[
        tokeniser,
        hashtf,
        idf,
        string_index,
    ]
)

# Fit the feature stages on the training split only, then apply the fitted
# stages to both the training and the evaluation split.
eth_model = pipeline.fit(eth_train)
eth_train_df = eth_model.transform(eth_train)
eth_eval_df = eth_model.transform(eth_eval)

# Train a logistic-regression sentiment classifier and score the evaluation split.
lr = LogisticRegression().setMaxIter(100)
classifier = lr.fit(eth_train_df)
predictions = classifier.transform(eth_eval_df)

# Evaluator score over the raw predictions (printed below as "RocAuc").
report = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
bin_rep = report.evaluate(predictions)

# Accuracy = rows whose predicted class equals the label / all evaluation rows.
# float() keeps this a true division under Python 2 as well.
accuracy = (
    predictions.where(fs.col("label") == fs.col("prediction")).count()
    / float(eth_eval_df.count())
)

print("RocAuc: %.4f, Accuracy: %.4f" % (bin_rep, accuracy))