In [1]:
from src.spark import Spark
import src.tweet_volume as funcs
import matplotlib.pyplot as plt
from pyspark.sql import functions as fs
from src.plotting import double_plot
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel
import src.nlp.clean as clean
from pyspark.sql.types import StringType, IntegerType, StructField, StructType
from src.nlp.sentiment import SentimentAnalyser


In [2]:
spark = Spark('load', 'local')
sess = spark.session()

In [3]:
df = funcs.load_dataframe(sess, '/cs/home/ls99/PycharmProjects/Spark/data/*.json', funcs.schema)
df2 = funcs.parse_timestamp(df)
eth_df = df2.filter(fs.lower(df['text']).like("%ether%") | fs.lower(df['text']).like("%eth%") | fs.lower(df['text']).like("%ethereum%"))
eth_clean = clean.clean_tweets(eth_df, "text")

eth_clean.first().text

u'rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain '

In [None]:
sent_schema = StructType([
    StructField("target", IntegerType(), True),
    StructField("text", StringType(), True)
])

train_df = sess.read.csv("/cs/unique/ls99-kf39-cs5052/train/train.csv", header=True, schema=sent_schema)

train_clean = clean.clean_tweets(train_df, "text")
train_clean.first()

In [16]:
train, eval = train_clean.randomSplit([0.99, 0.1], seed=42)
print("Number of tweets in train: {:,}".format(train.count()))

Number of tweets in train: 1,434,200


In [4]:
pipeline = "/cs/unique/ls99-kf39-cs5052/models/pipeline"
classifier = "/cs/unique/ls99-kf39-cs5052/models/linreg"

sentiment_model = SentimentAnalyser()
sentiment_model.load(pipeline, classifier)

In [5]:
eth_pred = sentiment_model.predict(eth_clean)

positive = eth_pred.filter(eth_pred.prediction == sentiment_model.positive).count()
negative = eth_pred.filter(eth_pred.prediction == sentiment_model.negative).count()

print("Ether tweets positive: {:,}, negative: {:,}".format(positive, negative))

Ether tweets positive: 13,369, negative: 1,022
