In [2]:
from src.spark import Spark
import pandas as pd
import src.tweet_volume as funcs
import matplotlib.pyplot as plt
from pyspark.sql import functions as fs
from src.plotting import double_plot
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegressionModel
import src.nlp.clean as clean
from pyspark.sql.types import StringType, IntegerType, StructField, StructType
from src.nlp.sentiment import SentimentAnalyser


In [3]:
# Create the project's Spark wrapper and obtain the session used by every
# later cell. NOTE(review): the two positional arguments look like an
# application name and a master/mode selector -- confirm against src.spark.
spark = Spark("load", "local")
sess = spark.session()

In [4]:
# Glob covering every raw tweet JSON file on the shared volume.
TWEETS_GLOB = '/cs/unique/ls99-kf39-cs5052/data/tweets/*.json'

# Load the tweets with the schema declared in src.tweet_volume, then pass the
# frame through parse_timestamp (presumably adds a parsed time column that the
# later per-day aggregation relies on -- confirm in src.tweet_volume).
df = funcs.load_dataframe(sess, TWEETS_GLOB, funcs.schema)
df2 = funcs.parse_timestamp(df)


u'rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain '

In [5]:
# Schema of the labelled sentiment CSV: a nullable integer `target` column
# followed by a nullable string `text` column.
sent_schema = (
    StructType()
    .add("target", IntegerType(), True)
    .add("text", StringType(), True)
)

# The CSV has a header row; the explicit schema avoids type inference.
train_df = sess.read.csv(
    "/cs/unique/ls99-kf39-cs5052/train/train.csv",
    header=True,
    schema=sent_schema,
)

# Run the same cleaning routine on the `text` column that is applied to the
# crawled tweets, and display the first cleaned row as a sanity check.
train_clean = clean.clean_tweets(train_df, "text")
train_clean.first()

Row(target=0, text=u'is so sad for my apl friend')

In [6]:
# NOTE(review): these weights sum to 1.09, not 1.0. Spark normalises them, so
# the effective split is roughly 90.8% / 9.2%. They are left untouched on
# purpose: the persisted model loaded below was presumably fitted on exactly
# this partition (same weights, same seed), and changing either would move
# rows between the training and evaluation sets -- confirm before "fixing".
split_weights = [0.99, 0.1]
train, evaluation = train_clean.randomSplit(split_weights, seed=42)

train_size = train.count()
print("Number of tweets in train: {:,}".format(train_size))

Number of tweets in train: 1,434,200


In [7]:
sentiment_model = SentimentAnalyser()

# Locations of the persisted feature pipeline and classifier.
# NOTE(review): the classifier directory is called "linreg" while the notebook
# imports LogisticRegressionModel -- the name is probably just a misnomer, but
# confirm which model type was saved there.
pipeline = "/cs/unique/ls99-kf39-cs5052/models/pipeline"
classifier = "/cs/unique/ls99-kf39-cs5052/models/linreg"

# Restore both saved stages into the analyser instead of retraining.
sentiment_model.load(pipeline, classifier)

In [8]:
# Score the held-out split with the loaded model and report its quality
# (the recorded cell output shows ROC AUC and accuracy).
pred = sentiment_model.predict(evaluation)
sentiment_model.classification_report(pred)

# Split the predictions into their positive and negative parts; the pair is
# unpacked into `pos` and `neg`.
sentiment_counts = sentiment_model.count_sentiments(pred)
pos, neg = sentiment_counts

RocAuc: 0.8661, Accuracy: 0.7906


Tweets positive: 77,399, negative: 67,016


In [10]:
# Coins to analyse, and for each one the SQL LIKE patterns (matched against the
# lower-cased tweet text) that mark a tweet as being about that coin.
# NOTE(review): several patterns are very broad -- '%eth%' also matches
# "ethereum classic", '%etc%' matches the word "etc", '%dash%' matches
# "dashboard" -- so the per-coin subsets overlap and contain noise.
alts = ['ETH', 'BTC', 'XMR', 'DASH', 'LTC', 'ETC', 'BCH']
hashes = {
    'ETH': ['%ethereum%', '%ether%', '%eth%'],
    'BTC': ['%bitcoin%', '%btc%', '%bitcoin%'],
    'XMR': ["%monero%", "%xmr%", "%monero%"],
    'DASH': ["%digital cash%", "%dash%", "%dash%"],
    'LTC': ["%litecoin%", "%ltc%", "%litecoin%"],
    'ETC': ["%ethereum classic%", "%etc%", "%eth classic%"],
    'BCH': ["%bitcoincash%", "%bch%", "%bitcoin cash%"]
}

# Loop-invariant: resolve the column by name so the filter refers to df2's own
# `text` column rather than to a column object borrowed from `df`.
text_lower = fs.lower(fs.col("text"))

for alt in alts:
    # OR together every pattern configured for this coin (any number of them).
    patterns = hashes[alt]
    is_about_coin = text_lower.like(patterns[0])
    for pattern in patterns[1:]:
        is_about_coin = is_about_coin | text_lower.like(pattern)

    coin_df = df2.filter(is_about_coin)
    coin_clean = clean.clean_tweets(coin_df, "text")

    # first() returns None on an empty frame; skip coins with no tweets
    # instead of crashing on `.text`.
    sample = coin_clean.first()
    if sample is None:
        print("No tweets matched {}".format(alt))
        continue
    print(sample.text)

    # Count sentiments on THIS coin's predictions. (The previous code passed
    # `pred`, the evaluation-set predictions, so every coin got the same
    # unrelated result.)
    coin_pred = sentiment_model.predict(coin_clean)
    pos, neg = sentiment_model.count_sentiments(coin_pred)

    # Assumes aggregate_by_day returns a pandas frame with `date` and `count`
    # columns -- TODO confirm in src.tweet_volume.
    daily_pos = funcs.aggregate_by_day(pos).set_index("date")
    daily_neg = funcs.aggregate_by_day(neg).set_index("date")
    daily_pos.index = pd.to_datetime(daily_pos.index)
    daily_neg.index = pd.to_datetime(daily_neg.index)

    # Use ['count']: attribute access (`.count`) yields the DataFrame.count
    # method, not the column. Building from the two Series aligns them on the
    # union of dates; a day missing from one side means zero tweets there.
    daily_sent = pd.DataFrame({
        'pos': daily_pos['count'],
        'neg': daily_neg['count'],
    }).fillna(0)

    # Net sentiment in [-1, 1]. astype(float) forces true division element-wise
    # (float() on a whole Series raises TypeError).
    daily_total = (daily_sent['pos'] + daily_sent['neg']).astype(float)
    daily_sent['sentiment'] = (daily_sent['pos'] - daily_sent['neg']) / daily_total

    #### TODO: Fetch coin price data, combine with sentiment score and plot.
    

rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 


rt @sisafund an articles is being posted by #sisa #presale #ico #eth #btc #fintech #blockchain 
