## twitter-sentiment
#### Jeff Oxenberg

* Monitor a specific topic/set of topics on twitter, score sentiment of tweets using the vader model
* Insert tweets into Cassandra
* Cassandra table is indexed by Solr (using DataStax Enterprise)
* Solr can be used to search for specific terms, get time-based averages, etc

In [2]:
from tweepy.streaming import StreamListener
from tweepy import Stream, OAuthHandler
from cassandra.cluster import Cluster
from cassandra.query import PreparedStatement
from cassandra import ConsistencyLevel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from urllib.request import urlopen
import json
import datetime
import re
import pandas as pd
from pandas.io.json import json_normalize
from bokeh.io import output_notebook, show
from bokeh.plotting import figure

In [3]:
key=""
ksecret=""
token=""
tsecret=""

#authenticate
auth = OAuthHandler(key, ksecret)
auth.set_access_token(token, tsecret)

In [4]:
sid = SentimentIntensityAnalyzer()

In [5]:
cluster = Cluster(['172.17.136.41'])
session = cluster.connect('tweets')

ERROR:cassandra.connection:Closing connection <AsyncoreConnection(198025960) 172.17.136.41:9042> due to protocol error: code=000a [Protocol error] message="Invalid or unsupported protocol version: 4"


In [6]:
insert = session.prepare("INSERT INTO tweets (id, timestamp, text, weight) VALUES (?,?,?,?)")
insert.consistency_level = ConsistencyLevel.ONE

In [7]:
class listener(StreamListener):

    def on_data(self, data):
        d = json.loads(data)
        try:
            ctext = re.sub(r"http\S+", "", d[u"text"].strip())
            sent = sid.polarity_scores(ctext)
            #tuple (id,created_at,text,sentiment(-1 to 1))
            x = (d["id"],datetime.datetime.strptime(d["created_at"], "%a %b %d %H:%M:%S %z %Y"),ctext,sent["compound"])
            #print(x)
            session.execute(insert, x)
            return(True)
        except Exception:
            pass

    def on_error(self, errcode):
        print(errcode)

In [None]:
#start the stream of tweets, filter on topic
while True:
    try:
        streamer = Stream(auth, listener())
        streamer.filter(track=["clinton", "sanders", "bernie", "hillary"])
    except Exception:
        pass

## Solr Queries
Query the solr instance to get aggregate sentiment for a particular date range

In [8]:
cx = urlopen("http://172.17.136.41:8983/solr/tweets.tweets/select?q=text:(bernie+OR+sanders)&timestamp:[\"2016-02-12T02:00:00Z\"+TO+\"2016-02-12T04:20:00Z\"]\
&stats=true&stats.field=weight&rows=0&wt=json").read()
d = json.loads(cx.decode())

In [9]:
d["stats"]["stats_fields"]["weight"]["mean"]

0.03541715942420903

In [10]:
cx = urlopen("http://172.17.136.41:8983/solr/tweets.tweets/select?q=text:(bernie+OR+sanders)&facet=true&facet.date=timestamp&facet.date.start=\
2016-02-12T03:10:00Z&facet.date.end=2016-02-12T04:10:00Z&facet.date.gap=%2B10MINUTE&stats=true&stats.field=weight&rows=0&wt=json").read()
d = json.loads(cx.decode())
cx1 = urlopen("http://172.17.136.41:8983/solr/tweets.tweets/select?q=text:(hillary+OR+clinton)&facet=true&facet.date=timestamp&facet.date.start=\
2016-02-12T03:10:00Z&facet.date.end=2016-02-12T04:10:00Z&facet.date.gap=%2B10MINUTE&stats=true&stats.field=weight&rows=0&wt=json").read()
d1 = json.loads(cx1.decode())
#http://172.17.136.41:8983/solr/tweets.tweets/select?q=text:*&facet=true&facet.range={!tag=r1}timestamp&facet.range.start=2016-02-12T03:10:00Z&facet.range.end=2016-02-12T04:10:00Z&facet.range.gap=%2B10MINUTE&facet.pivot={!range=r1}weight&stats=true&stats.field=weight&rows=0&wt=json

## Graphs

In [11]:
output_notebook()

In [12]:
df = json_normalize(d["facet_counts"]["facet_dates"]["timestamp"])
df = df.T
df = df.drop(['end', 'gap', 'start'])
df.columns=["mentions"]
df = df.reset_index()
df1 = json_normalize(d1["facet_counts"]["facet_dates"]["timestamp"])
df1 = df1.T
df1 = df1.drop(['end', 'gap', 'start'])
df1.columns=["mentions"]
df1 = df1.reset_index()

In [13]:
p = figure(width=400, height=400, x_axis_type="datetime", title="Mentions")
p.line(pd.to_datetime(df["index"]), df["mentions"], legend="sanders")
p.line(pd.to_datetime(df1["index"]), df1["mentions"], legend="clinton", color="red")
p.legend.location = "top_left"
show(p)

## Data Model

In [None]:
'''
cqlsh:tweets> DESC tweets

CREATE KEYSPACE tweets WITH replication = {'class': 'NetworkTopologyStrategy', 'Cassandra': '1', 'Solr': '1'} \
AND durable_writes = true;

CREATE TABLE tweets.tweets (
    id varint,
    timestamp timestamp,
    solr_query text,
    text text,
    weight float,
    PRIMARY KEY (id, timestamp)
) WITH CLUSTERING ORDER BY (timestamp ASC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
    AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
'''

## Resources

In [None]:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.\
    Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. 