## twitter-sentiment
#### Jeff Oxenberg

* Monitor a specific topic (or set of topics) on Twitter and score the sentiment of each tweet using the VADER model
* Insert tweets into Cassandra
* Cassandra table is indexed by Solr (using DataStax Enterprise)
* Solr can be used to search for specific terms, get time-based averages, etc

In [None]:
from tweepy.streaming import StreamListener
from tweepy import Stream
from tweepy import OAuthHandler
from cassandra.cluster import Cluster
from cassandra.query import PreparedStatement
from cassandra import ConsistencyLevel
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
from urllib.request import urlopen
import json
import datetime
import re

In [None]:
# Twitter API credentials: replace each placeholder `x` with your own value.
key = x
ksecret = x
token = x
tsecret = x

# Build the OAuth handler from the consumer key/secret, then attach the
# access token/secret to it.
auth = OAuthHandler(key, ksecret)
auth.set_access_token(token, tsecret)

In [None]:
# VADER sentiment analyzer; the stream listener calls sid.polarity_scores()
# on the cleaned text of every tweet.
sid = SentimentIntensityAnalyzer()

In [None]:
# Connect to the Cassandra node on localhost and open a session bound to the
# 'tweets' keyspace (see the Data Model section for the schema).
cluster = Cluster(['localhost'])
session = cluster.connect('tweets')

In [None]:
# Prepare the INSERT once so the same statement can be reused for every tweet;
# values are bound positionally as (id, timestamp, text, weight).
insert = session.prepare("INSERT INTO tweets (id, timestamp, text, weight) VALUES (?,?,?,?)")
# Execute this statement at consistency level ONE.
insert.consistency_level = ConsistencyLevel.ONE

In [None]:
class listener(StreamListener):
    """Stream listener that scores each incoming tweet and stores it in Cassandra.

    Relies on three module-level objects: ``sid`` (the VADER analyzer),
    ``session`` (the Cassandra session) and ``insert`` (the prepared INSERT
    statement).
    """

    # Fields a stream message must carry to be stored as a tweet row.
    _REQUIRED_FIELDS = ("id", "created_at", "text")

    def on_data(self, data):
        """Score one raw stream message and insert it into Cassandra.

        Args:
            data: JSON-encoded message string delivered by the stream.

        Returns:
            True in every case, exactly as before, so the return value never
            asks the stream to stop.
        """
        d = json.loads(data)
        # Not every stream message is a tweet (e.g. delete or limit notices
        # carry no "text"/"created_at"). Skip those instead of letting a
        # KeyError escape from the callback and terminate the stream.
        if not isinstance(d, dict) or any(f not in d for f in self._REQUIRED_FIELDS):
            return True

        # Strip URLs from the tweet text before scoring it.
        ctext = re.sub(r"http\S+", "", d["text"].strip())
        sent = sid.polarity_scores(ctext)
        # created_at looks like "Tue Feb 09 17:23:29 +0000 2016"; %z keeps the
        # UTC offset so the resulting datetime is timezone-aware.
        created = datetime.datetime.strptime(d["created_at"], "%a %b %d %H:%M:%S %z %Y")
        # Row tuple: (id, created_at, text, compound sentiment in -1 to 1).
        session.execute(insert, (d["id"], created, ctext, sent["compound"]))
        return True

    def on_error(self, errcode):
        """Print the error/status code reported by the stream."""
        print(errcode)

In [None]:
# Start the stream of tweets and filter on topic: only tweets matching the
# tracked terms (here just "test") are delivered to the listener.
# NOTE(review): filter() presumably blocks this cell while the stream runs -- confirm.
streamer = Stream(auth, listener())
streamer.filter(track=["test"])

## Solr Queries
Query the Solr instance to get the aggregate (mean) sentiment for a particular date range

In [None]:
# Ask Solr for statistics on the `weight` (sentiment) field of all tweets whose
# timestamp falls inside a one-minute window. rows=0 requests no documents,
# only the stats section; wt=json selects a JSON response.
solr_url = (
    "http://172.17.136.41:8983/solr/tweets.tweets/select"
    "?q=timestamp:[\"2016-02-09T17:23:29Z\"+TO+\"2016-02-09T17:24:29Z\"]"
    "&stats=true&stats.field=weight&rows=0&wt=json"
)
# Use the response as a context manager so the HTTP connection is closed
# as soon as the body has been read (the original left it open).
with urlopen(solr_url) as response:
    cx = response.read()
d = json.loads(cx.decode())

In [None]:
# Mean of the `weight` (sentiment) field over the queried time range, read
# from the stats section of the Solr response.
d["stats"]["stats_fields"]["weight"]["mean"]

In [None]:
#add some more interesting queries

## Graphs

In [None]:
#add some interesting graphs here

## Data Model

In [None]:
# Reference only: output of `DESC tweets` in cqlsh, kept as a string literal so
# the CQL is not executed as Python. It shows the keyspace and the tweets table
# that the INSERT statement above writes to.
'''
cqlsh:tweets> DESC tweets

CREATE KEYSPACE tweets WITH replication = {'class': 'NetworkTopologyStrategy', 'Cassandra': '2', 'Solr': '1'} \
AND durable_writes = true;

CREATE TABLE tweets.tweets (
    id varint,
    timestamp timestamp,
    solr_query text,
    text text,
    weight float,
    PRIMARY KEY (id, timestamp)
) WITH CLUSTERING ORDER BY (timestamp ASC)
    AND bloom_filter_fp_chance = 0.01
    AND caching = '{"keys":"ALL", "rows_per_partition":"NONE"}'
    AND comment = ''
    AND compaction = {'class': 'org.apache.cassandra.db.compaction.SizeTieredCompactionStrategy'}
    AND compression = {'sstable_compression': 'org.apache.cassandra.io.compress.LZ4Compressor'}
    AND dclocal_read_repair_chance = 0.1
    AND default_time_to_live = 0
    AND gc_grace_seconds = 864000
    AND max_index_interval = 2048
    AND memtable_flush_period_in_ms = 0
    AND min_index_interval = 128
    AND read_repair_chance = 0.0
    AND speculative_retry = '99.0PERCENTILE';
'''

## Resources

In [None]:
Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text.\
    Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. 