In [None]:
!pip install -q tweepy vincent
import tweepy, json, nltk, re, operator, string, vincent, pandas, math, pprint
from tweepy import OAuthHandler
from nltk import bigrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter, defaultdict
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Connecting

def authorise():
    '''Prompt for the four Twitter credentials and build a tweepy API client.

    The values are read with getpass rather than input() so that they are
    not echoed back and therefore do not end up in the notebook's saved
    cell output.

    Returns:
        A tweepy.API instance authenticated through OAuthHandler.
    '''
    # Local import: keeps this cell runnable without touching the import cell.
    from getpass import getpass

    consumer_key = getpass('consumer key: ')
    consumer_secret = getpass('consumer secret: ')
    access_token = getpass('access token: ')
    access_secret = getpass('access secret: ')

    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    return tweepy.API(auth)

# Prompt for credentials once; the resulting client is kept in the
# module-level name `api`, which get_user() and the timeline download read.
api = authorise()

In [None]:
# Data Sourcing

def get_user():
    '''Ask for a Twitter username and return the matching numeric user id.

    Uses the module-level `api` client. The name is passed as the
    `screen_name` keyword: recent Tweepy releases reject positional
    arguments here, while older releases accept the keyword as well.

    Returns:
        The `id` attribute of the user object returned by api.get_user.
    '''
    username = input("Please enter the twitter username: ")
    # Tolerate pasted handles such as " @someone ".
    screen_name = username.strip().lstrip('@')
    user = api.get_user(screen_name=screen_name)
    return user.id

def process_or_store(tweet):
    '''Serialise one tweet payload to JSON text and queue it in json_store.'''
    serialised = json.dumps(tweet, indent=4)
    json_store.append(serialised)

# How many of the account's most recent statuses to download.
MAX_TWEETS = 100

user_id = get_user()
json_store = []

# `user_id=` is accepted by both old and new Tweepy releases, whereas the
# legacy `id=` keyword was dropped from user_timeline in Tweepy v4.
for status in tweepy.Cursor(api.user_timeline, user_id=user_id).items(MAX_TWEETS):
    # Keep the raw JSON payload of each status for the wrangling step.
    process_or_store(status._json)

In [None]:
# Data Wrangling

# Emoticon pattern. It is written for re.VERBOSE, so the whitespace and the
# "# ..." notes inside the string are ignored by the regex engine.
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""

# Token alternatives. They are joined with "|" below, so they are tried in
# list order at every position: the more specific patterns must come first.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # The character class used to contain the HTML-escaping artefact "&amp;".
    # Plain "&" matches exactly the same characters, because the "$-_" range
    # together with IGNORECASE already covers "a", "m", "p" and ";".
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

# One capturing group around the whole alternation, so findall() returns the
# matched token text itself.
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant: matches only when the entire string is a single emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    '''Split the string s into a list of token strings using tokens_re.'''
    # tokens_re wraps its whole alternation in a single group, so the full
    # match text is exactly what findall() would report for each hit.
    return [hit.group(0) for hit in tokens_re.finditer(s)]
 
def preprocess(s, lowercase=False):
    '''Tokenize s, optionally lower-casing every token that is not an emoticon.

    With lowercase=False the tokens are returned exactly as tokenize()
    produced them.
    '''
    tokens = tokenize(s)
    if not lowercase:
        return tokens

    normalised = []
    for tok in tokens:
        # Emoticons keep their case (":D" must not become ":d").
        is_emoticon = emoticon_re.search(tok) is not None
        normalised.append(tok if is_emoticon else tok.lower())
    return normalised

# Tokens that carry no content: English stop words, single punctuation
# characters, and a few Twitter-specific fillers.
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt','via','…']
# Set copy of `stop` for constant-time membership tests inside the loop.
stop_lookup = set(stop)

# Hashtag whose timestamps are collected. Every token is lower-cased before
# comparison, so the tag must be lower-cased too; comparing against the
# mixed-case literal could never match.
tracked_hashtag = '#PrepareToTry'.lower()

count_terms_only = Counter()   # content words only (no stop words, hashtags or mentions)
count_all = Counter()          # every lower-cased token
cooccurrence_matrix = defaultdict(lambda: defaultdict(int))
dates_PTT = []                 # 'created_at' of each tweet containing the tracked hashtag

for doc_count, document in enumerate(json_store):
    tweet = json.loads(document)

    # Tokenise once per tweet and derive the filtered views from that list.
    terms_all = [term.lower() for term in preprocess(tweet['text'])]
    terms_only = [
        term for term in terms_all
        if term not in stop_lookup and not term.startswith(('#', '@'))
    ]
    terms_hash = [term for term in terms_all if term.startswith('#')]

    if tracked_hashtag in terms_hash:
        dates_PTT.append(tweet['created_at'])

    count_all.update(terms_all)
    count_terms_only.update(terms_only)

    # Count each unordered pair of distinct content words in the tweet.
    # Pairs are stored under the alphabetically smaller word only.
    for i in range(len(terms_only)-1):
        for j in range(i+1, len(terms_only)):
            w1, w2 = sorted([terms_only[i], terms_only[j]])
            if w1 != w2:
                cooccurrence_matrix[w1][w2] += 1

In [None]:
# Data Analysis

# For every term keep its five strongest co-occurring partners, flattened
# into ((term, partner), count) records.
max_matrix = [
    ((first, second), hits)
    for first, partners in cooccurrence_matrix.items()
    for second, hits in sorted(partners.items(), key=lambda kv: kv[1], reverse=True)[:5]
]
# Rank all retained pairs by count, most frequent first, and show the top five.
terms_max = sorted(max_matrix, key=lambda record: record[1], reverse=True)
print(terms_max[:5])

# One observation of weight 1 per tweet that carried the tracked hashtag.
ones = [1 for _ in dates_PTT]
# Tweet timestamps become the index of the series.
idx = pandas.DatetimeIndex(dates_PTT)
# Series of 1s keyed by time; summing it per bucket yields tweet counts.
PTT = pandas.Series(ones, index=idx)

# Bucket the observations. Note that, despite its name, `per_hour` holds
# one-minute buckets ('1Min'); empty buckets are filled with 0.
per_hour = (
    PTT
    .resample('1Min')
    .sum()
    .fillna(0)
)

In [None]:
# Data visualisation

# Bar chart of the 20 most frequent content words.
word_freq = count_terms_only.most_common(20)

vincent.core.initialize_notebook()

if word_freq:
    labels, freq = zip(*word_freq)
    data = {'data': freq, 'x': labels}
    bar = vincent.Bar(data, iter_idx='x')
    bar.display()
else:
    # zip(*[]) yields nothing to unpack, so skip the chart instead of raising.
    print('No terms to plot.')

# Line chart of hashtag frequency over time. Plot the resampled counts:
# the raw PTT series is a constant 1 per tweet and carries no frequency
# information.
time_chart = vincent.Line(per_hour)
time_chart.axis_titles(x='Time', y='Freq')
time_chart.display()

In [None]:
# p_t[term]        : occurrences of `term` divided by the number of tweets
# p_t_com[t1][t2]  : co-occurrences of (t1, t2) divided by the number of tweets
p_t = {}
p_t_com = defaultdict(lambda : defaultdict(int))
# Use the real tweet count. `doc_count` comes from a zero-based enumerate(),
# so it is one less than the number of tweets (and undefined when there are
# none), which skewed every probability and divided by zero for one tweet.
n_docs = len(json_store)

for term, n in count_all.items():
    p_t[term] = n / n_docs
    # .get() avoids inserting an empty row into the defaultdict for terms
    # that never appear as the first element of a pair.
    for t2, joint_count in cooccurrence_matrix.get(term, {}).items():
        p_t_com[term][t2] = joint_count / n_docs

In [None]:
# Seed terms treated as carrying positive sentiment (words and emoticons).
positive_vocab = [
    'good',
    'nice',
    'great',
    'awesome',
    'outstanding',
    'fantastic',
    'terrific',
    ':)',
    ':-)',
    'like',
    'love',
]

# Seed terms treated as carrying negative sentiment (words and emoticons).
negative_vocab = [
    'bad',
    'terrible',
    'crap',
    'useless',
    'hate',
    ':(',
    ':-(',
]

In [None]:
# Pointwise mutual information: log2( P(t1, t2) / (P(t1) * P(t2)) ).
pmi = defaultdict(lambda : defaultdict(int))
for t1 in p_t:
    for t2, joint in p_t_com.get(t1, {}).items():
        denom = p_t[t1] * p_t[t2]
        score = math.log2(joint / denom)
        # Co-occurrences are stored only under the alphabetically smaller
        # term, so mirror the score. Without this, pmi[term][vocab_word] is
        # always 0 whenever `term` sorts after the vocabulary word.
        pmi[t1][t2] = score
        pmi[t2][t1] = score

# Semantic orientation: total association with the positive seed terms
# minus total association with the negative ones. Missing pairs count as 0
# (the defaultdict default).
semantic_orientation = {}
for term in p_t:
    positive_assoc = sum(pmi[term][tx] for tx in positive_vocab)
    negative_assoc = sum(pmi[term][tx] for tx in negative_vocab)
    semantic_orientation[term] = positive_assoc - negative_assoc

semantic_sorted = sorted(semantic_orientation.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
# Highest scores first: the head is the most positive, the tail the most negative.
top_pos = semantic_sorted[:10]
top_neg = semantic_sorted[-10:]

print(top_pos)
print(top_neg)
