# Evaluation for score calculation
1. request

In [1]:
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter


import ConfigParser
import matplotlib.pyplot as plt
import networkx as nx
import sys
import time
import re
import io
import os
import numpy as np
from TwitterAPI import TwitterAPI

In [2]:
def tokenize(text):
    text = re.sub('http\S+', ' ', text)
    text = re.sub('@\S+', ' ', text)
    text = text.lower()
    text = text.replace('rt','')
    return re.sub('\W+', ' ', text).split()


In [3]:
def get_training_files(path):
    """ Return a list of file names in this directory that end in .txt 
    The list should be sorted alphabetically by file name.
    Params:
        path....a directory containing .txt review files.
    Returns:
        a list of .txt file names, sorted alphabetically.
    """
    ###TODO
    files = []
    for (dirpath, dirnames, filenames) in os.walk(path):
        for fn in filenames:
            if fn.endswith(".txt"):
                files.append(os.path.join(dirpath,fn))
    return sorted(files)
    ###

In [12]:
files = get_training_files('data')

training_tweets = []
for fname in files:
    f = open(fname, 'r')
    for line in f:
        toks = line.lower().rstrip('\n').split('\t')
        training_tweets.append(toks)
print 'From',len(files),'files'
print len(training_tweets),'tweets have been read'
print '- ',len([t for t in training_tweets if t[0]=='-1']),' negative tweets'
print '- ',len([t for t in training_tweets if t[0]=='1']),' positive tweets'

From 8 files
504 tweets have been read
-  251  negative tweets
-  253  positive tweets


In [13]:
train_texts = np.array([tweet[2].replace(tweet[1],'') for tweet in training_tweets])
labels = np.array([tweet[0] for tweet in training_tweets])

In [14]:
def get_clf(c=1, penalty='l2'):
    return LogisticRegression(random_state=42, C=c, penalty=penalty)

In [15]:
def do_vec(texts):
    global tokenize
    vec = CountVectorizer(input='content',tokenizer=tokenize, min_df=2, max_df=.7, binary=True, ngram_range=(1,1))
    X = vec.fit_transform(texts)
    return X, vec

In [16]:
def do_cross_validation(X, y, n_folds=5, verbose=False, c=1):
    """
    Perform n-fold cross validation, calling get_clf() to train n
    different classifiers. Use sklearn's KFold class: http://goo.gl/wmyFhi
    Be sure not to shuffle the data, otherwise your output will differ.
    Params:
        X.........a csr_matrix of feature vectors
        y.........the true labels of each document
        n_folds...the number of folds of cross-validation to do
        verbose...If true, report the testing accuracy for each fold.
    Return:
        the average testing accuracy across all folds.
    """
    ###TODO
    kf = KFold(len(y), n_folds=n_folds)
    fold_number = 0
    accuracies = []
    for train_index, test_index in kf:
        clf = get_clf(c=c)
        clf.fit(X[train_index], y[train_index])
        predicted = clf.predict(X[test_index])
        acc = accuracy_score(y[test_index], predicted)
        accuracies.append(acc)
        if verbose:
            print "fold "+str(fold_number)+" accuracy="+str(acc)
        fold_number += 1
    return np.mean(accuracies)
    ###
    
X, vec = do_vec(train_texts)
cs = [.00001,.0001,.001,.01,.09, .1,.5, 1, 5, 10, 1000]
# cs = [1]
for c in cs:
    print 'when c =',c,', average cross validation accuracy=',do_cross_validation(X, labels, verbose=False, c=c)

when c = 1e-05 , average cross validation accuracy= 0.650871287129
when c = 0.0001 , average cross validation accuracy= 0.650871287129
when c = 0.001 , average cross validation accuracy= 0.650871287129
when c = 0.01 , average cross validation accuracy= 0.682574257426
when c = 0.09 , average cross validation accuracy= 0.722178217822
when c = 0.1 , average cross validation accuracy= 0.724178217822
when c = 0.5 , average cross validation accuracy= 0.74798019802
when c = 1 , average cross validation accuracy= 0.74596039604
when c = 5 , average cross validation accuracy= 0.74203960396
when c = 10 , average cross validation accuracy= 0.732099009901
when c = 1000 , average cross validation accuracy= 0.684475247525


In [18]:
def get_twitter(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter_wii.cfg')
print('Established Twitter connection.')

def robust_request(twitter, resource, params, max_tries=5):
    for i in range(max_tries):
        request = twitter.request(resource, params)
        if request.status_code == 200:
            return request
        else:
            print >> sys.stderr, 'Got error:', request.text, '\nsleeping for 15 minutes.'
            sys.stderr.flush()
            time.sleep(61 * 15)

def get_info(bn):
    request = robust_request(twitter, 'search/tweets', {'q': bn, 'count':100, 'lang':'en'})
    return {'location':1111,'city':'abc','tweets':request}


Established Twitter connection.


In [19]:
# Download the AFINN lexicon, unzip, and read the latest word list in AFINN-111.txt
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen

url = urlopen('http://www2.compute.dtu.dk/~faan/data/AFINN.zip')
zipfile = ZipFile(StringIO(url.read()))
afinn_file = zipfile.open('AFINN/AFINN-111.txt')

afinn = dict()

for line in afinn_file:
    parts = line.strip().split()
    if len(parts) == 2:
        afinn[parts[0]] = int(parts[1])

def afinn_sentiment(terms, afinn, verbose=False):
    pos = 0
    neg = 0
    for t in terms:
        if t in afinn:
            if verbose:
                print '\t%s=%d' % (t, afinn[t])
            if afinn[t] > 0:
                pos += afinn[t]
            else:
                neg += -1 * afinn[t]
    return (pos, neg)

In [20]:
clf = get_clf(c=1.)
X, vec = do_vec(train_texts)
clf.fit(X, labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0)

In [21]:
def get_AFINN_prediction(texts):
    res = []
    for i in range(len(texts)):
        terms = tokenize(texts[i])
        afinn_score = afinn_sentiment(terms, afinn)
        norm_afinn_score = 0.
        if afinn_score[0]+afinn_score[1] != 0:
            norm_afinn_score = float(afinn_score[0]-afinn_score[1])/float(afinn_score[0]+afinn_score[1])
        res.append(norm_afinn_score)
    return np.array(res)

In [22]:
results = get_AFINN_prediction(train_texts)
results = np.array(['1' if i>0. else '-1' for i in results])

print 'Accuracy score for AFINN is', accuracy_score(labels,results)

Accuracy score for AFINN is 0.769841269841


In [23]:
def get_prediction(texts):
    nTweets = len(texts)
    X = vec.transform(texts)
    clf_predicts = clf.predict(X)
    AFINN_predicts = get_AFINN_prediction(texts)
    
    avg_predicts = [(float(clf_predicts[i])+float(AFINN_predicts[i]))/2. for i in range(nTweets)]
    return np.array(avg_predicts) 

In [24]:
def get_popular_words(texts):
    c = Counter()
    for t in texts:
        words = tokenize(t)
        c.update(words)
    return c.most_common()

In [25]:
def get_popular_hashtag(texts):
    c = Counter()
    pat = re.compile(r"#(\w+)")
    for t in texts:
        hasgtags = pat.findall(t)
        c.update(hasgtags)
    return c.most_common()

In [26]:
def get_popular_score(newest_tweet, oldest_tweet, nTweets):
    ts1 = time.mktime(time.strptime(oldest_tweet,'%a %b %d %H:%M:%S +0000 %Y'))
    ts2 = time.mktime(time.strptime(newest_tweet,'%a %b %d %H:%M:%S +0000 %Y'))
    diff = ts2-ts1
    rate = diff/nTweets #seconds per tweet

    # less than 10 seconds per tweet
    if rate <= 10. : 
        score = 1.
    # less than 30 seconds per tweet
    elif rate <= 30.:
        score = .95
    # less than 1 minute per tweet
    elif rate <= 60.:
        score = .9
    # less than 10 minutes per tweet
    elif rate <= 600 :
        score = .85
    # less than 1 hour per tweet
    elif rate <= 3600 :
        score = .75
    # less than 1 day per tweet
    elif rate <= 3600*24 :
        score = .7
    # less than 1 week per tweet
    elif rate <= 3600*24*7 :
        score = .65
    # less than 1 month per tweet
    elif rate <= 3600*24*30 :
        score = .55
    else:
        score = .2
        
    if nTweets < 100:
        score *= .5
    
    return score


In [28]:
def get_reputation_score(info):
    location = info['location']
    city = info['city']
    tweets = info['tweets']
    
    tweet_texts = [t['text'] for t in tweets]
    avg_predicts = get_prediction(tweet_texts)
    
    tweet_nRT = [t['retweet_count'] for t in tweets]
    tweet_likes = [t['favorite_count'] for t in tweets]
    users = [t['user'] for t in tweets]
    created_times = [t['created_at'] for t in tweets]
    nTweets = len(tweet_texts)
    sum_weight = 0.

    for i in range(nTweets):
        rt_count = tweet_nRT[i]
        like_count = tweet_likes[i]
        follower_count = users[i]['followers_count']
        mult = 1.
        
        if rt_count > 50 and rt_count <= 200 :
            mult *= 3.
        elif rt_count > 200 and rt_count <= 1000 :
            mult *= 5.
        elif rt_count > 1000 :
            mult *= 10.
        
        if like_count > 50 and like_count <= 200 :
            mult *= 2.
        elif like_count > 200 and like_count <= 1000 :
            mult *= 3.
        elif like_count > 1000 :
            mult *= 4.
            
        if follower_count > 500 and follower_count <= 5000 :
            mult *= 3.
        elif follower_count > 5000 and follower_count <= 50000 :
            mult *= 5.
        elif follower_count > 50000:
            mult *= 10
        
        sum_weight += mult
        avg_predicts[i] = avg_predicts[i]*mult

    senti_score = sum(avg_predicts)/sum_weight

    # Normalize to [0,1]
    senti_score = (senti_score + 1)/2.
    
    if nTweets > 1:
        pop_score = get_popular_score(created_times[0], created_times[nTweets-1], nTweets)
    else :
        pop_score = 0.
    
    # Ratio between sentiment score : popular score = 1:2
    score = (1.*senti_score + 2.* pop_score)/3.
    
    return score, get_popular_hashtag(tweet_texts)

In [30]:
brandnames = ['google', 'apple', 'ikea', 'fedex','Heineken', 'Toshiba', 'Carlsberg','usps']

for b in brandnames:
    inf = get_info(b)
    score, hashtags = get_reputation_score(inf)
    print b,"got",str(score)

google got 0.873938158342
apple got 0.824780028647
ikea got 0.859856237817
fedex got 0.752795841101
Heineken got 0.805586305863
Toshiba got 0.825823045267
Carlsberg got 0.798347701149
usps got 0.723977253121


In [None]:
# brandnames = ['usps', 'ups', 'dhl']
# f = open('traning_data8.txt','w')
# for b in brandnames:
#     request = robust_request(twitter, 'search/tweets', {'q': b, 'count':100, 'lang':'en'})
#     for tweet in request:
#         text = b+'\t'+tweet['text']+'\n'
#         text = text.encode('utf8')
#         f.write(str(text))
        