# BRAND REPUTATION CALCULATOR

# Instructions
The user only needs to edit and run the last cell. However, as in any IPython Notebook, the cells above it must be run first, because they declare the functions and train the system.

The program works as follows, using the last cell:
    - Enter the name of the brand
    - Choose between a list of several cities and a general reputation (there are examples of both)
    - Run the cell and look at the results
    
The graphs are interactive, so the user can zoom in, see more information by hovering over a bubble, etc.


In [1]:
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from collections import Counter

from IPython.core.display import display, HTML
import ConfigParser
import matplotlib.pyplot as plt
import networkx as nx
import sys
import re
import io
import os
import ConfigParser
from TwitterAPI import TwitterAPI
from geopy.geocoders import Nominatim
import math as m
import plotly
import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
import six
import random
import time
import numpy as np
# Authenticate against the Plotly cloud service so that py.iplot() can be used.
# NOTE(review): the username and API key are hard-coded in the notebook, so
# anyone who can read this file can use them. Consider loading them from a
# config file or environment variables instead.
py.sign_in('jiranun1989','6eacuk3zt3')

In [2]:
%%javascript
IPython.OutputArea.auto_scroll_threshold = 9999;

<IPython.core.display.Javascript object>

# Tweets search

In [3]:
def get_twitter(config_file):
    """Build a TwitterAPI client from credentials stored in a config file.

    Args:
      config_file ... Path of an INI-style file with a [twitter] section
                      holding consumer_key, consumer_secret, access_token
                      and access_token_secret.
    Returns:
      A TwitterAPI object created with those four values, in that order.
    """
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    option_names = ('consumer_key', 'consumer_secret',
                    'access_token', 'access_token_secret')
    credentials = [parser.get('twitter', name) for name in option_names]
    return TwitterAPI(*credentials)

def request_by_location(twitter, r, numtweets):
    """Search English tweets about a brand, optionally restricted to a city.

    Tweets are requested in pages of at most 100. After each successful
    page the next request asks only for tweets older than the oldest one
    already received (``max_id``), so consecutive pages do not overlap.
    If a request fails, the error is written to stderr and the function
    sleeps for 15 minutes before going on; a failed attempt still uses up
    one of the ceil(numtweets / 100) page attempts.

    Args:
      twitter ..... A TwitterAPI object.
      r ........... A dict with the keys 'brand' (the search query) and
                    'city' (a city name, or 'general' for a search without
                    a geographic filter).
      numtweets ... How many tweets will be requested in total.
    Returns:
      A dictionary with the keys 'brand', 'city', 'location' (a
      (latitude, longitude) tuple, or None if 'general' could not be
      geocoded) and 'tweets' (the list of tweets received).
    Raises:
      ValueError if a specific city cannot be geocoded.
    """
    max_page = 100
    brand = r['brand']
    city = r['city']

    # Get location of the city
    geolocator = Nominatim()
    location = geolocator.geocode(city, timeout=10)
    if location is None:
        # The geocoder returns None when it finds nothing; only the
        # 'general' pseudo-city can proceed without coordinates.
        if city != 'general':
            raise ValueError('Could not geocode city: %r' % (city,))
        coordinates = None
    else:
        coordinates = (location.latitude, location.longitude)

    result = {
        'tweets': [],
        'location': coordinates,
        'city': city,
        'brand': brand,
    }

    # Conform the query parameters
    params = {'q': brand, 'lang': 'en'}
    if city != 'general':
        params['geocode'] = '%s,%s,20mi' % (location.latitude,
                                            location.longitude)

    remaining = int(numtweets)
    # Tweet ids are 64-bit integers; keep them as ints, because a float
    # cannot represent every such id exactly.
    lowest_id = None
    page_attempts = int(m.ceil(remaining / float(max_page)))

    for _ in range(page_attempts):
        if remaining <= 0:
            break
        page_size = min(remaining, max_page)
        params['count'] = page_size
        if lowest_id is not None:
            # max_id is inclusive, so subtract one to skip the tweet we
            # already have.
            params['max_id'] = lowest_id - 1

        request = twitter.request('search/tweets', params)
        if request.status_code == 200:
            page = list(request)
            result['tweets'].extend(page)
            remaining -= page_size
            if not page:
                # Nothing older is available; further pages would be empty.
                break
            page_lowest = min(int(tweet['id']) for tweet in page)
            if lowest_id is None or page_lowest < lowest_id:
                lowest_id = page_lowest
        else:
            sys.stderr.write('Got error: %s \nsleeping for 15 minutes.\n'
                             % request.text)
            sys.stderr.flush()
            time.sleep(60 * 15)
    return result

# Sentiment analysis

In [4]:
def tokenize(text):
    """Split a tweet into lowercase word tokens.

    URLs (anything starting with ``http``) and @mentions are removed, the
    text is lowercased and split on runs of non-word characters, and the
    retweet marker ``rt`` is dropped when it appears as a token of its own.
    Only the standalone token is removed, so words that merely contain
    "rt" (heart, hurt, support, ...) are left intact.

    Args:
      text ... The raw tweet text.
    Returns:
      A list of tokens (possibly empty).
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    tokens = re.sub(r'\W+', ' ', text.lower()).split()
    return [tok for tok in tokens if tok != 'rt']

def get_training_files(path):
    """Return the sorted paths of all ``.txt`` files found under *path*.

    The directory tree rooted at *path* is walked recursively; each
    returned path is the directory joined with the file name.
    """
    return sorted(
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
        if name.endswith(".txt")
    )

In [5]:
def get_clf(c=1, penalty='l2'):
    """Create an unfitted LogisticRegression classifier.

    Args:
      c ......... Value passed as the ``C`` argument.
      penalty ... Value passed as the ``penalty`` argument.
    Returns:
      A LogisticRegression instance with random_state fixed to 42.
    """
    settings = {'random_state': 42, 'C': c, 'penalty': penalty}
    return LogisticRegression(**settings)

def do_vec(texts):
    """Fit a binary bag-of-words vectorizer on *texts*.

    The vectorizer uses this file's ``tokenize`` function, unigrams only,
    binary term presence, and keeps terms seen in at least 2 documents and
    in at most 70% of them.

    Returns:
      A tuple (matrix, vectorizer): the result of fit_transform on *texts*
      and the fitted CountVectorizer.
    """
    vectorizer = CountVectorizer(
        input='content',
        tokenizer=tokenize,
        min_df=2,
        max_df=.7,
        binary=True,
        ngram_range=(1, 1),
    )
    matrix = vectorizer.fit_transform(texts)
    return matrix, vectorizer

In [6]:
# Download the AFINN lexicon, unzip, and read the latest word list in AFINN-111.txt
# NOTE(review): this cell downloads the archive over the network every time it
# runs, and the imports below use the Python 2 module names (StringIO,
# urllib.urlopen).
from StringIO import StringIO
from zipfile import ZipFile
from urllib import urlopen

# The whole archive is read into memory and opened as a zip file from there.
url = urlopen('http://www2.compute.dtu.dk/~faan/data/AFINN.zip')
zipfile = ZipFile(StringIO(url.read()))
afinn_file = zipfile.open('AFINN/AFINN-111.txt')

# Maps a term to its integer AFINN score; used by the sentiment functions below.
afinn = dict()

for line in afinn_file:
    parts = line.strip().split()
    # Only lines that split into exactly two whitespace-separated fields
    # (term, score) are kept; any other line is skipped.
    if len(parts) == 2:
        afinn[parts[0]] = int(parts[1])

def afinn_sentiment(terms, afinn, verbose=False):
    """Sum the positive and negative lexicon scores of a list of terms.

    Args:
      terms ..... An iterable of tokens.
      afinn ..... A dict mapping a term to its integer sentiment score.
      verbose ... If True, print each matched term with its score.
    Returns:
      A tuple (pos, neg): the sum of the scores greater than zero, and the
      sum of the absolute values of the remaining scores. Terms missing
      from the lexicon are ignored.
    """
    pos = 0
    neg = 0
    for term in terms:
        if term not in afinn:
            continue
        value = afinn[term]
        if verbose:
            # A parenthesised single argument prints the same text whether
            # ``print`` is a statement (Python 2) or a function (Python 3).
            print('\t%s=%d' % (term, value))
        if value > 0:
            pos += value
        else:
            neg += -value
    return (pos, neg)

In [7]:
#Lexicon-based (AFINN) sentiment prediction
def get_AFINN_prediction(texts):
    """Score every text with the AFINN lexicon.

    Each text is tokenized and its positive and negative totals are
    combined as (pos - neg) / (pos + neg); a text with no scored term
    gets 0.

    Returns:
      A numpy array with one score per text.
    """
    def normalized_score(text):
        pos, neg = afinn_sentiment(tokenize(text), afinn)
        total = pos + neg
        if total == 0:
            return 0.
        return float(pos - neg) / float(total)

    return np.array([normalized_score(text) for text in texts])

#Combined sentiment prediction for a list of tweets
def get_prediction(texts):
    """Average the classifier prediction and the AFINN score of each text.

    Uses the module-level ``vec`` (fitted vectorizer) and ``clf`` (fitted
    classifier) created by the training cell.

    Returns:
      A numpy array holding, for each text, the mean of the classifier
      prediction (converted to float) and the AFINN score.
    """
    features = vec.transform(texts)
    model_scores = clf.predict(features)
    lexicon_scores = get_AFINN_prediction(texts)

    combined = []
    for model_score, lexicon_score in zip(model_scores, lexicon_scores):
        combined.append((float(model_score) + float(lexicon_score)) / 2.)
    return np.array(combined)

#Most frequently used hashtags
def get_popular_hashtag(texts):
    """Return the ten most common hashtags found in *texts*.

    A hashtag is a ``#`` followed by word characters; the ``#`` itself is
    not part of the returned tag.

    Returns:
      A list of (hashtag, count) tuples, most frequent first.
    """
    hashtag_re = re.compile(r"#(\w+)")
    counts = Counter(
        tag for text in texts for tag in hashtag_re.findall(text)
    )
    return counts.most_common(10)

#Obtain the score based on popularity
def get_popular_score(newest_tweet, oldest_tweet, nTweets):
    """Map how often a brand is tweeted about to a popularity score.

    The time between the oldest and the newest tweet is divided by the
    number of tweets, giving the average number of seconds per tweet; the
    fewer seconds per tweet, the higher the score.

    Args:
      newest_tweet ... Creation time of the newest tweet, formatted like
                       'Mon Jan 01 00:00:00 +0000 2018'.
      oldest_tweet ... Creation time of the oldest tweet, same format.
      nTweets ........ Number of tweets in that time span (must not be 0).
    Returns:
      A score between .2 and 1.
    """
    # Local import: calendar is only needed by this function.
    import calendar

    time_format = '%a %b %d %H:%M:%S +0000 %Y'
    # The timestamps are UTC (+0000), so convert them with timegm();
    # mktime() would interpret them as local time and be off by an hour
    # when the span crosses a daylight-saving change.
    oldest_ts = calendar.timegm(time.strptime(oldest_tweet, time_format))
    newest_ts = calendar.timegm(time.strptime(newest_tweet, time_format))
    rate = float(newest_ts - oldest_ts) / nTweets  # seconds per tweet

    # (maximum seconds per tweet, score), from most to least popular.
    tiers = (
        (10., 1.),             # up to 10 seconds per tweet
        (30., .95),            # up to 30 seconds per tweet
        (60., .9),             # up to 1 minute per tweet
        (600, .85),            # up to 10 minutes per tweet
        (3600, .75),           # up to 1 hour per tweet
        (3600 * 24, .7),       # up to 1 day per tweet
        (3600 * 24 * 7, .65),  # up to 1 week per tweet
        (3600 * 24 * 30, .55), # up to 30 days per tweet
    )
    for max_rate, score in tiers:
        if rate <= max_rate:
            return score
    return .2

In [8]:
def _tier_multiplier(value, tiers):
    """Return the factor of the highest tier whose threshold *value* exceeds.

    *tiers* is a sequence of (threshold, factor) pairs in ascending
    threshold order; 1. is returned when *value* exceeds none of them.
    """
    factor = 1.
    for threshold, tier_factor in tiers:
        if value > threshold:
            factor = tier_factor
    return factor


#Compute the sentiment of the tweets given
def get_reputation_score(info):
    """Compute the reputation score and the popular hashtags of a result.

    The sentiment of each tweet is weighted by its reach (retweets, likes
    and the author's followers), the weighted mean is rescaled from
    [-1, 1] to [0, 1] and combined with the popularity score in a 1:2
    ratio. The result is multiplied by 10000.

    Args:
      info ... A dict with a 'tweets' list; every tweet provides 'text',
               'retweet_count', 'favorite_count', 'created_at' and
               'user' (a dict with 'followers_count').
    Returns:
      A tuple (score, hashtags): the reputation score and the list of
      (hashtag, count) tuples from get_popular_hashtag. With no tweets,
      the sentiment is treated as neutral (.5) and the popularity as 0,
      and the hashtag list is empty.
    """
    tweets = info['tweets']
    nTweets = len(tweets)

    if nTweets == 0:
        # Nothing to score: without this guard the weighted mean below
        # would divide by zero.
        neutral_score = (1. * .5 + 2. * 0.) / 3.
        return neutral_score * 10000, []

    # (threshold, multiplier) tiers; a count must exceed the threshold.
    retweet_tiers = ((50, 3.), (200, 5.), (1000, 10.))
    like_tiers = ((50, 2.), (200, 3.), (1000, 4.))
    follower_tiers = ((500, 3.), (5000, 5.), (50000, 10))

    tweet_texts = [t['text'] for t in tweets]
    avg_predicts = get_prediction(tweet_texts)

    weights = [
        _tier_multiplier(t['retweet_count'], retweet_tiers)
        * _tier_multiplier(t['favorite_count'], like_tiers)
        * _tier_multiplier(t['user']['followers_count'], follower_tiers)
        for t in tweets
    ]

    weighted_sum = sum(p * w for p, w in zip(avg_predicts, weights))
    senti_score = weighted_sum / sum(weights)

    # Normalize to [0,1]
    senti_score = (senti_score + 1) / 2.

    if nTweets > 1:
        # The first tweet is passed as the newest one and the last tweet
        # as the oldest one.
        pop_score = get_popular_score(tweets[0]['created_at'],
                                      tweets[-1]['created_at'], nTweets)
    else:
        pop_score = 0.

    # Ratio between sentiment score : popular score = 1:2
    score = (1. * senti_score + 2. * pop_score) / 3.

    return score * 10000, get_popular_hashtag(tweet_texts)

In [9]:
#Training the system
# Every line of a training file is lowercased and split on tabs. Field 0 is
# the label ('-1' negative, '1' positive) and field 2 is the tweet text;
# field 1 is removed from the text before training (presumably it is the
# search term the tweet was collected for -- TODO confirm).
files = get_training_files('data')

training_tweets = []
for fname in files:
    # ``with`` closes each file as soon as it has been read.
    with open(fname, 'r') as f:
        for line in f:
            toks = line.lower().rstrip('\n').split('\t')
            training_tweets.append(toks)

# Single-argument prints give the same output on Python 2 and Python 3.
print('From %d files' % len(files))
print('%d tweets have been read' % len(training_tweets))
print('-  %d  negative tweets' % len([t for t in training_tweets if t[0]=='-1']))
print('-  %d  positive tweets' % len([t for t in training_tweets if t[0]=='1']))

train_texts = np.array([tweet[2].replace(tweet[1],'') for tweet in training_tweets])
labels = np.array([tweet[0] for tweet in training_tweets])

# clf and vec are module-level names read later by get_prediction().
clf = get_clf(c=1.)
X, vec = do_vec(train_texts)
clf.fit(X, labels)

From 8 files
504 tweets have been read
-  251  negative tweets
-  253  positive tweets


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0)

# Scores display

In [10]:
#Bubble chart of the most used hashtags
def get_words_graph(result):
    """Display a bubble chart of the hashtags stored in result['hashtags'].

    The (hashtag, count) pairs are sorted by ascending count and placed
    from left to right every 5 units starting at x=3. Each bubble sits at
    the height of its count and has a size of five times that count.
    """
    ranked = sorted(result['hashtags'], key=lambda pair: pair[1])
    x_positions = [3 + 5 * index for index in range(len(ranked))]
    counts = [pair[1] for pair in ranked]
    labels = [pair[0] for pair in ranked]
    sizes = [int(pair[1]) * 5 for pair in ranked]

    bubble_outline = dict(width=2)
    bubbles = dict(
        sizemode='diameter',
        sizeref=0.85,
        size=sizes,
        line=bubble_outline,
    )
    trace = go.Scatter(
        x=x_positions,
        y=counts,
        mode='markers+text',
        name='Most used hashtags',
        text=labels,
        marker=bubbles,
    )

    x_axis = dict(
        gridcolor='rgb(255, 255, 255)',
        range=[0, 60],
        type='linear',
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
    y_axis = dict(
        title='Number of tweets',
        gridcolor='rgb(255, 255, 255)',
        range=[-5, 50],
        zerolinewidth=1,
        ticklen=5,
        gridwidth=2,
    )
    layout = go.Layout(
        title='Most used hashtags',
        xaxis=x_axis,
        yaxis=y_axis,
        paper_bgcolor='rgb(243, 243, 243)',
        plot_bgcolor='rgb(243, 243, 243)',
    )

    fig = go.Figure(data=go.Data([trace]), layout=layout)
    display(py.iplot(fig, filename='most-used-hashtags'))

In [11]:
#Internal used functions:
def get_color(points, limits, colors):
    """Return the color of the first range in *limits* that contains *points*.

    Args:
      points ... The score to classify.
      limits ... A list of (low, high) tuples.
      colors ... A list of colors, one per entry of *limits*.
    Returns:
      The matching color, or None if *points* lies in none of the ranges.
      Both bounds of a range are inclusive, so a score exactly on a bound
      (for example the minimum or maximum score) still gets a color.
    """
    for (low, high), color in zip(limits, colors):
        if low <= points <= high:
            return color
    return None

def get_range(points, limits):
    """Return the first (low, high) range in *limits* that contains *points*.

    Both bounds are inclusive, so a score exactly on a bound is matched.
    Returns None if *points* lies in none of the ranges.
    """
    for bounds in limits:
        if bounds[0] <= points <= bounds[1]:
            return bounds
    return None

#Show cities reputation
def show_map_reputation(results):
    """Display a USA map with one colored marker per city.

    Args:
      results ... A list of dicts with the keys 'location' (a
                  (latitude, longitude) tuple), 'city' and 'reputation'.
                  Entries whose 'location' is missing or None are left
                  off the map.

    The marker color goes from red (lowest reputation) to green (highest).
    """
    # Contiguous ranges over the whole 0-10000 scale, so that no score
    # falls into a gap between two ranges and ends up without a color.
    limits = [(0.0, 2000.0), (2000.0, 4000.0), (4000.0, 6000.0),
              (6000.0, 8000.0), (8000.0, 10000.0)]
    colors = ["rgb(255,0,0)", "rgb(255,55,0)", "rgb(255,255,75)",
              "rgb(55,255,0)", "rgb(0,255,0)"]

    located = [c for c in results if c.get('location') is not None]

    cities = [go.Scattergeo(
        lat=[c['location'][0] for c in located],
        lon=[c['location'][1] for c in located],
        marker={"color": [get_color(c['reputation'], limits, colors)
                          for c in located],
                "line": {"width": 1},
                "size": 30
            },
        mode="markers",
        name="",
        text=[c['city'].title() + '<br>Reputation:' + str(c['reputation'])
              for c in located],
        )]

    layout = dict(
            title = 'Reputation by city',
            showlegend = False,
            geo = dict(
                scope='usa',
                showland = True,
                landcolor = 'rgb(217, 217, 217)',
                subunitwidth=1,
                countrywidth=1,
                subunitcolor="rgb(255, 255, 255)",
                countrycolor="rgb(255, 255, 255)"
            ),
        )

    fig = dict(data=cities, layout=layout)
    display(py.iplot(fig, validate=False, filename='reputation-by-city'))

#Reputation points as an HTML heading
def show_points(result):
    """Display the reputation score of one result as HTML.

    The heading names the brand and, unless the city is 'general', the
    title-cased city; the score is shown with two decimals out of 10000.
    """
    score_text = "%.2f" % result['reputation']
    place = ''
    if result['city'] != 'general':
        place = """ in """ + result['city'].title()

    pieces = [
        """
    <h1>Reputation score for """,
        result['brand'],
        place,
        """:</h1>
    <h2>""",
        score_text,
        """/10000 points</h2>
    """,
    ]
    display(HTML(''.join(pieces)))


#Display every result of a query
def show_results(results):
    """Show the score and the hashtag chart of every result.

    The reputation map is shown as well, unless there is exactly one
    result.
    """
    for entry in results:
        show_points(entry)
        get_words_graph(entry)
    if len(results) != 1:
        show_map_reputation(results)
        
def get_reputation(brand, cities, numtweets=100, config_file='twitter.cfg'):
    """Compute and display the reputation of a brand for each city.

    Args:
      brand ......... The brand name used as the search query.
      cities ........ A list of city names; use 'general' as a city for a
                      search without geographic filter. A single string
                      is treated as a one-element list.
      numtweets ..... How many tweets to request per city (default 100).
      config_file ... Path of the file with the Twitter credentials
                      (default 'twitter.cfg').
    """
    if isinstance(cities, six.string_types):
        # Iterating a bare string would query one "city" per character.
        cities = [cities]

    twitter = get_twitter(config_file)
    results = []
    for city in cities:
        query = {'brand': brand, 'city': city}
        info = request_by_location(twitter, query, numtweets)
        info['reputation'], info['hashtags'] = get_reputation_score(info)
        results.append(info)
    show_results(results)

# Reputation request

In [12]:
# Brand to search for.
brand_name = 'Apple'
# Either a list of city names (one result per city, plus a map) ...
cities = ['san francisco', 'new york']
# ... or ['general'] for a single search without geographic filter:
#cities = ['general']
# Fetch the tweets, score them and display the results.
get_reputation(brand_name, cities)