In [1]:
from senti_classifier import senti_classifier
from TwitterAPI import TwitterAPI
from datetime import datetime
import ConfigParser
import pickle
import nltk
import csv
import re
import os

In [2]:
# Connect to the Twitter API and return a TwitterAPI object to use.
def get_twitter(config_file):
    config = ConfigParser.ConfigParser()
    config.read(config_file)
    twitter = TwitterAPI(
                   config.get('twitter', 'consumer_key'),
                   config.get('twitter', 'consumer_secret'),
                   config.get('twitter', 'access_token'),
                   config.get('twitter', 'access_token_secret'))
    return twitter

twitter = get_twitter('twitter.cfg')
print('Established Twitter connection.')

Established Twitter connection.


In [3]:
def process_tweet(tweet):
    #Convert to lower case
    tweet = tweet.lower()
    #Convert www.* or https?://* to URL
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
    #Convert @username to AT_USER
    tweet = re.sub('@[^\s]+','AT_USER',tweet)
    #Remove additional white spaces
    tweet = re.sub('[\s]+', ' ', tweet)
    #Replace #word with word
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    #trim
    tweet = tweet.strip('\'"')
    
    return tweet


def extract_id(tweet_request_response, min_id=998816577052196870):
    """
    Extract the min of tweets' id, use a bad hack to initialize max_id
    """
    for tweet in tweet_request_response:
        if tweet['id'] < min_id:
            min_id = tweet['id']
        
    return min_id

def gather_tweets(search_query, since, until):
    tweets = twitter.request('search/tweets',{'q':search_query+' since:'+since+' until:'+until, 'lang':'en', 'count':100})
    min_id = extract_id(tweets)
    tweets_list=[]
    
    while True:
            if not tweets.get_rest_quota():
                return tweets_list
        
            try:
                tweets = twitter.request('search/tweets',
                             {'q' : search_query+' since:'+since+' until:'+until, 
                              'lang' : 'en', 
                              'max_id' : min_id-1,
                              'count' : 100})
            except:
                print 'No more quota !'
                return tweets_list
            
            for tweet in tweets:
                process_text = process_tweet(tweet['text'])
                tweets_list.append({'text': process_text, 
                                    'author': tweet['user']['name'], 
                                    'date': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y-%m-%d'),
                                    'time': datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime('%H:%M:%S'),
                                   })
                
            if (extract_id(tweets, min_id=min_id) == min_id):
                return tweets_list
            else:
                min_id = extract_id(tweets, min_id=min_id)
            
            print 'Next max_id: ' + str(min_id)
            print 'Remaining quota: ' + str(tweets.get_rest_quota()['remaining'])
    
    return tweets_list 


In [7]:
def dump_data_stock(period, stock, path):
    tweets_dico_list = gather_tweets(stock, period[0], period[1])
    print 'Period ' + period[0] + ' to ' + period[1] + ' gathered.'
    pickle.dump(tweets_dico_list, open(path, 'wb' ))
    print 'Period ' + period[0] + ' to ' + period[1] + ' written in file.'
    print len(tweets_dico_list)
    return dump_data_stock
    
periods = [['2015-11-30', '2015-12-01'], 
           ['2015-12-01', '2015-12-02'], 
           ['2015-12-02', '2015-12-03'], 
           ['2015-12-03', '2015-12-04'], 
           ['2015-12-04', '2015-12-05'], 
           ['2015-12-05', '2015-12-06'],
           ['2015-12-06', '2015-12-07'],
           ['2015-12-07', '2015-12-08'],
           ['2015-12-08', '2015-12-09'],
           ['2015-12-09', '2015-12-10'],
           ['2015-12-10', '2015-12-11']]

for period in periods:
    path = 'data_apple'+ os.sep + 'save-' + period[0] + '-to-' + period[1] + '.pkl'
    dump_data_stock(period, '%24AAPL', path)
    
for period in periods:
    path = 'data_netflix'+ os.sep + 'save-' + period[0] + '-to-' + period[1] + '.pkl'
    dump_data_stock(period, '%24NFLX', path)

Next max_id: 673977794888531968
Remaining quota: 125
Next max_id: 673961734189817856
Remaining quota: 124
Next max_id: 673943857571037184
Remaining quota: 123
Next max_id: 673934721609416704
Remaining quota: 122
Next max_id: 673925921028505600
Remaining quota: 121
Next max_id: 673918760659456000
Remaining quota: 120
Next max_id: 673905283882610688
Remaining quota: 119
Next max_id: 673889704018894848
Remaining quota: 118
Next max_id: 673883174116659200
Remaining quota: 117
Next max_id: 673875823796092928
Remaining quota: 116
Next max_id: 673866105442598912
Remaining quota: 108
Next max_id: 673852234514477057
Remaining quota: 107
Next max_id: 673828049536856064
Remaining quota: 106
Next max_id: 673789105466134528
Remaining quota: 105
Next max_id: 673754400767234052
Remaining quota: 104
Next max_id: 673719264528965632
Remaining quota: 103
Next max_id: 673676692439732225
Remaining quota: 102
Next max_id: 673656103037612034
Remaining quota: 101
Next max_id: 673654010549698560
Remaining quot

In [8]:
def compute_feeling_avg(tweets_dico_list):
    texts_list = []
    for tweet in tweets_dico_list:
        texts_list.append(tweet['text'])
    print 'list finished'
    return senti_classifier.polarity_scores(texts_list)

In [9]:
def create_csv(periods, pre_path, csv_file):
    results_list = [['2015-11-30', 343, 61.75, 40.5], ['2015-12-01', 487, 92.75, 45.125], ['2015-12-02', 679, 115.375, 53.125], ['2015-12-03', 508, 101.375, 37.125], ['2015-12-04', 256, 76.25, 26.125], ['2015-12-05', 114, 21.625, 12.875], ['2015-12-06', 147, 34.125, 7.625], ['2015-12-07', 300, 54.0, 19.375], ['2015-12-08', 324, 70.75, 28.625]]
    
    for period in periods:
        tweets_dico_list = pickle.load(open( pre_path + 'save-' + period[0] + '-to-' + period[1] + '.pkl', "rb" ))
        feeling = compute_feeling_avg(tweets_dico_list)
        results_list.append([tweets_dico_list[0]['date'], len(tweets_dico_list), feeling[0], feeling[1]])
    
    print results_list
    
    with open(csv_file, "wb") as f:
        writer = csv.writer(f)
        writer.writerows(results_list)
        
    return results_list

In [10]:
periods = [['2015-11-30', '2015-12-01'],
           ['2015-12-01', '2015-12-02'],
           ['2015-12-02', '2015-12-03'],
           ['2015-12-03', '2015-12-04'],
           ['2015-12-04', '2015-12-05'],
           ['2015-12-05', '2015-12-06'],
           ['2015-12-06', '2015-12-07'],
           ['2015-12-07', '2015-12-08'],
           ['2015-12-08', '2015-12-09']]

periods = [['2015-12-09', '2015-12-10'],
           ['2015-12-10', '2015-12-11']]

pre_path = './data_google/'
csv_file = './csv/csv_google_2.csv'

result_google = create_csv(periods, pre_path, csv_file)

list finished
list finished
[['2015-11-30', 343, 61.75, 40.5], ['2015-12-01', 487, 92.75, 45.125], ['2015-12-02', 679, 115.375, 53.125], ['2015-12-03', 508, 101.375, 37.125], ['2015-12-04', 256, 76.25, 26.125], ['2015-12-05', 114, 21.625, 12.875], ['2015-12-06', 147, 34.125, 7.625], ['2015-12-07', 300, 54.0, 19.375], ['2015-12-08', 324, 70.75, 28.625], ['2015-12-09', 320, 72.875, 30.375], ['2015-12-10', 445, 89.625, 44.5]]


## Results GOOGLE
- Period 2015-11-23 to 2015-11-24 = (692.625, 432.375)
- Period 2015-11-24 to 2015-11-25 = (709.5, 446.0)
- Period 2015-11-25 to 2015-11-26 = (726.375, 459.625)
- Period 2015-11-26 to 2015-11-27 = (163.125, 101.625)
- Period 2015-11-29 to 2015-11-30 = (675.75, 418.75)