In [26]:
import tweepy
import os
import requests

In [29]:
auth = tweepy.AppAuthHandler(os.environ["TWITTER_CONSUMER_KEY"], os.environ["TWITTER_CONSUMER_SECRET"])

api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

if (not api):
    print("Can't Authenticate")
    sys.exit(-1)

In [31]:
user_file = open('influencer_twitter.txt')
user_list = user_file.readlines()

In [32]:
for index, user in enumerate(user_list):
    user_list[index] = user.rstrip('\n')

In [33]:
#function to get tweets from a user
def search_tweets(user):
    tweets = []
    count = 0
    try: 
        new_tweets = tweepy.Cursor(api.user_timeline, screen_name=user, exclude_replies=True, count=200, tweet_mode='extended').pages(20)

        print('Starting download for {0}'.format(user))

        for page in new_tweets:
            if count%4==0:
                print('Downloading page {0}'.format(count))
            tweets.extend(page)
            count += 1
    except tweepy.TweepError as e:
        print(e)
    
    print('Finished downloading {0} tweets for {1}'.format(len(tweets), user))
    return tweets

In [11]:
search_tweets('JuneStoyer')

Starting download for JuneStoyer
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Downloading page 0
Finished downloading 3168 tweets for JuneStoyer


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


## Import tweets to MongoDB

In [3]:
from pymongo import MongoClient

In [4]:
client = MongoClient()

In [5]:
db = client.cleantech_db

In [6]:
col = db.tweets_collection

In [24]:
col.find()[21]

{'_id': ObjectId('599b9e696d2418254072512d'),
 'user': 'drvox'}

In [32]:
for user in user_list[80:]:
    all_tweets_raw = search_tweets(user)
    all_tweets = ''
    for tweet in all_tweets_raw:
        all_tweets += tweet.full_text + ' '
    user_dict = {}
    user_dict['user'] = user
    user_dict['all_tweets_raw'] = all_tweets
    col.insert_one(user_dict)

Starting download for SuzanneWaldman
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 1870 tweets for SuzanneWaldman
Starting download for robintransition
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 2485 tweets for robintransition
Starting download for Green_Living1
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 3236 tweets for Green_Living1
Starting download for mark_lynas
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 2241 tweets for mark_lynas
Starting download for Bentler
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 3000 tweets for Bentler
Starting download for elaineishere
Downloading page 0
Downloading page 4
Downloading page 8


In [34]:
#delete documents that didn't return any tweets
col.delete_many({'all_tweets_raw': ''})

<pymongo.results.DeleteResult at 0x7efe89815288>

In [36]:
col.count()

100

## Preprocessing tweets

In [8]:
import re
import sys
import unicodedata
from string import digits
import html

In [40]:
for user in col.find():
    tweets_clean = html.unescape(user['all_tweets_raw']) #convert html entities to string
    tweets_clean = ''.join(c for c in unicodedata.normalize('NFC', tweets_clean) if c <= '\uFFFF') #remove emojis
    tweets_clean = re.sub('([%#\/]+)|(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)|(^rt)|(^RT)|(^Rt)|(\sRT\s)|(\sRt\s)|(\srt\s)|(http.+?)', '', tweets_clean).strip()
    tweets_clean = ''.join([i for i in tweets_clean if not i.isdigit()])
    col.update_one({'_id': user['_id']}, {'$set':{'tweets_clean':tweets_clean}})
    

In [19]:
for user in col.find():
    tweets_clean = html.unescape(user['all_tweets_raw']) #convert html entities to string
    tweets_clean = ''.join(c for c in unicodedata.normalize('NFC', tweets_clean) if c <= '\uFFFF') #remove emojis
    tweets_clean = re.sub('(@[A-Za-z0-9_]+)|(\w+:\/\/\S+)', '', tweets_clean).strip()
    tweets_clean = re.sub('(^rt)|(^RT)|(^Rt)|(\sRT\s)|(\sRt\s)|(\srt\s)', '.', tweets_clean)
    tweets_clean = re.sub('( http)', '.', tweets_clean)
    tweets_clean = ''.join([i for i in tweets_clean if not i.isdigit()])
    col.update_one({'_id':user['_id']}, {'$set':{'tweets_sentences':tweets_clean}})

# Process tweets individually for sentiment analysis

In [93]:
#new collection for individual tweets rather than bag of words
indiv_col = db.individual_tweets

In [96]:
for user in user_list[80:]:
    tweets = search_tweets(user)
    tweet_list = []
    for i in range(len(tweets)):
        text = tweets[i].full_text
        tweet_list.append(text)
    user_dict = {}
    user_dict['user'] = user
    user_dict['individual_tweets'] = tweet_list
    indiv_col.insert_one(user_dict)

Starting download for SuzanneWaldman
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 1885 tweets for SuzanneWaldman
Starting download for robintransition
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 2490 tweets for robintransition
Starting download for Green_Living1
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 3212 tweets for Green_Living1
Starting download for mark_lynas
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 2241 tweets for mark_lynas
Starting download for Bentler
Downloading page 0
Downloading page 4
Downloading page 8
Downloading page 12
Downloading page 16
Finished downloading 2980 tweets for Bentler
Starting download for elaineishere
Downloading page 0
Downloading page 4
Downloading page 8


In [98]:
indiv_col.find_one({'user': 'JuneStoyer'})['individual_tweets']

['RT @h0t_p0ppy: #OpKillingBay #EU \nLetting a child see an animal in distress will cause the child distress #FaroeIslands https://t.co/jq3AfV…',
 'RT @xavierkatana: https://t.co/qaXEZURhA4',
 "RT @ThankTankCr8: My #AnimalsAreNotFood #stickers arrived early! Want some? PM me your address and I'll send you some! #GoVegan #Vegan http…",
 'RT @seashepherd: Please #SignAndShare this petition from @SeaShepherdFran:\nFrance must stop the shark cull in La Reunion Island.\nhttps://t.…',
 "RT @h0t_p0ppy: 50 dolphins killed because they didn't want to give up the chase. Meat no one wants #FaroeIslands @seashepherd #OpKillingBay…",
 'RT @h0t_p0ppy: 1207 dolphins now slaughtered since may. This is worse than #Taiji Japan. This is #FaroeIslands @Tinganes #OpKillingBay http…',
 'RT @AlbiDeak: Video 21th August 2017: \n 40 Dolphins were barbaric hunted and slaughtered in Faroe Islands at beach in Skálafirði.\n#OpKillin…',
 'RT @JazzyDolphin: @TomHall If you #LOVE #Dolphins plz join #OpKillingBay #EU\