# Twitter Data


In [1]:
%run functions.ipynb

In [2]:
import snscrape.modules.twitter as sntwitter

import json
import os
import datetime
from nltk.tokenize import TweetTokenizer, WordPunctTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer

In [3]:
def load_tweets(tfile):
    """
    Load tweets from a file holding one JSON document per line.

    Lines that are not valid JSON (including blank lines) are skipped,
    so a partially corrupted file still yields every readable tweet.

    NOTE(review): ranged_query in this notebook writes a single JSON
    array rather than one object per line; loading such a file here
    yields a one-element list containing that array - confirm which
    format callers intend to pass.

    Args:
        - tfile: path of the file to read

    Returns:
        list with the decoded value of every parseable line, in file order
    """
    tweets = []
    # 'with' guarantees the handle is closed even if reading fails midway
    with open(tfile, encoding="utf-8") as tweet_file:
        for line in tweet_file:
            try:
                tweets.append(json.loads(line))
            except ValueError:
                # json.JSONDecodeError subclasses ValueError; only malformed
                # lines are skipped, other errors (e.g. KeyboardInterrupt)
                # propagate instead of being silently swallowed
                continue

    return tweets

In [4]:
def process_tweet(tweet):
    """
    Tokenize a tweet and attach averaged VAD scores to it, in place.

    Keys written to ``tweet``:
        - 'tokens': result of ``tt.tokenize(tweet['text'])``
        - 'Valence', 'Dominance', 'Arousal': mean of the 'V' / 'D' / 'A'
          scores over the tokens found in NRC_VAD (0 when none matched)
        - 'VAD_toks': one dict per matched token, holding a copy of the
          lexicon entry plus the original token under 'tok'

    NOTE(review): ``tt`` and ``NRC_VAD`` are not defined in this section;
    presumably they come from ``%run functions.ipynb`` - confirm.

    Args:
        - tweet: dict with a 'text' entry

    Returns:
        None: the tweet dict is modified in place
    """
    toks = tt.tokenize(tweet['text'])
    tweet['tokens'] = toks

    tweet['Valence'] = 0
    tweet['Dominance'] = 0
    tweet['Arousal'] = 0

    tweet['VAD_toks'] = []

    for t in toks:
        key = t.lower()
        if key in NRC_VAD:
            # Copy the lexicon entry: writing 'tok' into the shared dict
            # would pollute NRC_VAD and make every occurrence of the same
            # word alias one object (the last 'tok' overwriting the rest).
            scores = dict(NRC_VAD[key])
            scores['tok'] = t

            tweet['Valence'] += scores['V']
            tweet['Arousal'] += scores['A']
            tweet['Dominance'] += scores['D']

            tweet['VAD_toks'].append(scores)

    # Turn the sums into means; skipped when nothing matched so the
    # scores stay at 0 and no division by zero occurs.
    matched = len(tweet['VAD_toks'])
    if matched:
        for dimension in ('Valence', 'Arousal', 'Dominance'):
            tweet[dimension] /= matched

In [5]:
def download_query_tweets(query, date_since, date_until, max=1000):
    """
    Scrape tweets matching a query inside a date window.

    The search string is the query followed by ``since:`` / ``until:``
    filters built from the two date arguments.

    Args:
        - query: search term
        - date_since: value placed after 'since:' in the search string
        - date_until: value placed after 'until:' in the search string
        - max: upper bound on the number of tweets collected

    Returns:
        list of dicts with keys 'id', 'created_at', 'text', 'username'
        and 'query' (the search term as passed in, without date filters)
    """
    print(f"Downloading tweets for query: '{query}' from {date_since} to {date_until} (max of {max})")

    search_string = f'{query} since:{date_since} until:{date_until}'
    scraper = sntwitter.TwitterSearchScraper(search_string)

    collected = []
    for position, item in enumerate(scraper.get_items()):
        # stop as soon as the requested number of tweets has been gathered
        if position >= max:
            break

        record = {}
        record['id'] = item.id
        record['created_at'] = item.date.strftime('%Y-%m-%d %H:%M')
        record['text'] = item.content
        record['username'] = item.username
        record['query'] = query
        collected.append(record)

    return collected

In [6]:
def ranged_query(date_range,query,out_path,max=1000):
    """
    Collect tweets for every query term within a date range and save
    them together as one JSON file, grouped into a per-year folder.

    The file is written to
    ``{out_path}/{year}/{since}_{until}_{terms joined by '_'}.json``
    where ``year`` is the part of the start date before the first '-'.
    The file holds a single JSON array with the tweets of all terms.

    Args:
        - date_range: tuple containing start and end '%Y-%m-%d'
        - query: list containing searchable terms
        - out_path: dir to save json files
        - max: maximum number of tweets to search per term

    Returns:
        None: saves to disc
    """
    since, until = date_range

    # download each term separately and pool the results
    all_tweets = []
    for qu in query:
        all_tweets.extend(download_query_tweets(qu, since, until, max=max))

    year = since.split('-')[0]  # year the query is for
    year_dir = f"{out_path}/{year}"

    # exist_ok avoids the check-then-create race of testing for the
    # folder first and creating it afterwards
    os.makedirs(year_dir, exist_ok=True)

    out_file_name = f"{year_dir}/{since}_{until}_{'_'.join(query)}.json"
    with open(out_file_name, 'w') as out_file:
        json.dump(all_tweets, out_file)
        

In [7]:
# Two collection windows per year (early April and mid September),
# newest year first: 2020 down to 2015.
dates = [
    (f"{year}-{start}", f"{year}-{end}")
    for year in range(2020, 2014, -1)
    for start, end in (("04-01", "04-11"), ("09-12", "09-23"))
]

# search terms collected for every window
queries = ['#ncaa', 'ncaa', '#title9']

# one output file per window, at most 60 tweets per term, saved under 'data'
for daterange in dates:
    ranged_query(daterange, queries, 'data', max=60)

Downloading tweets for query: '#ncaa' from 2020-04-01 to 2020-04-11 (max of 60)
Downloading tweets for query: 'ncaa' from 2020-04-01 to 2020-04-11 (max of 60)
Downloading tweets for query: '#title9' from 2020-04-01 to 2020-04-11 (max of 60)
Downloading tweets for query: '#ncaa' from 2020-09-12 to 2020-09-23 (max of 60)
Downloading tweets for query: 'ncaa' from 2020-09-12 to 2020-09-23 (max of 60)
Downloading tweets for query: '#title9' from 2020-09-12 to 2020-09-23 (max of 60)
Downloading tweets for query: '#ncaa' from 2019-04-01 to 2019-04-11 (max of 60)
Downloading tweets for query: 'ncaa' from 2019-04-01 to 2019-04-11 (max of 60)
Downloading tweets for query: '#title9' from 2019-04-01 to 2019-04-11 (max of 60)
Downloading tweets for query: '#ncaa' from 2019-09-12 to 2019-09-23 (max of 60)
Downloading tweets for query: 'ncaa' from 2019-09-12 to 2019-09-23 (max of 60)
Downloading tweets for query: '#title9' from 2019-09-12 to 2019-09-23 (max of 60)
Downloading tweets for query: '#ncaa