In [1]:
import tweepy
import constants



### Import Keys

In [2]:
consumer_key = constants.TWITTER_PUBLIC_API_KEY
consumer_secret = constants.TWITTER_PRIVATE_API_KEY
access_token = constants.TWITTER_ACCESS_TOKEN
access_token_secret = constants.TWITTER_PRIVATE_ACCESS_TOKEN
bearer_token = constants.TWITTER_BEARER_TOKEN

# auth = tweepy.OAuth1UserHandler(
#    consumer_key, consumer_secret, access_token, access_token_secret
# )



### Create Tweepy Client using bearer token

In [3]:

client = tweepy.Client(bearer_token=bearer_token)



#### Build query

Want to have it exclude if multiple team tags  
Can't include them all however as would go over the query character limit of ~512 

Would limit the amount of tweets coming in however would also have to then filter again afterwards to remove the hastags that I couldan't include in the query  

Approach 1:
- build query and include first 50% of negated hashtags 
- filter tweets after using 2nd half of hashtags

Approach 2:
- build query and leave all hashtags
- filter it afterwards for tweets with any other hashtag

No idea which is more efficient

In [4]:
def get_non_team_hashtags_to_negate(team_hashtags, hashtag_list):
    output = [b for b in hashtag_list if all(a not in b for a in team_hashtags)]
    return output


In [15]:
import hashtags_sm
from hashtags_stripped import ht_flat, ht_half_1, ht_half_2

# test team list
team_list = ["bears", "bengals", "saints"]

# build twitter query for a team 
# take team name in dictionary and hashtag list
# Ex "(#RuleTheJungle OR #Bengals OR #WhoDey) -is:retweet"
# Important to have the parathensis around the main hashtags or else the retweet parameter won't work

def build_team_query(team_name, hashtag_doc, hashtags_all):
    
    # retrieve team hashtags in list
    team_hashtags = hashtag_doc[str(team_name)]

    #retrieve non-team hashtags in list
    negate_team_hashtags = get_non_team_hashtags_to_negate(team_hashtags, hashtags_all)
    
    # build start of query
    query = '(' + team_hashtags[0]
    
    # if more than one hashtag, combine using OR statements
    if (len(team_hashtags) > 1): 
        for tag in team_hashtags[1:]:
            query = query + ' OR ' + tag

    # filter out retweets? - not sure if I should do this or not, likely retweet something you also think
    query = query + ") -is:retweet lang:en"
    
    # filter out other team hashtags
    for tag in negate_team_hashtags:
        # stop if length of query is above 500
        # limit is 512
        if len(query) > 495:
            break
        query = query + " -"+ tag
    print("Query Length: ", len(query))
    return query

def get_all_team_queries(team_list, team_hashtags, negate_hashtags):
    queries = {}
    for team in team_list:
        queries[team] = build_team_query(team ,team_hashtags, negate_hashtags)
    return queries


team_queries = get_all_team_queries(team_list, hashtags_sm.hashtags, ht_half_1)
team_queries[str('bears')]

Query Length:  466
Query Length:  466
Query Length:  471


'(#DaBears OR #BearsNation OR #Bears) -is:retweet lang:en -#DirtyBirds -#Falcons -#BirdCityFootball -#Cardinals -#RavensFlock -#Ravens -#BillsMafia -#GoBills -#Bills -#KeepPounding -#panthers -#RuleTheJungle -#Bengals -#WhoDey -#Browns -#ClevelandBrowns  -#DawgPound -#DallasCowboys -#Cowboys -#BroncosCountry -#LetsRide -#Broncos -#OnePride -#Lions  -#GoPackGo -#Packers -#WeAreTexans -#Texans -#ForTheShoe -#Colts -#DUUUVAL -#Jags -#Jaguars -#ChiefsKingdom -#Chiefs'

In [7]:
# build search tweets 

user_fields = ["id"]
expansions = ["author_id"]

# max_results min = 10
def build_team_search(team_query, max_results = 10):
    team_response = client.search_recent_tweets(team_query, max_results=max_results, tweet_fields=tweet_fields, user_fields=user_fields, expansions=expansions)
    return team_response

# bengals_response = build_team_search(query_bengals, 10)

# Pagination

Requests limited to 100 tweets at a time

Must paginate requests using ```tweepy.Paginator```   
```limit```: sets how many pages  
```max_results```: sets how many results per page (limited to 100)  
Total Results = limit * max_results  

```start_time```: start of when to retrieve tweets  
```end_time```: end of when to retrieve tweets  
**Note**: times are in format of ```2022-11-24T15:25:00Z```  
This is ZULU time which is **7 hours ahead of MST**  


In [59]:

def paginate_search(team_query, max_results, limit=20):
    tweet_fields = ["author_id", "created_at", "text"]
    paginator = tweepy.Paginator(
        client.search_recent_tweets, 
        query = team_query, 
        tweet_fields = tweet_fields, 
        # start_time='2022-11-24T18:16:15Z', 
        # end_time='2022-11-24T22:16:15Z',
        max_results = max_results,
        limit = limit
    )
    # .flatten(limit = limit)

    return paginator

    # start_time=2019-01-01T17:00:00Z
    # end_time=2020-12-12T01:00:00Z

    

#### Paginate for Each Team Query


In [60]:
def get_all_team_paginators(team_queries, max_results=10, limit = 3):
    # create empty dict for resulting paginators
    paginators = {}

    # iterate through dict
    for team_key, team_query in team_queries.items():
        team_paginator = paginate_search(team_query, max_results, limit)
        paginators[team_key] = team_paginator
    return paginators



In [61]:
team_paginators = get_all_team_paginators(team_queries, 10, 2)

### Create team paginator

In [62]:
team_paginators

{'bears': <tweepy.pagination.Paginator at 0x1f273363790>,
 'bengals': <tweepy.pagination.Paginator at 0x1f274c268f0>,
 'saints': <tweepy.pagination.Paginator at 0x1f274c26cb0>}

## Put tweets in dataframe or some format

- Tweet Team
- Tweet Text
- Tweet Author
- Tweet Date Time

# Create CSV file for team

In [63]:
import json
import csv
import tweepy
import re
import io
import sys

def create_dataset(paginator, team):
    with open('team_data/%s_data.csv' % (team), 'w', encoding="utf-8") as file:
        w = csv.writer(file)
        
        # Write header row (feature column names of your choice)
        w.writerow(['team',
                     'timestamp', 
                     'tweet_text', 
                     'userid' 
                     ])
        for page in paginator: 
            # For each tweet matching hashtag, write relevant info to the spreadsheet
            for tweet in page.data:
                w.writerow([team,
                            tweet.created_at, 
                            tweet.text.replace('\n',' ').encode('utf-8'), 
                            tweet.author_id, 
                            ])

In [64]:
def create_all_team_datasets(team_paginators):
    # iterate through team paginator dict
    for team_key, team_paginator in team_paginators.items():
        # pass each to create_dataset
        create_dataset(team_paginator, team_key)

In [65]:
create_all_team_datasets(team_paginators)

## Need to filter out multiteam hashtags