In [4]:
import tweepy
import constants
import pandas as pd
import datetime as dt

### Create Tweepy Client using bearer token

In [5]:
# Read the bearer token from the local `constants` module so the secret is not
# written into the notebook itself.
bearer_token = constants.TWITTER_BEARER_TOKEN

# wait_on_rate_limit=True makes Tweepy sleep until the rate-limit window resets
# instead of raising tweepy.TooManyRequests part-way through a long paginated
# run (the notebook later notes requests "tapping out" when fetching all games).
client = tweepy.Client(bearer_token=bearer_token, wait_on_rate_limit=True)


#### Build query

Want the query to exclude tweets that tag multiple teams.  
Can't include every other team's hashtag, however, as that would go over the query character limit of ~512.

Negating hashtags in the query would limit the number of tweets coming in; however, I would then also have to filter again afterwards to remove the hashtags that I couldn't include in the query.  

Approach 1:
- build query and include first 50% of negated hashtags 
- filter tweets after using 2nd half of hashtags

In [3]:
def get_non_team_hashtags_to_negate(team_hashtags, hashtag_list):
    team_hashtags = [team.lower() for team in team_hashtags]
    hashtag_list = [hashtag.lower() for hashtag in hashtag_list]
    output = [b for b in hashtag_list if all(a not in b for a in team_hashtags)]
    return output


In [4]:
from hashtags_all import hashtags
from hashtags_stripped import ht_flat, ht_half_1, ht_half_2

# test_team_list = ["bears", "bengals", "saints"]
# Team names are the keys of the imported `hashtags` mapping; build_team_query
# later indexes that mapping by team name to get the team's hashtags.
team_list = hashtags.keys()

# build twitter query for a team 
# take team name in dictionary and hashtag list
# Ex "(#RuleTheJungle OR #Bengals OR #WhoDey) -is:retweet"
# Important to have the parentheses around the main hashtags or else the retweet parameter won't work

def build_team_query(team_name, hashtag_doc, hashtags_all, stop_length=495, max_length=512):
    """Build the Twitter search query string for one team.

    The result has the shape
    ``(#tagA OR #tagB) -is:retweet lang:en -#other1 -#other2 ...``.
    The parentheses around the team's hashtags are required so that the
    trailing operators apply to the whole OR group.

    Args:
        team_name: Key into ``hashtag_doc``; converted with ``str()`` before lookup.
        hashtag_doc: Mapping of team name to that team's hashtag strings.
        hashtags_all: Candidate hashtags to negate; the team's own hashtags are
            removed via ``get_non_team_hashtags_to_negate``.
        stop_length: Once the query is longer than this, no more negations are added.
        max_length: Hard cap on the query length; a negation that would push the
            query past this is not added.

    Returns:
        The query string.

    Raises:
        KeyError: if ``team_name`` is not in ``hashtag_doc``.
        ValueError: if the team has no hashtags (the OR group would be empty).
    """
    # retrieve team hashtags in list
    team_hashtags = hashtag_doc[str(team_name)]
    if not team_hashtags:
        raise ValueError('No hashtags configured for team: %s' % team_name)

    # retrieve non-team hashtags in list
    negate_team_hashtags = get_non_team_hashtags_to_negate(team_hashtags, hashtags_all)

    # OR the team's hashtags together, then exclude retweets and non-English tweets
    query = '(' + ' OR '.join(team_hashtags) + ') -is:retweet lang:en'

    # filter out other team hashtags
    for tag in negate_team_hashtags:
        if len(query) > stop_length:
            break
        negation = ' -' + tag
        # Check the length *after* appending: the previous guard only looked at
        # the current length, so one long tag could push the query over the limit.
        if len(query) + len(negation) > max_length:
            break
        query += negation
    return query

def get_all_team_queries(team_list, team_hashtags, negate_hashtags):
    """Build the search query for every team in ``team_list``.

    Returns a dict mapping each team to the string produced by
    ``build_team_query(team, team_hashtags, negate_hashtags)``.
    """
    return {
        name: build_team_query(name, team_hashtags, negate_hashtags)
        for name in team_list
    }





### Create Team Queries
*Only pass in half of the hashtags list to limit the length of the query  
Will use the other half ```ht_half_2``` to filter out after the fact

Will also have to filter out any other hashtags like #nhl or #college etc.

In [5]:
# Build one query per team, negating only the hashtags in ht_half_1 so each
# query stays short.
team_queries = get_all_team_queries(team_list, hashtags, ht_half_1)
len(team_queries)

# Display the generated queries for inspection.
team_queries

{'falcons': '(#DirtyBirds OR #Falcons) -is:retweet lang:en -#birdcityfootball -#cardinals -#ravensflock -#ravens -#billsmafia -#gobills -#bills -#keeppounding -#panthers -#dabears -#bearsnation -#bears -#rulethejungle -#bengals -#whodey -#browns -#clevelandbrowns  -#dawgpound -#dallascowboys -#cowboys -#broncoscountry -#letsride -#broncos -#onepride -#lions  -#gopackgo -#packers -#wearetexans -#texans -#fortheshoe -#colts -#duuuval -#jags -#jaguars -#chiefskingdom -#chiefs',
 'cardinals': '(#BirdCityFootball OR #cardinals OR #AZCardinals) -is:retweet lang:en -#dirtybirds -#falcons -#ravensflock -#ravens -#billsmafia -#gobills -#bills -#keeppounding -#panthers -#dabears -#bearsnation -#bears -#rulethejungle -#bengals -#whodey -#browns -#clevelandbrowns  -#dawgpound -#dallascowboys -#cowboys -#broncoscountry -#letsride -#broncos -#onepride -#lions  -#gopackgo -#packers -#wearetexans -#texans -#fortheshoe -#colts -#duuuval -#jags -#jaguars -#chiefskingdom -#chiefs',
 'ravens': '(#ravens) 

#### Individual Team Search - No Pagination
Can use for testing

In [91]:
# build search tweets
# will use paginate normally, test case
# Notebook-level request settings; build_team_search passes them straight
# through as the `user_fields` and `expansions` arguments of the search call.
user_fields = ["id"]
expansions = ["author_id"]

# max_results min = 10
def build_team_search(team_query, max_results = 10):
    """Run a single, non-paginated recent-tweet search for ``team_query``.

    Relies on the notebook-level ``client``, ``user_fields`` and ``expansions``.
    Returns whatever ``client.search_recent_tweets`` returns.
    """
    search_options = {
        "max_results": max_results,
        "tweet_fields": ["author_id", "created_at", "text"],
        "user_fields": user_fields,
        "expansions": expansions,
    }
    return client.search_recent_tweets(team_query, **search_options)

# Smoke test: run one 10-result search with the panthers query and display
# the response's data.
panthers_resp = build_team_search(team_queries[str('panthers')], max_results=10)
panthers_resp.data

### Get times and games from schedule data

1. Get games from schedule
2. For each game
    a. get both teams
    b. get date & time
    c. get week
    d. get home and away ? not sure need this
3. For each game create database of tweets for each team
    a. build query
        i. Queries are always the same, can just create a dict of queries
    b. get start time (4 days before game?) and end time (1 hr before game)
    c. pagination function
        i. start time & end time
    d. create database of tweets

### Import Schedule CSV

In [7]:
# Columns holding timestamps, and the string format used for the search
# start_time / end_time values (e.g. 2022-11-24T15:25:00Z).
SCHEDULE_DATETIME_COLUMNS = ['GameTime', 'SearchStartTime', 'SearchEndTime']
TWITTER_TIME_FORMAT = '%Y-%m-%dT%H:%M:%SZ'

# `infer_datetime_format` was dropped from this call: it is deprecated in
# pandas 2.x and had no effect here because no `parse_dates` was requested.
sched_df = pd.read_csv('./2022_schedule_cleaned.csv')

# add scores for this week, eventually automatic
sched_df.at[15, 'home_score'] = 17.0
sched_df.at[15, 'away_score'] = 24.0

# Parse each datetime column, then render it back to a string in the
# Twitter request format.
# NOTE(review): the trailing 'Z' is a literal in the format string, so values are
# labelled as UTC without any conversion — confirm the CSV times are already UTC.
for column in SCHEDULE_DATETIME_COLUMNS:
    sched_df[column] = pd.to_datetime(sched_df[column]).dt.strftime(TWITTER_TIME_FORMAT)


In [8]:
# Preview the first rows to check the reformatted timestamp columns.
sched_df.head()


Unnamed: 0,Week,Day,Date,Time,home,away,home_score,away_score,GameTime,SearchStartTime,SearchEndTime
0,12,Thu,2022-11-24,12:30PM,lions,bills,25.0,28.0,2022-11-24T17:30:00Z,2022-11-20T16:30:00Z,2022-11-24T16:30:00Z
1,12,Thu,2022-11-24,4:30PM,cowboys,giants,28.0,20.0,2022-11-24T21:30:00Z,2022-11-20T20:30:00Z,2022-11-24T20:30:00Z
2,12,Thu,2022-11-24,8:20PM,vikings,patriots,33.0,26.0,2022-11-25T01:20:00Z,2022-11-21T00:20:00Z,2022-11-25T00:20:00Z
3,12,Sun,2022-11-27,1:00PM,titans,bengals,16.0,20.0,2022-11-27T18:00:00Z,2022-11-23T17:00:00Z,2022-11-27T17:00:00Z
4,12,Sun,2022-11-27,1:00PM,browns,buccaneers,23.0,17.0,2022-11-27T18:00:00Z,2022-11-23T17:00:00Z,2022-11-27T17:00:00Z


### Get Week 12 Games

In [90]:
# Keep only the Week 12 rows.
sched_df_wk12 = sched_df[sched_df['Week'] == 12]
# Drop the first three rows by position; in the preview output earlier in the
# notebook these are the three Thursday games.
# NOTE(review): this relies on row order — filtering on the 'Day' column would
# not depend on the Thursday games being listed first.
sched_df_wk12_no_thurs = sched_df_wk12.iloc[3:]


# Pagination

Requests limited to 100 tweets at a time

Must paginate requests using ```tweepy.Paginator```   
```limit```: sets how many pages  
```max_results```: sets how many results per page (limited to 100)  
Total Results = limit * max_results  

```start_time```: start of when to retrieve tweets  
```end_time```: end of when to retrieve tweets  
**Note**: times are in format of ```2022-11-24T15:25:00Z```  
This is ZULU time which is **7 hours ahead of MST**  


### Individual Team Paginator Search

In [121]:
#check start time within 7 days

def check_start_time_not_greater_7_days(start_time, now=None):
    """Report whether ``start_time`` is less than seven days old.

    Args:
        start_time: Timestamp string in ``%Y-%m-%dT%H:%M:%SZ`` form.
        now: Optional naive ``datetime`` to treat as the current UTC time.
            Defaults to the real current UTC time; passing a fixed value makes
            the check repeatable.

    Returns:
        True when ``start_time`` is later than ``now`` minus seven days,
        otherwise False.

    Raises:
        ValueError: if ``start_time`` does not match the expected format.
    """
    if now is None:
        # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
        # tzinfo so it can be compared with the naive value parsed below.
        now = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
    week_ago = now - dt.timedelta(days=7)
    print('1 week ago:', week_ago, '\t Start_time: ', start_time)
    check = week_ago < dt.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%SZ')
    if check:
        print('Start time allowed')
    else:
        print('Start time too old')
    return check

# Example call with a fixed timestamp; the result depends on the current time
# when the cell is run.
check_start_time_not_greater_7_days('2022-11-23T17:55:00Z')

1 week ago: 2022-11-23 18:53:42.984763 	 Start_time:  2022-11-23T17:55:00Z
Start time too old


False

In [122]:

# start_time=2019-01-01T17:00:00Z
# end_time=2020-12-12T01:00:00Z
#2022-11-23T20:05:00Z	

def paginate_search(team_query, max_results, start_time, end_time, limit=20):
    """Create a ``tweepy.Paginator`` over recent tweets matching ``team_query``.

    Args:
        team_query: Search query string.
        max_results: Results requested per page.
        start_time: Start of the search window, ``%Y-%m-%dT%H:%M:%SZ`` string.
        end_time: End of the search window, same format.
        limit: Maximum number of pages the paginator will fetch.

    Returns:
        The paginator, or ``None`` (after printing a message) when ``start_time``
        fails ``check_start_time_not_greater_7_days``. Callers must handle the
        ``None`` case before iterating.
    """
    if not check_start_time_not_greater_7_days(start_time):
        print('Start Time Not Valid')
        return None

    # Fixed: the first field previously contained stray backticks
    # ("author_i``d"), which is not a valid tweet field name.
    tweet_fields = ["author_id", "created_at", "text"]
    return tweepy.Paginator(
        client.search_recent_tweets,
        query=team_query,
        tweet_fields=tweet_fields,
        start_time=start_time,
        end_time=end_time,
        max_results=max_results,
        limit=limit,
    )

    

### Call Paginator Search from Game Schedule
Iterate through each game row  
Get all data needed  
Call paginate search for both home and away teams  
Add to dictionary

In [66]:
def create_week_paginations_from_schedule(wk_sched_df, team_queries, max_results = 100, limit = 50):
    """Create a paginator for the home and away team of every game row.

    Each row of ``wk_sched_df`` must expose ``Week``, ``SearchStartTime``,
    ``SearchEndTime``, ``home`` and ``away``. Both teams of a game share the
    row's search window.

    Returns a dict keyed ``wk_<week>_<team>`` whose values are
    ``{"paginator": <result of paginate_search>, "team": <team name>}``.
    """
    paginators = {}
    for game in wk_sched_df.itertuples():
        week = game.Week
        window_start = game.SearchStartTime
        window_end = game.SearchEndTime
        sides = (game.home, game.away)

        # Resolve both queries before starting any search, so an unknown team
        # fails before either paginator is created.
        side_queries = [team_queries[str(side)] for side in sides]
        searches = [
            paginate_search(side_query, max_results, window_start, window_end, limit)
            for side_query in side_queries
        ]

        keys = ['wk_' + str(week) + '_' + side for side in sides]
        for entry_key, side, search in zip(keys, sides, searches):
            paginators[entry_key] = {
                "paginator": search,
                "team": side
            }
    return paginators

### Get Paginator Dictionary for Week 12 games
Not including thursday night games

### Create CSV file for team
- Tweet Team
- Tweet Text
- Tweet Author
- Tweet Date Time

In [81]:
import csv
import tweepy

def create_dataset(paginator_dict, key):
    """Write every tweet from one team's paginator to ``team_data/<key>_tweets.csv``.

    Args:
        paginator_dict: Dict with a ``"team"`` name and a ``"paginator"`` — an
            iterable of pages, each with ``.data`` (tweets or ``None``) and ``.meta``.
        key: Identifier used in the output file name.

    The CSV has the columns ``team, timestamp, tweet_text, userid``. Newlines
    inside a tweet are replaced by spaces. If the paginator is ``None`` a
    message is printed and no file is written. The ``team_data`` directory must
    already exist.
    """
    team = paginator_dict["team"]
    paginator = paginator_dict["paginator"]

    # Bail out before touching the file system when there is nothing to iterate,
    # rather than leaving a header-only CSV and crashing in the loop.
    if paginator is None:
        print(f'NO PAGINATOR FOR {team} - skipping {key}')
        return

    # newline='' lets the csv module control line endings (avoids blank rows on
    # Windows); the file's utf-8 encoding handles non-ASCII tweet text.
    with open('team_data/%s_tweets.csv' % (key), 'w', encoding="utf-8", newline='') as file:
        w = csv.writer(file)

        # Write header row (feature column names of your choice)
        w.writerow(['team',
                    'timestamp',
                    'tweet_text',
                    'userid'
                    ])
        tweet_count = 0
        page_count = 0
        for page in paginator:
            page_count += 1
            if page.data is None:
                print(f'NO DATA FOR {team}')
                print(page.meta)
                continue
            # For each tweet matching hashtag, write relevant info to the spreadsheet
            for tweet in page.data:
                tweet_count += 1
                # Write the text as str: encoding it to bytes first made csv
                # store the repr (b'...') instead of the tweet text.
                w.writerow([team,
                            tweet.created_at,
                            tweet.text.replace('\n', ' '),
                            tweet.author_id,
                            ])
        print(f'Team: {team} \t\t Page Count: {page_count} \t Tweet Count: {tweet_count}')

In [31]:
def create_all_team_datasets(team_paginators):
    """Write one CSV per entry of ``team_paginators`` by calling ``create_dataset``.

    ``team_paginators`` maps an output key to the paginator dict for that key.
    """
    for dataset_key, team_entry in team_paginators.items():
        create_dataset(team_entry, dataset_key)

### Run Code to get all Datasets

Had some issues with requests tapping out at certain points.  
Had to break it up to sort it out

In [104]:
# Smaller batches of games, selected by row position, so the datasets can be
# fetched a few games at a time.
# NOTE(review): despite the "no_thurs" names these are sliced from sched_df_wk12
# (which still contains the Thursday rows), not from sched_df_wk12_no_thurs.
sched_df_wk12_no_thurs_2 = sched_df_wk12.iloc[7:9]
sched_df_wk12_no_thurs_3 = sched_df_wk12.iloc[11:12]
# sched_df_wk12_no_thurs_4 = sched_df_wk12.iloc[12:14]
# sched_df_wk12_no_thurs_5 = sched_df_wk12.iloc[14:16]
# Display the batch that is passed to the paginator cell.
sched_df_wk12_no_thurs_3

Unnamed: 0,Week,Day,Date,Time,home,away,home_score,away_score,GameTime,SearchStartTime,SearchEndTime
11,12,Sun,2022-11-27,4:05PM,cardinals,chargers,24.0,25.0,2022-11-27T21:05:00Z,2022-11-23T20:05:00Z,2022-11-27T20:05:00Z


In [103]:
# Build paginators for the selected games (100 results per page, default page
# limit of 50 per team) and write one CSV per team.
# NOTE(review): the saved output lists jets/bears, which does not match the
# cardinals/chargers row in sched_df_wk12_no_thurs_3 — presumably it is from an
# earlier run with a different slice.
paginators_wk12 = create_week_paginations_from_schedule(sched_df_wk12_no_thurs_3, team_queries, 100)
create_all_team_datasets(paginators_wk12)

Team: jets 		 Page Count: 34 	 Tweet Count: 3373
Team: bears 		 Page Count: 24 	 Tweet Count: 2345


## Need to filter out multiteam hashtags