In [1]:
import tweepy
import constants
import pandas as pd
import datetime as dt

### Import Keys & Create Tweepy Client

In [2]:
bearer_token = constants.TWITTER_BEARER_TOKEN

client = tweepy.Client(bearer_token)


In [3]:
from team_queries import team_queries
team_queries[str('bills')]

'(#billsmafia OR #gobills OR #bills) -is:retweet lang:en -#dirtybirds -#falcons -#birdcityfootball -#cardinals -#ravensflock -#ravens -#keeppounding -#panthers -#dabears -#bearsnation -#bears -#rulethejungle -#bengals -#whodey -#browns -#clevelandbrowns  -#dawgpound -#dallascowboys -#cowboys -#broncoscountry -#letsride -#broncos -#onepride -#lions  -#gopackgo -#packers -#wearetexans -#texans -#fortheshoe -#colts -#duuuval -#jags -#jaguars -#chiefskingdom -#chiefs'

#### Individual Team Search - No Pagination
Can use for testing

In [4]:
# build search tweets 
# will use paginate normally, test case

# max_results min = 10
def build_team_search(team_query, max_results = 10):
    team_response = client.search_recent_tweets(
            team_query, 
            max_results=max_results, 
            tweet_fields=["author_id", "created_at", "text"], 
        )
    return team_response

bills_response = build_team_search(team_queries[str('bills')], max_results=10)
bills_response.data

[<Tweet id=1605308830803378176 text='@BuffaloBillsBiz @BuffaloBills @EnergyMark I don’t like the red! I like blue jersey/white pants but the red helps in the snow 👍🏼 #GoBills'>,
 <Tweet id=1605308623055470592 text='@stefondiggs I lost my dad when I was 12, I am so grateful that you’re doing this for him. Nothing will ever ease the pain, but you’re making the most out of a bad situation. I would love for #billsmafia to donate a memorial for his father at the new stadium, I’d gladly contribute'>,
 <Tweet id=1605308562917539845 text='Washington Has 1 Heck of a Back Field !! #BillsForum #BillsOpinion #BillsFans #BillsMafia #BillsFan https://t.co/U2fPNSFrpk'>,
 <Tweet id=1605308428171112448 text='Ravens just claimed Sammy Watkins.\nFrom what I read the last two days  #BillsMafia is more than happy with that news😂😂'>,
 <Tweet id=1605308422219612161 text="It's December 20th, and #Bills fans are STILL complaining about how hot it was in Miami in September. \n\nlol">,
 <Tweet id=160530840894882

### Get times and games from schedule data

1. Get games from schedule
2. For each game
    a. get both teams
    b. get date & time
    c. get week
    d. get home and away ? not sure need this
3. For each game create database of tweets for each team
    a. build query
        i. Queries are always the same, can just create a dict of queries
    b. get start time (4 days before game?) and end time (1 hr before game)
    b. pagination function
        ii. start time & end time
    b. create database of tweets

### Import Schedule CSV

In [5]:

def import_schedule_csv(filepath):
    sched_df = pd.read_csv(filepath, infer_datetime_format=True)

    # convert all datetime fields to datetime type
    sched_df['GameTime'] = pd.to_datetime(sched_df['GameTime'])
    sched_df['SearchStartTime'] = pd.to_datetime(sched_df['SearchStartTime'])
    sched_df['SearchEndTime'] = pd.to_datetime(sched_df['SearchEndTime'])

    # convert datetime to twitter appropriate format for datetime requests
    sched_df['GameTime'] = sched_df['GameTime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    sched_df['SearchStartTime'] = sched_df['SearchStartTime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')
    sched_df['SearchEndTime'] = sched_df['SearchEndTime'].dt.strftime('%Y-%m-%dT%H:%M:%SZ')

    return sched_df


# Pagination

Requests limited to 100 tweets at a time

Must paginate requests using ```tweepy.Paginator```   
```limit```: sets how many pages  
```max_results```: sets how many results per page (limited to 100)  
Total Results = limit * max_results  

```start_time```: start of when to retrieve tweets  
```end_time```: end of when to retrieve tweets  
**Note**: times are in format of ```2022-11-24T15:25:00Z```  
This is ZULU time which is **7 hours ahead of MST**  


### Individual Team Paginator Search

In [6]:
#check start time within 7 days

def check_start_time_not_greater_7_days(start_time):
    week_ago = dt.datetime.utcnow() - pd.to_timedelta('7 days')
    # print('1 week ago:', week_ago, '\t Start_time: ', start_time)
    check = week_ago < dt.datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%SZ')
    # if (check == False):
    #     print('Start time too old')
    # else:
    #     print('Start time allowed')
    return check

check_start_time_not_greater_7_days('2022-12-05T00:15:00Z')

False

In [7]:

# start_time=2019-01-01T17:00:00Z
# end_time=2020-12-12T01:00:00Z
#2022-11-23T20:05:00Z	

def paginate_search(team_query, max_results, start_time, end_time, limit=20):
    start_valid = check_start_time_not_greater_7_days(start_time)
    tweet_fields = ["author_id", "created_at", "text"]
    
    if (start_valid == True):
        paginator = tweepy.Paginator(
            client.search_recent_tweets, 
            query = team_query, 
            tweet_fields = tweet_fields, 
            start_time=start_time, 
            end_time=end_time,
            max_results = max_results,
            limit = limit
        )
        return paginator
    else:
        # run query with start time being one week ago + 1 minute
        # otherwise time becomes invalid by time last pages come through, need the buffer
        start_time = dt.datetime.utcnow() - pd.to_timedelta('7 days') + pd.to_timedelta('1 minute')
        print('Start Time Not Valid', team_query[2:30], '\n New Sart Time: ', start_time)
        paginator = tweepy.Paginator(
            client.search_recent_tweets, 
            query = team_query, 
            tweet_fields = tweet_fields, 
            start_time=start_time, 
            end_time=end_time,
            max_results = max_results,
            limit = limit
        )
        return paginator

    

### Call Paginator Search from Game Schedule
Iterate through each game row  
Get all data needed  
Call paginae search for both home and away teams  
Add to dictionary

In [8]:
def get_paginator_dict_key(wk, team):
    team_key = 'wk_' + str(wk) + '_' + team
    return team_key

def create_week_paginations_from_schedule(wk_sched_df, team_queries, max_results = 100, limit = 80):
    # create empty paginators dict
    paginators = {}

    # iterate through each game row
    for game_row in wk_sched_df.itertuples():
        wk = game_row.Week

        # get search start and end times
        
        start_time = game_row.SearchStartTime
        end_time = game_row.SearchEndTime
        
        #get home and away teams
        home_team = game_row.home
        away_team = game_row.away

        # get team queries
        home_team_query = team_queries[str(home_team)]
        away_team_query = team_queries[str(away_team)]

        # create team paginators for home and away
        home_team_paginator = paginate_search(
            home_team_query, 
            max_results, 
            start_time, 
            end_time,
            limit
        )
        away_team_paginator = paginate_search(
            away_team_query, 
            max_results, 
            start_time, 
            end_time,
            limit
        )

        # create dict key from         
        home_key = get_paginator_dict_key(wk, home_team)
        away_key = get_paginator_dict_key(wk, away_team)

        # add paginators to dict
        paginators[home_key] = {
            "paginator": home_team_paginator,
            "team": home_team
        }
        paginators[away_key] = {
            "paginator": away_team_paginator,
            "team": away_team
        }
        
    return paginators

### Create CSV file for team
- Tweet Team
- Tweet Text
- Tweet Author
- Tweet Date Time

In [9]:
import csv
import tweepy

def create_tweet_save_path (week, key):
    return  f'./team_data/wk_{week}/{key}_tweets.csv'


# 'team_data/%s_tweets.csv' % (key)
def create_dataset(paginator_dict, key, week_num):\
    # get team and paginator
    team = paginator_dict[str("team")]
    paginator = paginator_dict[str("paginator")]

    # create save path for tweet csv's
    file_save_path = create_tweet_save_path(week_num, key)
    
    with open(file_save_path, 'w', encoding="utf-8") as file:
        w = csv.writer(file)
        
        # Write header row (feature column names of your choice)
        w.writerow(['team',
                     'timestamp', 
                     'tweet_text', 
                     'userid' 
                     ])
        # initiate tweet and page counts
        tweet_count = 0
        page_count = 0
        
        # iterate through pages of tweets
        for page in paginator: 
            page_count += 1

            # if no data found record and skip
            if (page.data == None):
                print(f'NO DATA FOR {team}')
                print(page.meta)
            
            else:
                # For each tweet add to csv file
                for tweet in page.data:
                    tweet_count += 1
                
                    w.writerow([team,
                                tweet.created_at, 
                                tweet.text.replace('\n',' ').encode('utf-8'), 
                                tweet.author_id, 
                                ])
        # record data
        print(f'Team: {team} \t\t Page Count: {page_count} \t Tweet Count: {tweet_count}')

In [10]:
def create_all_team_datasets(team_paginators, week_num):
    # iterate through team paginator dict
    for key, paginator_dict in team_paginators.items():
        # print('KEY:%s - PAGINATOR: %s ' %(key, paginator_dict))
        # pass each to create_dataset
        create_dataset(paginator_dict, key, week_num)

### Run Code to get all Datasets

1. create new filepath following template for new schedule
2. import schedule with the filepath and save df
3. Select which lines to run the code, might be only have certain games ```sched_14_df[4:9]``` etc.  
4. set `week_num = ##`  
5. set the dataframe parameter in the ```create_week_paginations_from_schedule```
6. Run Code  
7. commit to github so that have data if it gets lost

In [15]:
filepath_13 = './schedules/schedule_wk_13_cleaned.csv'
filepath_14 = './schedules/schedule_wk_14_cleaned.csv'
filepath_15 = './schedules/schedule_wk_15_cleaned.csv'
filepath_16 = './schedules/schedule_wk_16_cleaned.csv'
filepath_17 = './schedules/schedule_wk_17_cleaned.csv'

sched_15_df = import_schedule_csv(filepath_15)
sched_15_df_sun_mon = sched_15_df[13:]
sched_15_df_sun_mon

Unnamed: 0,Week,Day,Date,Time,home,away,home_score,away_score,GameTime,SearchStartTime,SearchEndTime
13,15,Sun,2022-12-18,4:25 PM,chargers,titans,17,14,2022-12-18T21:25:00Z,2022-12-14T20:25:00Z,2022-12-18T20:25:00Z
14,15,Sun,2022-12-18,8:20 PM,raiders,patriots,30,24,2022-12-19T01:20:00Z,2022-12-15T00:20:00Z,2022-12-19T00:20:00Z
15,15,Mon,2022-12-19,8:15 PM,packers,rams,24,12,2022-12-20T01:15:00Z,2022-12-16T00:15:00Z,2022-12-20T00:15:00Z


In [17]:
week_num = '15'
paginators_wk15 = create_week_paginations_from_schedule(sched_15_df_sun_mon, team_queries, 100)
create_all_team_datasets(paginators_wk15, week_num)

TooManyRequests: 429 Too Many Requests
Too Many Requests

## Need to filter out multiteam hashtags