# Description
This file can be used for scraping tweets using the Twitter API. To run it, you first have to create an account on https://apps.twitter.com/. After that, create a .env file in the main directory of the project and save your private API credentials in it: CONSUMER_KEY, CONSUMER_SECRET, ACCESS_KEY, ACCESS_SECRET.

WARNING:
You can only get tweets from the last 7 days at most!

In [1]:
# Run this cell just once! (or restart Kernel before second time)
# The chdir below is relative to the current working directory, so every extra
# run moves the notebook one more directory level up.
# NOTE(review): presumably this moves from the notebook folder to the project
# root so that later imports and paths resolve from there - confirm.

import os

os.chdir('..')


In [2]:
import csv
import datetime
import sys

import pandas as pd
import pandas as pd
import tweepy
from dotenv import load_dotenv

from utils.fixed import ENV_PATH, MAIN_PATH, DATA_PATH, load_match_data, MATCH_DATA


In [3]:
def to_short_df(tweets, hashtag_list, teams):
    """Build the compact five-column tweet table used by this project.

    Columns, in order: tweetID, tweetText, tweetCreated, hashtag, team.
    ``hashtag_list`` and ``teams`` supply one value per tweet, in the same
    order as ``tweets``.
    """
    # (output column, attribute read from each tweet), in output order.
    tweet_fields = (
        ('tweetID', 'id'),
        ('tweetText', 'text'),
        ('tweetCreated', 'created_at'),
    )
    frame = pd.DataFrame()
    for column, attribute in tweet_fields:
        frame[column] = [getattr(tweet, attribute) for tweet in tweets]
    frame['hashtag'] = list(hashtag_list)
    frame['team'] = list(teams)
    return frame

def to_long_df(tweets, hashtag_list, teams):
    """Build the extended tweet table with extra tweet and author fields.

    Besides the columns of the short table, this keeps retweet/favorite
    counts, the tweet source and a set of attributes read from
    ``tweet.user``, which can be used in further analyses. ``hashtag_list``
    and ``teams`` supply one value per tweet, in the same order as ``tweets``.
    """
    def read_path(obj, dotted):
        # Follow a dotted attribute path such as 'user.id' starting at obj.
        for part in dotted.split('.'):
            obj = getattr(obj, part)
        return obj

    # (output column, attribute path on the tweet object), in output order.
    column_sources = (
        ('tweetID', 'id'),
        ('tweetText', 'text'),
        ('tweetCreated', 'created_at'),
        ('tweetRetweetCt', 'retweet_count'),
        ('tweetFavoriteCt', 'favorite_count'),
        ('tweetSource', 'source'),
        ('userID', 'user.id'),
        ('userScreen', 'user.screen_name'),
        ('userName', 'user.name'),
        ('userCreateDt', 'user.created_at'),
        ('userDesc', 'user.description'),
        ('userFollowerCt', 'user.followers_count'),
        ('userFriendsCt', 'user.friends_count'),
        ('userLocation', 'user.location'),
        ('userTimezone', 'user.time_zone'),
    )
    frame = pd.DataFrame()
    for column, dotted in column_sources:
        frame[column] = [read_path(tweet, dotted) for tweet in tweets]
    frame['hashtag'] = list(hashtag_list)
    frame['team'] = list(teams)
    return frame

def get_tweets(team, hashtag, day, month, year, hour, mins):
    """Collect tweets matching ``hashtag`` from the 150 minutes after a start time.

    A tweet is kept when ``start < tweet.created_at < start + 150 minutes``
    (both bounds exclusive). Searches go through the module-level ``api``
    client and walk backwards in time, page by page, until a page reaches
    the start time or no more results are returned.

    Args:
        team: Label stored alongside every collected tweet.
        hashtag: Query string passed to the search endpoint.
        day, month, year, hour, mins: Components of the window start.
            NOTE(review): the resulting naive datetime is compared directly
            with ``tweet.created_at``, so it presumably has to be expressed
            in the same timezone as the API timestamps - confirm against
            the match data.

    Returns:
        A ``(tweets, hashtag_list, teams)`` tuple of three equally long
        lists: the matching tweet objects, and ``hashtag`` / ``team``
        repeated once per collected tweet.
    """
    start_date = datetime.datetime(year, month, day, hour, mins, 0)
    end_date = start_date + datetime.timedelta(minutes=150)
    since_param = start_date.strftime('%Y-%m-%d')
    # `until` is derived from the END of the window so that a window crossing
    # midnight is not cut off at the end of the start day.
    until_param = (end_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')

    tweets = []
    hashtag_list = []
    teams = []

    page = api.search(q=hashtag, lang="en", since=since_param, until=until_param, count=100)
    if not page:
        print("No tweets for hashtag: " + hashtag)
        return tweets, hashtag_list, teams

    while page:
        # Filter every page, including the first one, before requesting the next.
        for tweet in page:
            if start_date < tweet.created_at < end_date:
                tweets.append(tweet)
                hashtag_list.append(hashtag)
                teams.append(team)

        # Lowest id on the page = oldest tweet, regardless of result ordering.
        oldest = min(page, key=lambda tweet: tweet.id)
        if oldest.created_at <= start_date:
            break

        try:
            # max_id is inclusive; subtract 1 so the boundary tweet is not
            # returned again (which caused duplicates and, once no older
            # tweets exist, an endless loop). An empty page ends the loop.
            page = api.search(q=hashtag, lang="en", max_id=oldest.id - 1, count=100)
        except tweepy.TweepError as error:
            # Best effort: keep what was collected so far instead of failing.
            print("Stopped paging for hashtag " + hashtag + ": " + str(error))
            break

    return tweets, hashtag_list, teams

def tweets_to_csv(hashtag_dict, date, file_name):
    """Fetch tweets for every hashtag of every team and write them to a CSV file.

    ``hashtag_dict`` maps a team label to its hashtags; ``date`` supplies the
    day/month/year/hour/minute passed to ``get_tweets``. The combined result
    is converted with ``to_short_df`` and saved to ``file_name`` as a
    ';'-separated, UTF-8 encoded file.
    """
    all_tweets, all_hashtags, all_teams = [], [], []
    for team_name, team_hashtags in hashtag_dict.items():
        for tag in team_hashtags:
            print('starting ', tag)
            found_tweets, found_hashtags, found_teams = get_tweets(
                team_name, tag, date.day, date.month, date.year, date.hour, date.minute)
            all_tweets.extend(found_tweets)
            all_hashtags.extend(found_hashtags)
            all_teams.extend(found_teams)
    short_frame = to_short_df(all_tweets, all_hashtags, all_teams)
    short_frame.to_csv(file_name, sep=';', encoding='utf-8')


### LOAD TWITTER API CREDENTIALS FROM .ENV FILE

In [4]:
# Read the .env file at ENV_PATH, then pull the four Twitter credentials
# out of the process environment (a missing variable yields None).
load_dotenv(dotenv_path=ENV_PATH)

consumer_key, consumer_secret, access_key, access_secret = (
    os.getenv(variable)
    for variable in ("CONSUMER_KEY", "CONSUMER_SECRET", "ACCESS_KEY", "ACCESS_SECRET")
)


### CONNECT WITH TWITTER API

In [5]:
# Build the OAuth handler from the consumer credentials, attach the access token,
# and create the module-level `api` client that get_tweets() uses for its searches.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
# wait_on_rate_limit=True: per the tweepy docs the client waits for rate limits
# to replenish instead of raising.
api = tweepy.API(auth, wait_on_rate_limit=True)


### DEFINE THE MATCH FOR WHICH YOU WANT TO SCRAPE TWEETS

In [7]:
# Match identifier passed to load_match_data() below.
MATCH = "SOU-CHE"

### LOAD MATCH DATA

In [8]:
# Unpack the nine values describing the selected match.
(
    file_name, team1, team2,
    match_start, first_part_end, second_part_start, match_end,
    hashtags_team1, hashtags_team2,
) = load_match_data(MATCH)

# Team label -> list of hashtags to search for.
match_hashtags = dict(zip((team1, team2), (hashtags_team1, hashtags_team2)))

# Scraping starts 30 minutes before the match start.
scrapping_start_date = (
    datetime.datetime.strptime(match_start, "%Y-%m-%d %H:%M:%S")
    - datetime.timedelta(minutes=30)
)

# Destination of the CSV written by tweets_to_csv().
TWEETS_CSV_PATH = os.path.join(MAIN_PATH, DATA_PATH, file_name)


#### BEFORE YOU RUN THE TWEETS_TO_CSV FUNCTION, CHECK THAT THE DATA IS CORRECTLY LOADED

In [9]:
# Bare expression: the notebook displays this tuple as the cell output, so the
# loaded match data can be checked by eye before scraping.
file_name, team1, team2, match_start, first_part_end, second_part_start, match_end, hashtags_team1, hashtags_team2


('SOU_CHE.csv',
 'Southampton',
 'Chelsea',
 '2018-10-07 13:15:00',
 '2018-10-07 14:02:00',
 '2018-10-07 14:17:00',
 '2018-10-07 15:06:00',
 ['#saintsfc', '#southamptonfc', '#wemarchon'],
 ['#chelseafc',
  '#chelsea',
  '#cfc',
  '#cfcfamily',
  '#cfcfans',
  '#chelseafans',
  '#coyb',
  '#comeonyoublues',
  '#theblues',
  '#blueisthecolour'])

### GET TWEETS AND SAVE TO CSV
##### WARNING: With a basic Twitter API account you can only get tweets that are at most one week old!

In [10]:
# Scrape every hashtag of both teams, starting 30 minutes before the match start,
# and write the result to TWEETS_CSV_PATH. Prints one "starting" line per hashtag.
tweets_to_csv(match_hashtags, scrapping_start_date, TWEETS_CSV_PATH)


starting  #saintsfc
starting  #southamptonfc
starting  #wemarchon
starting  #chelseafc
starting  #chelsea
starting  #cfc
starting  #cfcfamily
starting  #cfcfans
starting  #chelseafans
starting  #coyb
starting  #comeonyoublues
starting  #theblues
starting  #blueisthecolour
