# Download Tweets from Relevant Profiles

In [8]:
import pandas as pd
from tweetscrape.profile_tweets import TweetScrapperProfile 
import datetime

In [2]:
# Switch the kernel's working directory to the local 'data' folder: the relative paths
# used further down ('twitter_accounts.txt', 'twitter_data/...') are resolved from there.
# NOTE(review): re-running this cell in the same kernel session would look for 'data'
# inside 'data' — confirm it is only executed once per session.
%cd data

/Users/tilia/HZ/SmartCleanCityHackZurich19/data


In [11]:
# List the working directory to check that the inputs used below
# ('twitter_accounts.txt' and the 'twitter_data' folder) are present.
%ls

180_days_maxT_24hrPrecip.csv
180_days_mean1hrT_1hrPrecip.csv
180_days_meanT_24hrPrecip.csv
2019-09-27-basel-collections.csv
2019-09-27-basel-image-metadata.csv
2019-09-27-basel-measures-FEAT-TempPrecip.csv
2019-09-27-basel-measures-FEAT.csv
2019-09-27-basel-measures-cleaned.csv
2019-09-27-basel-measures-prediction-cleaned.csv
2019-09-27-basel-measures-prediction.csv
2019-09-27-basel-measures.csv
all_tweets.csv
all_tweets_relcols.csv
event_cal.csv
twitter.csv
twitter_accounts.txt
twitter_data/


In [3]:
# 'twitter_accounts.txt' holds one Twitter handle per line. Surrounding whitespace
# (including a '\r' left over from Windows line endings) is stripped and blank lines
# are skipped, so no empty or padded profile name reaches the download/read steps.
with open('twitter_accounts.txt', 'r') as ta_f:
    profiles = [line.strip() for line in ta_f if line.strip()]

profiles

['baselcommunity', 'BaselStadt', 'baseltourism', 'BVB_Leitstelle', 'jsdBS']

## Download the last n tweets from each relevant profile (CAREFUL: this overwrites any existing per-profile CSV file in `twitter_data/`!)

In [4]:
def get_tweets(user_profile, tnum):
    '''
    Scrape the latest tweets of a single profile and dump them to a csv file.

    user_profile: name of the profile handed to the scraper.
    tnum: number of tweets requested from the scraper.

    The dump is written to 'twitter_data/<user_profile>-<tnum>_tweets.csv', relative to
    the current working directory, so a 'twitter_data' directory is expected to exist there.
    Prints a one-line summary of what the scraper reports; returns nothing.
    '''
    target_csv = f'twitter_data/{user_profile}-{tnum}_tweets.csv'
    scraper = TweetScrapperProfile(user_profile, tnum, target_csv, 'csv')

    # The scraper hands back a 4-tuple; the second element (a tweet id) is not used here,
    # and the reported path (not the requested one) is what gets printed.
    n_extracted, _unused_id, last_time, written_path = scraper.get_profile_tweets()

    summary = "Extracted {0} tweets till {1} at {2} for user {3}"
    print(summary.format(n_extracted, last_time, written_path, user_profile))
    
    
# Request the 500 most recent tweets for every profile read from the accounts file.
for p in profiles:
    get_tweets(user_profile=p, tnum=500)

Extracted 501 tweets till 2018-05-28 at twitter_data/baselcommunity-500_tweets2.csv for user baselcommunity
Extracted 500 tweets till 2019-01-23 at twitter_data/BaselStadt-500_tweets2.csv for user BaselStadt
Extracted 500 tweets till 2018-09-19 at twitter_data/baseltourism-500_tweets2.csv for user baseltourism
Extracted 500 tweets till 2019-04-01 at twitter_data/BVB_Leitstelle-500_tweets2.csv for user BVB_Leitstelle
Extracted 500 tweets till 2016-04-05 at twitter_data/jsdBS-500_tweets2.csv for user jsdBS


## Add date columns (`date`, plus `day_date` formatted as `%Y-%m-%d`) and concatenate all per-profile DataFrames into one global DataFrame

In [5]:
def get_daydate(date_str):
    '''
    Returns shortened date string in the format %Y-%m-%d.

    date_str: timestamp string formatted as '%Y-%m-%d %H:%M:%S', optionally followed by
        fractional seconds ('.ffffff'), which is how a datetime with a non-zero
        sub-second part is rendered by str().

    Raises ValueError if date_str matches neither layout.
    '''
    # Try the plain layout first, then the one with fractional seconds.
    for fmt in ('%Y-%m-%d %H:%M:%S', '%Y-%m-%d %H:%M:%S.%f'):
        try:
            parsed = datetime.datetime.strptime(date_str, fmt)
        except ValueError:
            continue
        return parsed.strftime('%Y-%m-%d')

    raise ValueError(
        f"time data {date_str!r} does not match '%Y-%m-%d %H:%M:%S' "
        "(with optional fractional seconds)"
    )

In [12]:
# Load each profile's scraped csv, derive the date columns, and stack all profiles
# into one DataFrame that is then written back to disk.
# NOTE(review): get_tweets() writes 'twitter_data/<profile>-<tnum>_tweets.csv' while this
# cell reads 'twitter_data/<profile>.csv' — confirm the files are renamed/copied in between.
frames = []

for user_profile in profiles:

    tw = pd.read_csv(f'twitter_data/{user_profile}.csv')
    # 'time' is interpreted as a Unix timestamp in milliseconds; fromtimestamp() yields
    # a naive datetime in the local timezone of the machine running the notebook.
    tw['date'] = tw['time'].apply(lambda x: datetime.datetime.fromtimestamp(int(x)/1000))
    tw['day_date'] = tw['date'].apply(lambda x: get_daydate(str(x)))
    frames.append(tw)

# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on every
# iteration; concatenate once instead. An empty profile list still gives an empty frame.
all_tweets = pd.concat(frames) if frames else pd.DataFrame()

all_tweets.to_csv('twitter_data/all_tweets.csv', index=False, header=True)

## Write out another csv file which only contains columns corresponding to (possibly) relevant features

In [14]:
# Keep only the columns considered (possibly) relevant as features and save them separately.
relevant_columns = [
    'id',
    'author',
    're_tweeter',
    'text',
    'hashtags',
    'mentions',
    'favorite_count',
    'day_date',
]
# .copy() detaches the selection from all_tweets so later edits do not touch the full frame.
all_tweets_relcols = all_tweets.loc[:, relevant_columns].copy()

all_tweets_relcols.to_csv('twitter_data/all_tweets_relcols.csv', index=False, header=True)