# Updating Data Extraction

- 07/11/20
This notebook is for exploring alternative (programmatic) methods to get all of the pertinent tweets from Donald Trump

In [None]:
!pip install -U fsds
from fsds.imports import *

# Getting Stock Data

In [1]:
# ## Attempt to Get Stock Data With Code

import requests
import pandas as pd

def download_stock_data(fpath='data/ive_minute_tick_bidask_API.csv',
                       verbose=True):
    """Downloads up-to-date IVE S&P 500 1-min aggregate data from 
    http://www.kibot.com/free_historical_data.aspx
    
    Args:
        fpath (str): csv filepath to save (Default='data/ive_minute_tick_bidask_API.csv')
        verbose (bool): Display file info (Default=True)
        
    Returns:
        stock_df: DataFrame with correct headers and datetime index"""
    agg_url = 'http://api.kibot.com/?action=history&symbol=IVE&interval=tickbidask1&bp=1&user=guest'
    response = requests.get(agg_url,
                            allow_redirects=True)

    ## Save output to csv file
    with open(fpath,'wb') as file:
        file.write(response.content)
        
        
    ## Load in Stock Data Frame with headers (then save)
    headers = ['Date','Time','BidOpen','BidHigh','BidLow','BidClose','AskOpen','AskHigh','AskLow','AskClose']
    stock_df = pd.read_csv('data/ive_minute_tick_bidask_API.csv',names=headers)

# 
    ## Make Combined Date Time column and Drop Origs
    stock_df['datetime'] = pd.to_datetime(stock_df['Date'].astype(str)+' '+stock_df['Time'].astype(str))
    stock_df.to_csv(fpath,index=False)
        
    if verbose:
        print('[i] Data successfully downloaded and saved as:')
        print(' - ',fpath)
        
    return pd.read_csv(fpath,parse_dates=['datetime'],index_col='datetime')

stock_df = download_stock_data()

[i] Data successfully downloaded and saved as:
 -  data/ive_minute_tick_bidask_API.csv


In [2]:
stock_df.head()

Unnamed: 0_level_0,Unnamed: 0,Date,Time,BidOpen,BidHigh,BidLow,BidClose,AskOpen,AskHigh,AskLow,AskClose
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2009-09-28 09:30:00,0,09/28/2009,09:30,39.35,39.35,39.35,39.35,39.42,39.42,39.42,39.42
2009-09-28 09:31:00,1,09/28/2009,09:31,39.38,39.38,39.36,39.38,39.39,39.39,39.38,39.39
2009-09-28 09:32:00,2,09/28/2009,09:32,39.39,39.43,39.39,39.43,39.41,39.45,39.41,39.45
2009-09-28 09:33:00,3,09/28/2009,09:33,39.42,39.42,39.42,39.42,39.43,39.45,39.43,39.44
2009-09-28 09:34:00,4,09/28/2009,09:34,39.42,39.42,39.41,39.41,39.44,39.44,39.42,39.42


## TWINT
- ABANDONED (for now. Search errors/always run)

In [11]:
# # !pip install -U twint
# !pip install -U -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint

1. https://github.com/twintproject/twint
2. https://github.com/twintproject/twint/wiki/Scraping-functions

In [12]:
# import twint

# c = twint.Config()
# c.Limit = 20
# c.Since="2017-01-01"
# c.Username = 'realDonaldTrump'
# c.Pandas = True
# c.Store_csv = True
# c.Output='_twint_extracted_tweets.csv'

In [13]:
# twint.run.Profile(c)

In [14]:
# twint.run.Search(c)

# Tweets_df = twint.storage.panda.Tweets_df

## Trump Twitter Archive Download

- https://github.com/bpb27/trump_tweet_data_archive/archive/master.zip

- ABANDONED (No tweets from 2019-2020)

In [53]:
import os,glob,shutil
tweet_folder = 'Trump Tweets/'
os.makedirs(tweet_folder,exist_ok=True)

In [54]:
import requests

url = 'https://github.com/bpb27/trump_tweet_data_archive/archive/master.zip'
r = requests.get(url, allow_redirects=True)

tweet_zipfile = tweet_folder+'archive.zip'
with open(tweet_zipfile, 'wb') as file:
    file.write(r.content)

os.listdir(tweet_folder)

['.DS_Store', 'archive.zip']

In [93]:
from  zipfile import ZipFile
zipfile= ZipFile(tweet_zipfile,'r')
# zipfile.printdir()
file_list = zipfile.filelist
file_list
#     zipfile.extractall('Trump Tweets/')

[<ZipInfo filename='trump_tweet_data_archive-master/' external_attr=0x10>,
 <ZipInfo filename='trump_tweet_data_archive-master/.gitignore' file_size=9>,
 <ZipInfo filename='trump_tweet_data_archive-master/README.md' compress_type=deflate file_size=685 compress_size=411>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2009.json.zip' compress_type=deflate file_size=4544 compress_size=4539>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2010.json.zip' compress_type=deflate file_size=11162 compress_size=11155>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2011.json.zip' file_size=58511>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2012.json.zip' file_size=273295>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2013.json.zip' file_size=600669>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2014.json.zip' file_size=464484>,
 <ZipInfo filename='trump_tweet_data_archive-master/condensed_2015.json.z

In [94]:
## Extract All Non-Git Files
non_git_files = [file for file in file_list if '.git' not in file.filename]
for file in non_git_files:
    zipfile.extract(file,path=tweet_folder)


In [95]:
import glob
extracted_zipfiles = glob.glob(tweet_folder+'trump_tweet_data_archive-master/*.json.zip')
extracted_zipfiles

['Trump Tweets/trump_tweet_data_archive-master/master_2018.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2018.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2012.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2013.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2012.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2013.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2009.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2009.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2015.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2014.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2015.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2014.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2011.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/master_2010.json.zip',
 '

In [96]:
master_files = list(filter(lambda x: 'master_' in x, extracted_zipfiles))
condensed_files = list(filter(lambda x: 'condensed' in x, extracted_zipfiles))
condensed_files

['Trump Tweets/trump_tweet_data_archive-master/condensed_2018.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2012.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2013.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2009.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2015.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2014.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2011.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2010.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2016.json.zip',
 'Trump Tweets/trump_tweet_data_archive-master/condensed_2017.json.zip']

In [99]:
import pandas as pd


df = pd.concat([pd.read_json(zfile) for zfile in condensed_files[:2]])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7041 entries, 0 to 3530
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype              
---  ------                   --------------  -----              
 0   source                   7041 non-null   object             
 1   id_str                   7041 non-null   int64              
 2   text                     7041 non-null   object             
 3   created_at               7041 non-null   datetime64[ns, UTC]
 4   retweet_count            7041 non-null   int64              
 5   in_reply_to_user_id_str  444 non-null    float64            
 6   favorite_count           7041 non-null   int64              
 7   is_retweet               7041 non-null   bool               
dtypes: bool(1), datetime64[ns, UTC](1), float64(1), int64(3), object(2)
memory usage: 446.9+ KB


In [100]:
df['index'] = pd.to_datetime(df['created_at'])
df

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet,index
0,Twitter for iPhone,1079888205351145472,HAPPY NEW YEAR! https://t.co/bHoPDPQ7G6,2018-12-31 23:53:06+00:00,33548,,136012,False,2018-12-31 23:53:06+00:00
1,Twitter for iPhone,1079830268708556800,"....Senator Schumer, more than a year longer t...",2018-12-31 20:02:52+00:00,17456,25073877.0,65069,False,2018-12-31 20:02:52+00:00
2,Twitter for iPhone,1079830267274108928,Heads of countries are calling wanting to know...,2018-12-31 20:02:52+00:00,21030,,76721,False,2018-12-31 20:02:52+00:00
3,Twitter for iPhone,1079763923845419008,It’s incredible how Democrats can all use thei...,2018-12-31 15:39:15+00:00,29610,,127485,False,2018-12-31 15:39:15+00:00
4,Twitter for iPhone,1079763419908243456,"I’m in the Oval Office. Democrats, come back f...",2018-12-31 15:37:14+00:00,30957,,132439,False,2018-12-31 15:37:14+00:00
...,...,...,...,...,...,...,...,...,...
3526,Twitter Web Client,154288899324260352,"""The Wall Street Journal has reported that Oba...",2012-01-03 19:51:54+00:00,75,,8,False,2012-01-03 19:51:54+00:00
3527,Twitter Web Client,154284061995696128,"""The Democrats' solution is the same solution ...",2012-01-03 19:32:41+00:00,142,,10,False,2012-01-03 19:32:41+00:00
3528,Twitter Web Client,154282197371731968,I will be on the @todayshow tomorrow morning t...,2012-01-03 19:25:16+00:00,58,,7,False,2012-01-03 19:25:16+00:00
3529,Twitter Web Client,154279702578728960,My @FoxNews interview with @gretawire discussi...,2012-01-03 19:15:22+00:00,24,,3,False,2012-01-03 19:15:22+00:00


In [102]:
# df.to_csv(tweet_folder+"trump_twitter_archive_extract.csv",index=False)

In [104]:
df.sort_values('created_at')

Unnamed: 0,source,id_str,text,created_at,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet,index
3530,Twitter Web Client,154270999859298304,"My @foxandfriends interview discussing the ""Ma...",2012-01-03 18:40:47+00:00,19,,12,False,2012-01-03 18:40:47+00:00
3529,Twitter Web Client,154279702578728960,My @FoxNews interview with @gretawire discussi...,2012-01-03 19:15:22+00:00,24,,3,False,2012-01-03 19:15:22+00:00
3528,Twitter Web Client,154282197371731968,I will be on the @todayshow tomorrow morning t...,2012-01-03 19:25:16+00:00,58,,7,False,2012-01-03 19:25:16+00:00
3527,Twitter Web Client,154284061995696128,"""The Democrats' solution is the same solution ...",2012-01-03 19:32:41+00:00,142,,10,False,2012-01-03 19:32:41+00:00
3526,Twitter Web Client,154288899324260352,"""The Wall Street Journal has reported that Oba...",2012-01-03 19:51:54+00:00,75,,8,False,2012-01-03 19:51:54+00:00
...,...,...,...,...,...,...,...,...,...
4,Twitter for iPhone,1079763419908243456,"I’m in the Oval Office. Democrats, come back f...",2018-12-31 15:37:14+00:00,30957,,132439,False,2018-12-31 15:37:14+00:00
3,Twitter for iPhone,1079763923845419008,It’s incredible how Democrats can all use thei...,2018-12-31 15:39:15+00:00,29610,,127485,False,2018-12-31 15:39:15+00:00
2,Twitter for iPhone,1079830267274108928,Heads of countries are calling wanting to know...,2018-12-31 20:02:52+00:00,21030,,76721,False,2018-12-31 20:02:52+00:00
1,Twitter for iPhone,1079830268708556800,"....Senator Schumer, more than a year longer t...",2018-12-31 20:02:52+00:00,17456,25073877.0,65069,False,2018-12-31 20:02:52+00:00


In [88]:
df = pd.read_csv(tweet_folder+'trump_twitter_archive_extract.csv',
                 parse_dates=['created_at'],index_col='created_at')

  interactivity=interactivity, compiler=compiler, result=result)


In [89]:
df

Unnamed: 0_level_0,source,id_str,text,retweet_count,in_reply_to_user_id_str,favorite_count,is_retweet
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-12-31 23:53:06+00:00,Twitter for iPhone,1079888205351145472,HAPPY NEW YEAR! https://t.co/bHoPDPQ7G6,33548.0,,136012.0,False
2018-12-31 20:02:52+00:00,Twitter for iPhone,1079830268708556800,"....Senator Schumer, more than a year longer t...",17456.0,25073877.0,65069.0,False
2018-12-31 20:02:52+00:00,Twitter for iPhone,1079830267274108928,Heads of countries are calling wanting to know...,21030.0,,76721.0,False
2018-12-31 15:39:15+00:00,Twitter for iPhone,1079763923845419008,It’s incredible how Democrats can all use thei...,29610.0,,127485.0,False
2018-12-31 15:37:14+00:00,Twitter for iPhone,1079763419908243456,"I’m in the Oval Office. Democrats, come back f...",30957.0,,132439.0,False
...,...,...,...,...,...,...,...
2017-01-01 06:49:33+00:00,Twitter for iPhone,815449868739211264,RT @DonaldJTrumpJr: Happy new year everyone. #...,6847.0,,0.0,True
2017-01-01 05:44:17+00:00,Twitter for iPhone,815433444591304704,RT @EricTrump: 2016 was such an incredible yea...,6941.0,,0.0,True
2017-01-01 05:43:23+00:00,Twitter for iPhone,815433217595547648,RT @Reince: Happy New Year + God's blessings t...,7144.0,,0.0,True
2017-01-01 05:39:13+00:00,Twitter for iPhone,815432169464197120,RT @DanScavino: On behalf of our next #POTUS &...,5548.0,,0.0,True
