In [1]:
# Import packages
import pandas as pd
import tweepy

In [2]:
# Set authorization keys
auth = tweepy.OAuthHandler('XXX', 'XXX')
auth.set_access_token("XX-XX", 
    "XXX")

api = tweepy.API(auth, wait_on_rate_limit=True)

In [3]:
imp_cols = ['created_at', 'text', 'truncated', 'geo', 'coordinates','place', 'id',
            'contributors', 'is_quote_status', 'retweet_count', 'favorite_count', 'lang']
nested_cols = {
    'entities': ['hashtags', 'symbols'],
    'metadata': ['iso_language_code'],
    'user': ['location', 'description', 'followers_count', 'friends_count', 'verified', 'lang']
}

In [4]:
democrats_slug = 'house-democrats'
democrats_owner_name = 'thedemocrats'

republicans_slug = 'house-republicans'
republicans_owner_name = 'HouseGOP'

In [5]:
def flatten_twitter_json(tweet):
    flattened_dict = {}
    for key in tweet.keys():
        if key in imp_cols:
            flattened_dict[key] = tweet[key]
        if key in nested_cols:
            for name in nested_cols[key]:
                flattened_dict[key+ '_' +name] = tweet[key][name]
    return flattened_dict

In [6]:
def get_list_timeline(slug, owner_name, n):
    _max_queries = 100

    tweets = tweet_batch = api.list_timeline(slug=slug,
                                             owner_screen_name=owner_name,
                                             count=100,
                                             include_rts=False
                                            )
    ct = 1
    while len(tweets) < n and ct < _max_queries:
        tweet_batch = api.list_timeline(slug=slug, 
                                        owner_screen_name=owner_name,
                                        count=100,
                                        max_id=tweet_batch.max_id,
                                        include_rts=False
                                       )
        tweets.extend(tweet_batch)
        ct += 1
    return tweets

In [7]:
def get_df_from_slug(slug, owner_name, count):
    list_tweets = get_list_timeline(slug, owner_name, 1000)
        
    list_tweets_json = [tweet._json for tweet in list_tweets]
    list_tweets_flattened = [flatten_twitter_json(tweet) for tweet in list_tweets_json]
    
    return pd.DataFrame(list_tweets_flattened).set_index('id')

In [8]:
df_democrats = get_df_from_slug(democrats_slug, democrats_owner_name, 1000)

In [9]:
df_republicans = get_df_from_slug(republicans_slug, republicans_owner_name, 1000)

In [10]:
print(df_democrats.shape, df_republicans.shape)
print(df_democrats.columns)

(1059, 19) (1062, 19)
Index(['created_at', 'text', 'truncated', 'entities_hashtags',
       'entities_symbols', 'user_location', 'user_description',
       'user_followers_count', 'user_friends_count', 'user_verified',
       'user_lang', 'geo', 'coordinates', 'place', 'contributors',
       'is_quote_status', 'retweet_count', 'favorite_count', 'lang'],
      dtype='object')


In [11]:
df_democrats.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1059 entries, 1180545258293063680 to 1180122014373351426
Data columns (total 19 columns):
created_at              1059 non-null object
text                    1059 non-null object
truncated               1059 non-null bool
entities_hashtags       1059 non-null object
entities_symbols        1059 non-null object
user_location           1059 non-null object
user_description        1059 non-null object
user_followers_count    1059 non-null int64
user_friends_count      1059 non-null int64
user_verified           1059 non-null bool
user_lang               0 non-null object
geo                     0 non-null object
coordinates             0 non-null object
place                   43 non-null object
contributors            0 non-null object
is_quote_status         1059 non-null bool
retweet_count           1059 non-null int64
favorite_count          1059 non-null int64
lang                    1059 non-null object
dtypes: bool(3), int64(4), ob

In [12]:
df_republicans.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1062 entries, 1180545384478859264 to 1179813477239668736
Data columns (total 19 columns):
created_at              1062 non-null object
text                    1062 non-null object
truncated               1062 non-null bool
entities_hashtags       1062 non-null object
entities_symbols        1062 non-null object
user_location           1062 non-null object
user_description        1062 non-null object
user_followers_count    1062 non-null int64
user_friends_count      1062 non-null int64
user_verified           1062 non-null bool
user_lang               0 non-null object
geo                     0 non-null object
coordinates             0 non-null object
place                   12 non-null object
contributors            0 non-null object
is_quote_status         1062 non-null bool
retweet_count           1062 non-null int64
favorite_count          1062 non-null int64
lang                    1062 non-null object
dtypes: bool(3), int64(4), ob

In [13]:
df_republicans.head()

Unnamed: 0_level_0,created_at,text,truncated,entities_hashtags,entities_symbols,user_location,user_description,user_followers_count,user_friends_count,user_verified,user_lang,geo,coordinates,place,contributors,is_quote_status,retweet_count,favorite_count,lang
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1180545384478859264,Sat Oct 05 18:08:47 +0000 2019,ICYMI: 61 Members of Congress have cosponsored...,True,[],[],"Arizona, USA",Representing Arizona's Fighting Fifth.,43102,1443,True,,,,,,False,8,19,en
1180542322909548549,Sat Oct 05 17:56:37 +0000 2019,Congratulations to the 2019 Congressional Vete...,True,[],[],#TX31,U.S. Congressman proudly serving the 31st Dist...,15419,846,True,,,,,,False,0,1,en
1180535462810787841,Sat Oct 05 17:29:22 +0000 2019,More moments from N. Houston/Montgomery County...,True,[],[],"The Woodlands, Texas",Dad. Husband. Top Republican @WaysandMeansGOP....,49237,748,True,,,,,,False,0,4,en
1180535448684367872,Sat Oct 05 17:29:18 +0000 2019,Celebrating 75 years at the Busy Bee Restauran...,False,[],[],Eastern/Southeastern Ohio,Proudly representing #Ohio's 6th Congressional...,36911,728,True,,,,,,False,0,3,en
1180534029692612609,Sat Oct 05 17:23:40 +0000 2019,Imagine being accused of a crime and not havin...,True,[],[],"Dickinson, ND and Washington, DC","I'm a lifelong, devoted North Dakotan and a ti...",9180,241,True,,,,,,False,2,12,en


In [14]:
df_republicans.to_csv('republican_tweets_raw.csv')
df_democrats.to_csv('democrat_tweets_raw.csv')

In [15]:
df_republicans_relevant = df_republicans[['text', "truncated"]]
df_democrats_relevant = df_democrats[['text', 'truncated']]

In [16]:
df_republicans_relevant['party'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
df_democrats_relevant['party'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [18]:
df_republicans_relevant.to_csv('republican_tweets.csv')
df_democrats_relevant.to_csv('democrat_tweets.csv')