# Top Tweets
In this notebook, we fetch up to 100 highly liked tweets from each of the 1,000 most-followed Twitter accounts.

Imports

In [2]:
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

import twint
import nest_asyncio
nest_asyncio.apply()

Load the top accounts ([thanks to this gist](https://gist.github.com/mbejda/9c3353780270e7298763)) and drop any users that have fewer than 2,500 tweets or no recorded tweet count

In [6]:
# Load the account list, discarding rows that have no tweet count
df = pd.read_csv('../data/top_accounts.csv').dropna(subset=['tweet_count'])

# Keep only the usernames of accounts with at least 2,500 tweets
has_enough_tweets = df['tweet_count'] >= 2500
top_accounts = df.loc[has_enough_tweets, 'twitter_username'].tolist()

# Preview the first ten usernames
top_accounts[:10]

766


['justinbieber',
 'katyperry',
 'rihanna',
 'Cristiano',
 'ladygaga',
 'elonmusk',
 'TheEllenShow',
 'KimKardashian',
 'selenagomez',
 'jtimberlake']

Iterate through the top accounts, fetch each account's tweets with at least 50,000 likes and 1,000 replies, and save up to 100 of them per account

In [52]:
# Scrape settings, named so they are easy to find and tune
MIN_LIKES = 50000             # a tweet needs at least this many likes
MIN_REPLIES = 1000            # ...and at least this many replies
MAX_TWEETS_PER_ACCOUNT = 100  # cap on the rows saved for each account
TWEETS_DIR = '../data/tweets'

# Create the output folder up front so the first save cannot fail on a
# fresh checkout where the folder does not exist yet
os.makedirs(TWEETS_DIR, exist_ok=True)

for username in tqdm(top_accounts):
    # Run the query
    c = twint.Config()
    c.Username = username
    c.Pandas = True
    c.Hide_output = True
    c.Min_likes = MIN_LIKES
    c.Min_replies = MIN_REPLIES
    twint.run.Search(c)

    # Store results
    # NOTE(review): twint hands results back through this module-level frame;
    # presumably it is reset at the start of every search -- confirm, otherwise
    # an account with no hits would be saved with the previous account's tweets.
    tweets = twint.storage.panda.Tweets_df
    if tweets is not None and len(tweets) > 0:
        # head() keeps the first rows in the order twint returned them.
        # NOTE(review): rows are not sorted by likes here -- confirm that this
        # matches the intended meaning of "top" tweets.
        top_tweets = tweets.head(MAX_TWEETS_PER_ACCOUNT)[['id', 'tweet']]
        top_tweets.to_csv(f'{TWEETS_DIR}/{username}.csv', index=False)

100%|██████████| 766/766 [22:09<00:00,  1.74s/it]

[!] No more data! Scraping will stop now.
found 0 deleted tweets in this search.





Concatenate results into a single CSV

In [23]:
import os
import glob

# Collect the per-account CSVs. glob returns paths in arbitrary order, so sort
# them to make the row order of the combined file reproducible.
tweet_files = sorted(glob.glob('../data/tweets/*.csv'))
if not tweet_files:
    # pd.concat would raise a less helpful ValueError on an empty list
    raise ValueError('No tweet CSVs found in ../data/tweets/ - run the scraping cell first')

# Each file is named <username>.csv; splitext strips only the extension
usernames = [os.path.splitext(os.path.basename(f))[0] for f in tweet_files]

# Read every file exactly once and add a column for usernames, placed first
frames = []
for username, path in zip(usernames, tweet_files):
    user_tweets = pd.read_csv(path)
    user_tweets.insert(0, 'username', username)
    frames.append(user_tweets)
tweet_counts = [len(frame) for frame in frames]

# Concatenate all the tweets; ignore_index gives the combined frame a unique
# 0..n-1 index instead of repeating each file's own row numbers
tweets = pd.concat(frames, ignore_index=True)
assert len(tweets) == sum(tweet_counts), 'rows were lost or duplicated while concatenating'

# Save as CSV
tweets.to_csv('../data/parent_tweets.csv', index=False)
len(tweets)

4237