## Installing and importing required modules

In [None]:
!pip install snscrape

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting snscrape
  Downloading snscrape-0.6.2.20230320-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.8/71.8 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: snscrape
Successfully installed snscrape-0.6.2.20230320


In [None]:
import datetime
import itertools

import pandas as pd
import snscrape.modules.twitter as sntwitter
from tqdm.notebook import tqdm # to display status bar

# For google colab
from google.colab import files

## Checking tweet data fields

In [None]:
# Search scraper for the term "gpt4"; the following cells pull one result
# from it to inspect which fields a tweet carries.
scraper = sntwitter.TwitterSearchScraper("gpt4")

In [None]:
# Take only the first result. `next(..., None)` always rebinds `tweet`, so an
# empty search shows None instead of a stale value left over from an earlier run.
tweet = next(iter(scraper.get_items()), None)

In [None]:
# Display the sampled Tweet object so its available fields can be read off.
tweet

Tweet(url='https://twitter.com/mikeypiro/status/1647635297906331650', date=datetime.datetime(2023, 4, 16, 16, 17, 21, tzinfo=datetime.timezone.utc), rawContent='Some up-front disclaimers for #AutoGPT, I am using API keys that are free/rate limited. I would imagine that with #GPT4 API access, the agents I have been tinkering with will be better. (If anyone from @OpenAI wants to boost that, DMs are open, #please and #thankyou).', renderedContent='Some up-front disclaimers for #AutoGPT, I am using API keys that are free/rate limited. I would imagine that with #GPT4 API access, the agents I have been tinkering with will be better. (If anyone from @OpenAI wants to boost that, DMs are open, #please and #thankyou).', id=1647635297906331650, user=User(username='mikeypiro', id=117491537, displayname='MikeyPiro', rawDescription='COO @CPGCLUB\n\n⌐◨-◨', renderedDescription='COO @CPGCLUB\n\n⌐◨-◨', descriptionLinks=None, verified=False, created=datetime.datetime(2010, 2, 25, 18, 41, 53, tzinfo=datet

## Example 1: Scraping by keyword and language


In [None]:
# Search term. Other forms that can be used instead:
#   query = "#chatgpt"      # hashtag
#   query = "@elonmusk"     # mention
#   query = "chat gpt"      # keyphrase
query = "donald trump"

# Language code, appended to the query as `lang:<code>`
language = "en"

# Resulting search string: "<query> lang:<language>"
scraper = sntwitter.TwitterSearchScraper(f"{query} lang:{language}")

In [None]:
tweets = []
tweet_count = 100  # number of tweets to collect

# islice ends the iteration after exactly `tweet_count` items, so the number
# of rows collected and the progress-bar total both equal `tweet_count`.
limited_items = itertools.islice(scraper.get_items(), tweet_count)

for tweet in tqdm(limited_items, total=tweet_count):
    # One row per tweet; field order must match the DataFrame columns used later.
    tweets.append([
        tweet.date,
        tweet.id,
        tweet.rawContent,
        tweet.user.username,
        tweet.likeCount,
        tweet.retweetCount,
    ])

  0%|          | 0/101 [00:00<?, ?it/s]

In [None]:
# Turn the collected rows into a DataFrame; column names follow the order in
# which the fields were appended to each row.
tweet_df = pd.DataFrame(
    data=tweets,
    columns=['date', 'id', 'content', 'username', 'like_count', 'retweet_count'],
)

In [None]:
# Save the DataFrame as CSV, leaving out the index column.
tweet_df.to_csv(path_or_buf='my_data.csv', index=False)

In [None]:
# Download the CSV to the local machine (`files` is the Google Colab helper
# imported at the top of the notebook).
files.download('my_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Example 2: Tweets from a specific user and date range

In [None]:
# Account to collect from (the `from:` term of the query)
username = "elonmusk"

# Language code (the `lang:` term)
language = "en"

# Date bounds as YYYY-MM-DD strings (the `since:` and `until:` terms)
since = "2023-04-01"
until = "2023-04-15"

scraper1 = sntwitter.TwitterSearchScraper(
    f"from:{username} lang:{language} since:{since} until:{until}"
)

In [None]:
tweets1 = []

# No item limit here: the loop runs until the scraper has no more results for
# the query. No `total` is given to tqdm, so it shows a running count only.
for tweet in tqdm(scraper1.get_items()):
    # One row per tweet; field order must match the DataFrame columns used later.
    tweets1.append([
        tweet.date,
        tweet.id,
        tweet.rawContent,
        tweet.user.username,
        tweet.likeCount,
        tweet.retweetCount,
    ])

0it [00:00, ?it/s]

In [None]:
# Turn the collected rows into a DataFrame; column names follow the order in
# which the fields were appended to each row.
tweet_df1 = pd.DataFrame(
    data=tweets1,
    columns=['date', 'id', 'content', 'username', 'like_count', 'retweet_count'],
)

In [None]:
# Save the DataFrame as CSV, leaving out the index column.
tweet_df1.to_csv(path_or_buf='my_data1.csv', index=False)

In [None]:
# Download the CSV to the local machine (`files` is the Google Colab helper
# imported at the top of the notebook).
files.download('my_data1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## How I scraped 500k tweets on ChatGPT

Scraping 500k tweets using snscrape, converting them to a CSV file, and downloading it.


In [None]:
# Search expression: any of several spellings of ChatGPT (word, mention, hashtag).
# NOTE(review): `chat gpt` is not quoted inside the OR group — confirm the
# search treats it as the intended two-word phrase.
query = "(chatgpt OR chat gpt OR @chatgpt OR #chatgpt)"

# Language code, appended to the query as `lang:<code>`
language = "en"

# Scraper for "<query> lang:<language>"
scraper = sntwitter.TwitterSearchScraper(f"{query} lang:{language}")

In [None]:
tweets1 = []
tweet_count = 500000  # number of tweets to collect

# islice ends the iteration after exactly `tweet_count` items, so the number
# of rows collected and the progress-bar total both equal `tweet_count`.
limited_items = itertools.islice(scraper.get_items(), tweet_count)

for tweet in tqdm(limited_items, total=tweet_count):
    # One row per tweet; field order must match the DataFrame columns used later.
    tweets1.append([
        tweet.date,
        tweet.id,
        tweet.rawContent,
        tweet.user.username,
        tweet.likeCount,
        tweet.retweetCount,
    ])

  0%|          | 0/500000 [00:00<?, ?it/s]

Whoa! That took over 7 hours!

In [None]:
# Build a DataFrame from the scraped rows, naming the columns in the order
# the fields were appended to each row
tweet_df1 = pd.DataFrame(
    data=tweets1,
    columns=['date', 'id', 'content', 'username', 'like_count', 'retweet_count'],
)

# Write the DataFrame to a .csv file, leaving out the index column
tweet_df1.to_csv(path_or_buf='my_data_500k.csv', index=False)

# Download the CSV file to the local machine (Colab helper)
files.download('my_data_500k.csv')