In [1]:
# Import pandas for DataFrame handling, snscrape to scrape Twitter, and re for regex-based tweet cleaning

import pandas as pd
import snscrape.modules.twitter as sntwitter
import re

In [2]:
# Search query: restrict results to English-language tweets
query = "lang:en"

# Maximum number of tweets to collect. 3500 is roughly the size of the
# train + test set, i.e. about 5x the reddit test set; the extra volume is
# deliberate because earlier trial runs returned many neutral tweets, which
# are of little use.
limit = 3500

# Accumulator that will hold one row per scraped tweet
tweets = list()

In [3]:
# Iterate over the results yielded by TwitterSearchScraper(query).get_items()
# and collect rows into `tweets` until `limit` rows are stored
for tweet in sntwitter.TwitterSearchScraper(query).get_items():

  # Stop once the cap is reached. `>=` is used instead of `==` so the loop
  # still terminates when `tweets` already holds more than `limit` rows
  # (e.g. the cell is re-run after `limit` was lowered) rather than
  # scraping without bound.
  if len(tweets) >= limit:
    break

  # Store the date, text content, username, id and url of the tweet.
  # NOTE: it is not yet clear whether id and url are needed downstream.
  tweets.append([tweet.date, tweet.content, tweet.user.username, tweet.id, tweet.url])

In [4]:
# Turn the collected rows into a DataFrame, one named column per scraped field
tweets_df = pd.DataFrame(
  data=tweets,
  columns=[
    'Date',
    'Tweet',
    'User',
    'Tweet ID',
    'Tweet Url',
  ],
)

In [5]:
# Remove @mentions, '#' symbols, the RT retweet marker, links, and newline characters
def cleanTweets(tweet):
  """Return the tweet text with Twitter-specific noise removed.

  Applied in order:
    1. @mentions (``@`` followed by letters, digits or underscores) are deleted.
    2. ``#`` characters are deleted (the hashtag word itself is kept).
    3. The standalone retweet marker ``RT`` plus the whitespace after it is deleted.
    4. http/https links are deleted.
    5. Newline characters are replaced by a single space.

  Parameters:
    tweet: the raw tweet text (str).

  Returns:
    The cleaned text (str). Surrounding whitespace left behind by the
    removals is not trimmed.
  """
  # Raw strings throughout: sequences such as \s and \S are invalid escapes in
  # ordinary string literals and trigger warnings on current Python versions.
  tweet = re.sub(r'@[A-Za-z0-9_]+', '', tweet)
  tweet = re.sub(r'#', '', tweet)
  # \b anchors RT at a word boundary so words that merely end in "RT"
  # (e.g. "SMART people") are not mangled.
  tweet = re.sub(r'\bRT\s+', '', tweet)
  tweet = re.sub(r'https?://\S+', '', tweet)
  tweet = re.sub(r'\n', ' ', tweet)
  return tweet

In [7]:
# Run cleanTweets over each entry of the 'Tweet' column and keep the result in a new column
tweets_df['Cleaned Tweets'] = tweets_df['Tweet'].map(cleanTweets)

In [8]:
# Preview the first five rows to sanity-check the cleaned column
tweets_df.head(n=5)

Unnamed: 0,Date,Tweet,User,Tweet ID,Tweet Url,Cleaned Tweets
0,2022-11-10 09:28:34+00:00,Hardik Pandya best T20I knock .. rampage sir 🙏🥵,TheSandeepTweet,1590637529577193473,https://twitter.com/TheSandeepTweet/status/159...,Hardik Pandya best T20I knock .. rampage sir 🙏🥵
1,2022-11-10 09:28:34+00:00,"@fatima__kk Ok,my number watts app is 67627980",lasseni_tangara,1590637529556217856,https://twitter.com/lasseni_tangara/status/159...,"Ok,my number watts app is 67627980"
2,2022-11-10 09:28:34+00:00,@Pihusha71 Yes I Love You,akbor_mollah,1590637529489117184,https://twitter.com/akbor_mollah/status/159063...,Yes I Love You
3,2022-11-10 09:28:34+00:00,mama’s home.,0FCHARMS,1590637529434271744,https://twitter.com/0FCHARMS/status/1590637529...,mama’s home.
4,2022-11-10 09:28:34+00:00,let go of what isIf I should meet thee #鉴穴 #鉴逼...,OmarySa63134440,1590637529375858688,https://twitter.com/OmarySa63134440/status/159...,let go of what isIf I should meet thee 鉴穴 鉴逼 鉴...


In [10]:
# Write the DataFrame to a CSV file, leaving out the row index
tweets_df.to_csv(path_or_buf='Tweets 11-10-2022.csv', index=False)