In [1]:
import tweepy
print(tweepy.__version__)


4.16.0


In [None]:
import os
from dotenv import load_dotenv
import tweepy
import time
import pandas as pd


load_dotenv()

API_KEY = os.getenv("API_KEY")
API_SECRET = os.getenv("API_SECRET")
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN")
ACCESS_TOKEN_SECRET = os.getenv("ACCESS_TOKEN_SECRET")
BEARER_TOKEN = os.getenv("BEARER_TOKEN")  # You'll need to add this to your .env file

# Authenticate with API v2 using Bearer Token
client = tweepy.Client(
    bearer_token=BEARER_TOKEN,
    consumer_key=API_KEY,
    consumer_secret=API_SECRET,
    access_token=ACCESS_TOKEN,
    access_token_secret=ACCESS_TOKEN_SECRET,
    wait_on_rate_limit=True
)

# Test authentication
try:
    me = client.get_me()
    print(f"Authentication successful! Logged in as: {me.data.username}")
except Exception as e:
    print(f"Authentication failed: {e}")
    exit()

# Search query
searchquery = "angry -is:retweet lang:en"  # Excludes retweets and filters for English

# Configuration
total_number = 1500 # total number of tweets to collect
max_results = 100    # tweets per request (10-100 for API v2)

text = []
tweet_ids = []
count = 0

print(f"Starting to collect up to {total_number} tweets with query: '{searchquery}'")
print("Note: Free tier has limitations on tweet volume and search recency\n")

try:
    # Use Paginator for API v2
    for response in tweepy.Paginator(
        client.search_recent_tweets,
        query=searchquery,
        max_results=max_results,
        tweet_fields=['lang', 'created_at'],
        limit=total_number // max_results + 1
    ):
        if response.data:
            for tweet in response.data:
                if len(text) >= total_number:
                    break
                
                text.append(tweet.text)
                tweet_ids.append(tweet.id)
                count += 1
                
                if count % 100 == 0:
                    print(f"Collected {count} tweets...")
        
        if len(text) >= total_number:
            break
    
except tweepy.TweepyException as e:
    print(f"Error occurred: {e}")
except KeyboardInterrupt:
    print("\nCollection interrupted by user.")

print(f"\nCollection complete! Total tweets collected: {len(text)}")

# Create dataframe
if text:
    d = {
        "text": text, 
        "id": [1] * len(text),  # 1 is angry
        "tweet_id": tweet_ids
    }
    df = pd.DataFrame(data=d)
    
    df.to_csv('upset.csv', header=True, index=False, encoding='utf-8')
    print(f"Saved {len(text)} tweets to upset.csv")
else:
    print("No tweets collected. Please check your API access level.")

Step 3 (Cleaning the data and getting the words that appear):

If you have followed what I have done till now and checked your csv files you will notice that some of the tweets have weird symbols. Our first goal is to get rid of them. Afterwards, we want to produce a csv file with the words that appear in our sentences.

To clean the data, the first thing we do is to import any libraries we need and import the csv we are interested in as well getting rid of any “nan” values (line 10 does this).



In [7]:
import pandas as pd
import string

# import my csv file
df = pd.read_csv('C:/Users/datas/OneDrive/Desktop/twitter_project/upset.csv')

# remove any rows with a "nan" in them
df = df.dropna(axis=0, how = 'any')



We then create a function that, given a text, removes any character or string of characters that are not readable in ASCII values. We then make all the texts lower case. I use df[‘text’] because that is the name of the column I stored the text values in the csv file.



In [8]:
def removetext(text):
    return "".join([i if ord(i) < 128 else '' for i in text ])

# here I am doing the actual removing
df['text'] = df['text'].apply(removetext)

# make all my texts lower case
df ['text'] = df['text'].apply(lambda x: x.lower())


Also remove any unwanted punctuation.  
With that our dataframe is clean.

In [9]:
df['text'] = df['text'].apply(lambda x: x.replace(',',''))
df['text'] = df['text'].apply(lambda x: x.replace('\n',''))
df['text'] = df['text'].apply(lambda x: x.replace('?',''))
df['text'] = df['text'].apply(lambda x: x.replace('!',''))
df['text'] = df['text'].apply(lambda x: x.replace('"',''))
df['text'] = df['text'].apply(lambda x: x.replace(';',''))
df['text'] = df['text'].apply(lambda x: x.replace('#',''))
df['text'] = df['text'].apply(lambda x: x.replace('&amp',''))

Our next goal is to get the unique words from it that appear.

In [11]:
# here I get each unique keyword from dataframe
array = df['text'].str.split('', expand=True).stack().value_counts()
print(array) #to see what this looks like

#make a dataframe of the words and frequency with the words appear
d = {'word': array.index, 'frequency':array}
df2 = pd.DataFrame(data=d)

#get rid of anywords thatare mentioned less than 10 times
df2['frequency'] = df2['frequency'][df2['frequency']>10]




     4833
e    2722
t    2087
a    2012
n    1755
o    1743
i    1585
s    1460
r    1452
h    1140
l     971
y     815
d     807
u     796
g     735
c     648
m     638
p     532
w     470
b     437
f     425
      398
.     365
k     312
@     297
v     252
/     129
j     112
'      94
_      65
x      52
1      52
z      49
:      48
9      41
2      40
0      36
q      35
3      34
4      32
6      32
8      26
7      24
5      17
-      15
*       5
|       4
(       3
)       3
&       2
=       1
%       1
[       1
]       1
Name: count, dtype: int64


Line of code "array = df['text'].str.split('', expand=True).stack().value_counts()" does two things. It first splits every string in each row of the dataframe into individual words. It does this by splitting after every space. Afterwards, “.stack().value_counts()” finds a new/unique word and counts how many times it appears. It stores these values into the variable called “array.”

In [15]:
print(df2.columns)


Index(['word', 'frequency'], dtype='object')


In [16]:
import re

import re

# words you want to remove
words_to_remove = [':(', 'https://t', ':((', ':(((', ':((((', ':(((((', ':', '(']

# build safe regex pattern
pattern = '|'.join(map(re.escape, words_to_remove))

# remove any rows whose 'word' value contains those unwanted strings
df2 = df2[~df2['word'].str.contains(pattern, case=False, na=False)]

# save cleaned data
df2.to_csv('unsmile_words.csv', header=True, index=False, encoding='utf-8')




Step 4 (Get the bag of words):  
Now that I have the list of words that appear in each individual file (happy, fun, sad, and : ( ), I want to combine them all into one dataframe and save this into a csv file called wordbag.csv.



In [17]:
happy = pd.read_csv('happy_words.csv')
sad = pd.read_csv('sad_wrods.csv')
unsmile = pd.read_csv('unsmile_words.csv')
fun = pd.read_csv('fun_words,csv')

wordbag = pd.concat([happy, sad, unsmile, fun]).drop_duplicates().reset_index(drop=True)

print(wordbag)

wordbag.to_csv('wordbag.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'happy_words.csv'