# In [None]:
# preprocessing.ipynb

# This notebook demonstrates how to preprocess tweets collected from the Twitter API.
# It includes cleaning, tokenization, and stopword removal using NLTK and Hugging Face Transformers.

import re
import nltk
from transformers import BertTokenizer

# Download the NLTK 'stopwords' corpus so the English word list read below
# is available locally.
nltk.download('stopwords')

# Initialize the tokenizer and stopwords.
# `tokenizer` is loaded from the pretrained 'bert-base-uncased' checkpoint.
# `stopwords` is held as a set so the per-word membership test used when
# filtering the cleaned tweet is a hash lookup rather than a list scan.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Sample tweet used to demonstrate the cleaning and tokenization steps.
sample_tweet = "There’s been a massive earthquake in California! #earthquake #disaster"

# Cleaning function
def clean_tweet(tweet: str) -> str:
    """Normalize a raw tweet into lowercase, single-space-separated words.

    The steps run in this order:

    1. Remove URLs, i.e. any run of non-whitespace starting with ``http``
       (matched case-insensitively so ``HTTP://...`` is removed as well).
    2. Remove ``@mention`` handles.
    3. Drop the ``#`` sign but keep the hashtag's word.
    4. Replace every remaining non-word character with a space
       (letters, digits and underscores are kept).
    5. Lowercase the text.
    6. Collapse runs of whitespace and strip both ends.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned text; an empty string if nothing but URLs, mentions,
        punctuation or whitespace was present.
    """
    # URLs and mentions go first: once punctuation is replaced below, they
    # would be broken into ordinary-looking words and could not be detected.
    tweet = re.sub(r'http\S+', '', tweet, flags=re.IGNORECASE)
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'#', '', tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = tweet.lower()
    # The substitutions above leave gaps where text was removed; normalize
    # them so the result has no leading, trailing or repeated spaces.
    return ' '.join(tweet.split())

# Clean the sample tweet, split it on whitespace, and keep only the words
# that are not in the English stopword set.
cleaned_tweet = ' '.join(
    token
    for token in clean_tweet(sample_tweet).split()
    if token not in stopwords
)

# Encode the filtered text with the BERT tokenizer, padding/truncating to
# 128 positions and requesting PyTorch tensors ("pt").
tokenized_tweet = tokenizer(
    cleaned_tweet,
    padding='max_length',
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
# Bare expression: shows the encoding as the notebook cell's output.
tokenized_tweet
