# Tweet Sentiment Analysis with BERT

Performing sentiment analysis using BERT (fine-tuning)

In [60]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [61]:
# Download the NLTK resources this notebook relies on:
#   - 'stopwords': the stopword lists read through stopwords.words('english')
#   - 'wordnet':   the data behind WordNetLemmatizer
#   - 'omw-1.4':   presumably needed by the WordNet reader in this NLTK version; TODO confirm
# Packages that are already installed are reported as up-to-date (see the log below).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Loong\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Loong\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Loong\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [62]:
# Read the tweet data from a CSV file; the path is relative to the notebook's working directory
data = pd.read_csv('../data/data.csv')

# Preview the first rows of the data
data.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [63]:
# Drop duplicate tweets: rows sharing the same 'message' text are collapsed to the first occurrence.
# NOTE(review): no columns are dropped here, so every loaded column (including 'Unnamed: 0') is kept.
data = data.drop_duplicates(subset='message', keep='first')

In [64]:
# Initialize tools
# NOTE(review): `tokenizer` is not referenced in any cell visible in this notebook, and the
# clean_text functions below never read the module-level `stop_words` / `lemmatizer`
# (inside those functions the names are local).
tokenizer = TweetTokenizer()  # NLTK tweet tokenizer with default options
stop_words = set(stopwords.words('english'))  # set, so membership tests are cheap
lemmatizer = WordNetLemmatizer()

# Selective cleaning
### Keep hashtag words; replace retweet markers, mentions, and URLs with placeholder tags

In [65]:
_NLTK_DEFAULTS = {}  # filled on first use so the stopword corpus is read once, not once per tweet


def _nltk_defaults():
    """Return the cached (English stopword set, WordNetLemmatizer) pair, building it on first call."""
    if not _NLTK_DEFAULTS:
        # Imported here so callers that pass their own tools never touch NLTK.
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer
        _NLTK_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_DEFAULTS['stop_words'], _NLTK_DEFAULTS['lemmatizer']


def clean_text(text, stop_words=None, lemmatizer=None):
    """Clean one tweet, keeping hashtag words and tagging retweets, mentions and URLs.

    Steps: lowercase; replace URLs with ``<url>``, @mentions with ``<mention>`` and the
    standalone word ``rt`` with ``<rt>``; drop the ``#`` symbol (the hashtag word stays);
    remove every character that is not an ASCII letter, ``<``, ``>`` or whitespace;
    drop stop words; lemmatize the remaining tokens.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN from an empty CSV cell)
            yields an empty string instead of raising.
        stop_words: Optional container of lowercase words to drop (anything supporting
            ``in``). Defaults to NLTK's English stopword list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. Defaults to
            NLTK's ``WordNetLemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _nltk_defaults()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    text = text.lower()

    # One left-to-right pass, so a URL is consumed whole before the mention / rt rules can
    # fire inside it. Tags are padded with spaces so they never fuse with adjacent text
    # ("@user:hello" -> "<mention> hello"). `\bwww` stops "awwww" from being read as a URL.
    text = re.sub(
        r'(?P<url>http\S+|\bwww\S+)|(?P<mention>@\w+)|(?P<rt>\brt\b)',
        lambda match: f' <{match.lastgroup}> ',
        text,
    )
    text = text.replace('#', '')  # Remove only the '#' symbol, keep the hashtag word
    text = re.sub(r'[^a-zA-Z<>\s]', '', text)  # Keep letters, whitespace and < > for tags

    # split() also collapses whitespace runs and trims both ends.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [66]:
# Build the selectively cleaned frame: a copy of `data` with a 'clean_message' column appended
data_selective_clean = data.assign(clean_message=data['message'].apply(clean_text))

# Show raw and cleaned text side by side for the first rows
print(data_selective_clean[['message', 'clean_message']].head())

                                             message  \
0  @tiniebeany climate change is an interesting h...   
1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2  Fabulous! Leonardo #DiCaprio's film on #climat...   
3  RT @Mick_Fanning: Just watched this amazing do...   
4  RT @cnalive: Pranita Biswasi, a Lutheran from ...   

                                       clean_message  
0  <mention> climate change interesting hustle gl...  
1  <rt> <mention> watch beforetheflood right <men...  
2  fabulous leonardo dicaprios film climate chang...  
3  <rt> <mention> watched amazing documentary leo...  
4  <rt> <mention> pranita biswasi lutheran odisha...  


In [None]:
# Save the selectively cleaned data (all columns, including 'clean_message') as CSV.
# index=False keeps the DataFrame index out of the file.
data_selective_clean.to_csv('../data/data_selective_clean.csv', index=False)

### Full cleaning: remove retweet markers, mentions, hashtags, URLs, and special characters

In [68]:
_NLTK_DEFAULTS = {}  # filled on first use so the stopword corpus is read once, not once per tweet


def _nltk_defaults():
    """Return the cached (English stopword set, WordNetLemmatizer) pair, building it on first call."""
    if not _NLTK_DEFAULTS:
        # Imported here so callers that pass their own tools never touch NLTK.
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer
        _NLTK_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        _NLTK_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _NLTK_DEFAULTS['stop_words'], _NLTK_DEFAULTS['lemmatizer']


def clean_text(text, stop_words=None, lemmatizer=None):
    """Clean one tweet, removing retweet markers, mentions, hashtags and URLs entirely.

    NOTE: this redefines the `clean_text` from the selective-cleaning cell above; cells
    run after this one get this version.

    Steps: lowercase; delete the standalone word ``rt``, @mentions, hashtags (symbol and
    word) and URLs; remove every character that is not an ASCII letter or whitespace;
    drop stop words; lemmatize the remaining tokens.

    Args:
        text: Raw tweet text. Any non-string value (e.g. NaN from an empty CSV cell)
            yields an empty string instead of raising.
        stop_words: Optional container of lowercase words to drop (anything supporting
            ``in``). Defaults to NLTK's English stopword list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. Defaults to
            NLTK's ``WordNetLemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _nltk_defaults()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\brt\b', '', text)  # Remove 'RT' retweet tag
    text = re.sub(r'@\w+', '', text)    # Remove @mentions entirely
    text = re.sub(r'#\w+', '', text)    # Remove hashtags and hashtag words entirely
    # `\bwww` keeps words such as "awwww" from being treated as URLs.
    text = re.sub(r'http\S+|\bwww\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers

    # split() also collapses whitespace runs and trims both ends.
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return ' '.join(tokens)

In [69]:
# Apply full cleaning and store the result in a new DataFrame
data_clean_all = data.copy()
# Read from this frame's own 'message' column rather than data_selective_clean, so the
# cell does not depend on the selective-cleaning cell having been run (or left unmodified).
data_clean_all['clean_message'] = data_clean_all['message'].apply(clean_text)

# Display the cleaned data
print(data_clean_all[['message', 'clean_message']].head())

                                             message  \
0  @tiniebeany climate change is an interesting h...   
1  RT @NatGeoChannel: Watch #BeforeTheFlood right...   
2  Fabulous! Leonardo #DiCaprio's film on #climat...   
3  RT @Mick_Fanning: Just watched this amazing do...   
4  RT @cnalive: Pranita Biswasi, a Lutheran from ...   

                                       clean_message  
0  climate change interesting hustle global warmi...  
1  watch right travel world tackle climate change...  
2  fabulous leonardo film change brilliant watch via  
3  watched amazing documentary leonardodicaprio c...  
4  pranita biswasi lutheran odisha give testimony...  


In [None]:
# Save the fully cleaned data (all columns, including 'clean_message') as CSV.
# index=False keeps the DataFrame index out of the file.
data_clean_all.to_csv('../data/data_clean_all.csv', index=False)

# Please save your data in the `data` folder