In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats as stat

In [2]:
# Load the raw tweets, converting the day_date column to datetimes while reading
tweet = pd.read_csv(
    '/content/sample_data/assign_sentiment.csv',
    parse_dates=['day_date'],
)

In [6]:
# Summarize the loaded frame: column names, dtypes and memory footprint
tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3717964 entries, 0 to 3717963
Data columns (total 4 columns):
 #   Column      Dtype         
---  ------      -----         
 0   Unnamed: 0  int64         
 1   tweet_id    int64         
 2   body        object        
 3   day_date    datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 113.5+ MB


In [3]:
# Drop the leftover CSV index column by name rather than by position, so that
# re-running this cell is a no-op instead of stripping tweet_id as well.
tweet = tweet.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Re-check the frame's columns and dtypes after the column drop
tweet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3717964 entries, 0 to 3717963
Data columns (total 3 columns):
 #   Column    Dtype         
---  ------    -----         
 0   tweet_id  int64         
 1   body      object        
 2   day_date  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 85.1+ MB


In [None]:
# Earliest and latest values found in the day_date column
day_dates = tweet['day_date']
min_date, max_date = day_dates.min(), day_dates.max()

print(f"Date range in the dataset is from {min_date} to {max_date}")

Date range in the dataset is from 2015-01-01 00:00:00 to 2019-12-31 00:00:00


In [5]:
# Take a separate copy of the tweets for the BERT workflow, leaving `tweet` for VADER
tweet_bert = tweet.copy()

In [6]:
# Step 1: Split the DataFrame into 3 subsets
# Cut points sit at one third and two thirds of the row count (floor division),
# so any remainder rows end up in the last subset.
_cuts = [0, len(tweet_bert) // 3, 2 * len(tweet_bert) // 3, len(tweet_bert)]
subset1, subset2, subset3 = (
    tweet_bert.iloc[lo:hi].reset_index(drop=True)
    for lo, hi in zip(_cuts, _cuts[1:])
)


## Data Preparation

### For VADER: A Rule-Based Sentiment Analysis Tool

We'll clean up the text by removing URLs, mentions, hashtags, and unnecessary punctuation, while retaining the emoticons and capitalization that VADER uses for sentiment scoring.

In [None]:
import pandas as pd
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# Initialize VADER sentiment analyzer
# (created once here; the scoring step further down in this cell reuses it for every tweet)
analyzer = SentimentIntensityAnalyzer()

def preprocess_tweet(tweet):
    """Clean one raw tweet for VADER scoring.

    URLs, @mentions and #hashtags are removed. Emoticons, punctuation and
    capitalization are kept, because this notebook relies on VADER using
    them as sentiment cues; stripping every non-word character (as an
    earlier version did) turned ":)" and "!!!" into nothing.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. None or the float NaN
        that pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    str
        The cleaned text with whitespace runs collapsed to single spaces
        and no leading/trailing whitespace; '' for non-string input.
    """
    if not isinstance(tweet, str):
        # Missing bodies would otherwise make re.sub raise a TypeError
        return ''
    # Remove URLs
    tweet = re.sub(r"http\S+|www\S+", '', tweet)
    # Remove mentions and hashtags
    tweet = re.sub(r"@\S+|#\S+", '', tweet)
    # Collapse the gaps left behind by the removals above
    tweet = re.sub(r"\s+", ' ', tweet)
    return tweet.strip()

# Clean every tweet body and keep the result next to the raw text
tweet['cleaned_body'] = tweet['body'].map(preprocess_tweet)

# Score each cleaned tweet with VADER, keeping only the 'compound' value
tweet['sentiment_score'] = tweet['cleaned_body'].map(
    lambda text: analyzer.polarity_scores(text)['compound']
)

In [None]:
# Write the tweets, including the cleaned_body and sentiment_score columns, to CSV without the index
tweet.to_csv('vader_sentiment_score.csv', index=False)

### BERT: A Pre-trained Transformer Model

In [11]:
pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[

In [7]:
import pandas as pd
import re
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import pipeline
import torch

# Imports used further down to clear possibly corrupted cached model files and reload the model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import shutil
import os

from datasets import Dataset


In [8]:
# Inspect the second subset: row count, columns, non-null counts and dtypes
subset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239321 entries, 0 to 1239320
Data columns (total 3 columns):
 #   Column    Non-Null Count    Dtype         
---  ------    --------------    -----         
 0   tweet_id  1239321 non-null  int64         
 1   body      1239321 non-null  object        
 2   day_date  1239321 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 28.4+ MB


In [None]:
# Checkpoint whose cached files are suspected to be corrupted
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Find the cache directory
# NOTE(review): this is the default location; the HF_HOME / HF_HUB_CACHE
# environment variables can relocate it - confirm for your environment.
cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub")

# Print cache directory path to see where the model is stored
print(f"Model cache directory: {cache_dir}")

# Remove only this checkpoint's folder instead of the whole hub cache, so every
# other downloaded model and dataset is left alone. The hub cache keeps each
# repository under "models--<repo id with '/' replaced by '--'>".
model_cache_dir = os.path.join(cache_dir, "models--" + model_name.replace("/", "--"))

# Best-effort: a missing folder or a permission problem is reported, not raised,
# because force_download below fetches fresh files either way.
try:
    shutil.rmtree(model_cache_dir)
    print("Model cache cleared.")
except OSError as e:
    print(f"Failed to clear cache: {e}")

# Re-download the model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained(model_name, force_download=True)
model = DistilBertForSequenceClassification.from_pretrained(model_name, force_download=True)

Model cache directory: /Users/susanoo/.cache/huggingface/hub
Model cache cleared.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

When creating the pipeline, `device=0` selects the first GPU and `device=-1` selects the CPU.

We use a pre-trained DistilBERT model (`distilbert-base-uncased-finetuned-sst-2-english`), a distilled variant of BERT that is fine-tuned on the SST-2 dataset for sentiment analysis.

In [9]:
# Fetch the SST-2 fine-tuned DistilBERT tokenizer and classifier from the Hub,
# bypassing any locally cached copy (force_download=True).
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = DistilBertTokenizer.from_pretrained(
    model_name,
    force_download=True,
)
model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    force_download=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [16]:
# Choose the pipeline device index: 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Build the sentiment-analysis pipeline around the already loaded model and tokenizer
sentiment_analysis = pipeline(
    "sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

def preprocess_tweet(tweet):
    """Normalise one tweet before it is passed to the uncased DistilBERT model.

    Applied in order: strip URLs, strip @mentions and #hashtags, strip every
    character that is neither a word character nor whitespace, then trim the
    ends and lowercase. Interior whitespace is left as-is.
    """
    removal_patterns = (
        r"http\S+|www\S+",  # URLs
        r"@\S+|#\S+",       # mentions and hashtags
        r"[^\w\s]",         # remaining punctuation / symbols
    )
    for pattern in removal_patterns:
        tweet = re.sub(pattern, '', tweet)
    return tweet.strip().lower()

# Clean every tweet body in subset3 and store the result alongside the raw text
subset3['cleaned_body'] = subset3['body'].map(preprocess_tweet)

# Wrap only the cleaned text column in a Hugging Face Dataset for batched inference
tweets_dataset = Dataset.from_pandas(subset3.loc[:, ['cleaned_body']])

# Function to apply sentiment analysis
def compute_sentiment(batch):
    """Label one batch of cleaned tweets with the sentiment pipeline.

    Parameters
    ----------
    batch : mapping
        Must provide a 'cleaned_body' entry holding the texts to classify.

    Returns
    -------
    mapping
        The same ``batch`` object with two entries added, one value per text:
        'sentiment' (the pipeline's 'label') and 'sentiment_score' (the
        pipeline's 'score').
    """
    # truncation=True caps each input at the tokenizer's maximum length, so a
    # single over-long text cannot raise and abort the whole mapped run.
    results = sentiment_analysis(batch['cleaned_body'], truncation=True)
    # Extract sentiment labels and scores
    batch['sentiment'] = [result['label'] for result in results]
    batch['sentiment_score'] = [result['score'] for result in results]
    return batch

# Run compute_sentiment over the dataset, 128 rows per call
tweets_dataset = tweets_dataset.map(
    compute_sentiment,
    batched=True,
    batch_size=128,
)

# Bring the scored dataset back into pandas for the final output
tweets_df = tweets_dataset.to_pandas()


Map:   0%|          | 0/1239322 [00:00<?, ? examples/s]

In [11]:
# Inspect subset2's current columns, non-null counts and dtypes
subset2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239321 entries, 0 to 1239320
Data columns (total 4 columns):
 #   Column        Non-Null Count    Dtype         
---  ------        --------------    -----         
 0   tweet_id      1239321 non-null  int64         
 1   body          1239321 non-null  object        
 2   day_date      1239321 non-null  datetime64[ns]
 3   cleaned_body  1239321 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 37.8+ MB


In [12]:
# Inspect the scored frame: cleaned text plus the sentiment label and score columns
tweets_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239321 entries, 0 to 1239320
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   cleaned_body     1239321 non-null  object 
 1   sentiment        1239321 non-null  object 
 2   sentiment_score  1239321 non-null  float64
dtypes: float64(1), object(2)
memory usage: 28.4+ MB


In [18]:
# subset3 and tweets_df must describe the same tweets row for row. Column
# assignment aligns on the index, so a mismatch would silently produce NaNs;
# fail loudly instead.
assert subset3.index.equals(tweets_df.index), "subset3 and tweets_df are not row-aligned"
subset3['sentiment'] = tweets_df['sentiment']
subset3['sentiment_score'] = tweets_df['sentiment_score']

In [19]:
# Preview the first rows of subset3 with the sentiment label and score attached
subset3.head()

Unnamed: 0,tweet_id,body,day_date,cleaned_body,sentiment,sentiment_score
0,1007091733480574981,And it's also why I love being short $TSLA. No...,2018-06-14,and its also why i love being short tsla no ot...,NEGATIVE,0.863435
1,1007091780347748352,Fair enough but the $TSLA game will go on only...,2018-06-14,fair enough but the tsla game will go on only ...,NEGATIVE,0.998713
2,1007091797573718016,Seems like there is some understanding of the ...,2018-06-14,seems like there is some understanding of the ...,NEGATIVE,0.992498
3,1007091901286289412,$TSLA see this link to complete the puzzle,2018-06-14,tsla see this link to complete the puzzle,POSITIVE,0.998109
4,1007092690779947008,#DayTrading #Trading #Stocks #StockMarket $SPY...,2018-06-14,spy qqq aapl fb twtr nflx tsla,NEGATIVE,0.988116


In [20]:
# Save subset3, including its cleaned text, sentiment label and score columns,
# to a new CSV file; index=False leaves the row index out of the file.
subset3.to_csv('/content/sample_data/bert_sentiment_subset3.csv', index=False)

## Choose a Sentiment Analysis Model

We will use VADER and DistilBERT.

## Batch Processing