# Sentiment Analysis in Python

Comparison the true sentiment with several methods. From :

1. NLTK Python
2. Hugging Face using ROBERTA

# Install Framework Library

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [4]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

# Hugging Face API

# Importing Library and Dataset



In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
plt.style.use('ggplot')

In [6]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/forex-sentiment-dataset/sentiment_annotated_with_texts (1).csv


In [8]:
# Read in data from CSV

df = pd.read_csv('/kaggle/input/forex-sentiment-dataset/sentiment_annotated_with_texts (1).csv')
pd.set_option('display.max_rows', 300) # set up max rows

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   published_at        2291 non-null   object 
 1   ticker              2291 non-null   object 
 2   true_sentiment      2291 non-null   object 
 3   title               2291 non-null   object 
 4   author              2291 non-null   object 
 5   url                 2291 non-null   object 
 6   source              2291 non-null   object 
 7   text                2291 non-null   object 
 8   finbert_sentiment   2291 non-null   object 
 9   finbert_sent_score  2291 non-null   float64
dtypes: float64(1), object(9)
memory usage: 179.1+ KB


In [10]:
df.drop(['finbert_sentiment', 'finbert_sent_score' ], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2291 entries, 0 to 2290
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   published_at    2291 non-null   object
 1   ticker          2291 non-null   object
 2   true_sentiment  2291 non-null   object
 3   title           2291 non-null   object
 4   author          2291 non-null   object
 5   url             2291 non-null   object
 6   source          2291 non-null   object
 7   text            2291 non-null   object
dtypes: object(8)
memory usage: 143.3+ KB


In [11]:
data_selected = df[['text','true_sentiment']]
data_selected.columns = ['text','label']

data_selected = data_selected.head(200)
data_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    200 non-null    object
 1   label   200 non-null    object
dtypes: object(2)
memory usage: 3.2+ KB


In [12]:
# adding id, so we can find out how many data out there count and set it into 200 rows

data_selected['id'] = range(200)
cols = ['id'] + [col for col in data_selected.columns if col != 'id']
data_selected = data_selected[cols]

In [13]:
data_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      200 non-null    int64 
 1   text    200 non-null    object
 2   label   200 non-null    object
dtypes: int64(1), object(2)
memory usage: 4.8+ KB


## Quick EDA

In [21]:
counts = df['true_sentiment'].value_counts().sort_index()

# Create a bar chart
fig = px.bar(x=counts.index, y=counts.values,
             title='Sentiment Score',
             labels={'x': 'Score', 'y': 'Count'},
             width=1000, height=1000)

# Show the plot
fig.show()

# 1. NLTK Framework

In [15]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Create an instance of SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()


The twython library has not been installed. Some functionality from the twitter package will not be available.



In [16]:
testing_data = data_selected['text'][100]

sia.polarity_scores(testing_data)

{'neg': 0.0, 'neu': 0.97, 'pos': 0.03, 'compound': 0.3818}

In [17]:
# Run the polarity score on the entire dataset

res = {}
for i, row in tqdm(data_selected.iterrows(), total=len(data_selected)):
    text = row['text']
    myid = row['id']
    res[myid] = sia.polarity_scores(text)

  0%|          | 0/200 [00:00<?, ?it/s]

In [22]:
vaders = pd.DataFrame(res).T
vaders = vaders.reset_index().rename(columns={'index': 'id'})
vaders = vaders.merge(data_selected, how='left')
vaders.head()

Unnamed: 0,id,neg,neu,pos,compound,text,label
0,0,0.025,0.865,0.111,0.877,The Euro was able to appreciate particularly s...,Positive
1,1,0.07,0.876,0.054,0.0258,EUR/CHF yesterday broke above 1.00. Economists...,Positive
2,2,0.05,0.932,0.018,-0.5204,EUR/CHF vaults parity for the first time since...,Neutral
3,3,0.084,0.781,0.135,0.7058,EUR/CHF climbs back above parity. Economists a...,Positive
4,4,0.032,0.841,0.127,0.743,EUR/CHF has broken out above the sideways rang...,Positive


## Plot VADER results

In [20]:
def plot_bar(data, x_col, y_col, title, color_negative='red', color_neutral='yellow', color_positive='blue'):
    data[x_col] = data[x_col].astype('category')
    colors = [mcolors.to_rgba(color_negative), mcolors.to_rgba(color_neutral), mcolors.to_rgba(color_positive)]
    ax = sns.barplot(data=data, x=x_col, y=y_col, ax=axs[0], color=colors)
    ax.set_title(title)

plot_bar(vaders, 'label', 'compound', 'difference from true sentiment and nltk', color_negative='red', color_neutral='yellow', color_positive='blue')
plt.tight_layout()
plt.show()

NameError: name 'mcolors' is not defined

# 2. Hugging Face Pretrained Model

In [23]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [24]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]


TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()



In [25]:
# Run for Roberta Model
encoded_text = tokenizer(testing_data, return_tensors='pt')
output = model(**encoded_text)
scores = output[0][0].detach().numpy()
scores = softmax(scores)
scores_dict = {
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
}
print(scores_dict)

{'roberta_neg': 0.1311302, 'roberta_neu': 0.7802591, 'roberta_pos': 0.0886107}


In [None]:
def polarity_scores_roberta(testing_data):
    encoded_text = tokenizer(testing_data, return_tensors='pt')
    output = model(**encoded_text)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [None]:
res = {}
for i, row in tqdm(data_selected.iterrows(), total=len(data_selected)):
    try:
        text = row['text']
        myid = row['id']
        vader_result = sia.polarity_scores(text)
        vader_result_rename = {}
        for key, value in vader_result.items():
            vader_result_rename[f"vader_{key}"] = value
        roberta_result = polarity_scores_roberta(text)
        both = {**vader_result_rename, **roberta_result}
        res[myid] = both
    except RuntimeError:
        print(f'Broke for id {myid}')

In [None]:
results_df = pd.DataFrame(res).T
results_df = results_df.reset_index().rename(columns={'index': 'id'})
results_df = results_df.merge(data_selected, how='left')

## Compare Scores between models

In [None]:
results_df.info()

# Fine Tuning Using LORA

# Comparing 2 Framework

In [None]:
sns.pairplot(data=results_df,
             vars=['vader_neg', 'vader_neu', 'vader_pos',
                  'roberta_neg', 'roberta_neu', 'roberta_pos'],
            hue='label',
            palette='tab10')
plt.show()

# Model Review using F-Score