In [1]:
# Data handling and numerical operations
import pandas as pd                                 # Data manipulation and analysis
import numpy as np                                  # Numerical operations

# Data visualization
import matplotlib.pyplot as plt                     # Plotting graphs
import seaborn as sns                               # Advanced data visualization

# Natural Language Processing libraries
import nltk
from nltk.corpus import stopwords                   # Stopwords in text processing
from nltk.stem.porter import PorterStemmer          # Stemming words
import re                                           # Regular expression operations
from wordcloud import WordCloud, STOPWORDS          # Generating word cloud visualizations


# Tensorflow and huggingface
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig

# Scikit-learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import RocCurveDisplay
from sklearn.pipeline import Pipeline

# Utility libraries
import chardet                                      # Detecting character encoding
from collections import Counter                     # Counting hashable items
from scipy.sparse import hstack                     # Horizontally stacking sparse matrices

from urllib.parse import urlparse                   # parsing base url values
%matplotlib inline
# Download the NLTK stopword corpus (reports "already up-to-date" when it is cached locally)
nltk.download('stopwords')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOSHIBA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import pipeline

In [3]:
# Load the FinBERT tokenizer and the FinBERT PyTorch checkpoint (converted to a
# TensorFlow model with three output labels), then wrap both in a
# sentiment-analysis pipeline.
tokenizer = BertTokenizer.from_pretrained("ProsusAI/finbert")
model = TFBertForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=3,
    from_pt=True,
)
nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [4]:
# Read the dataset from its Feather file
df = pd.read_feather(path="../Data/Data145_wostem.feather")

In [5]:
# Preview the first five rows of the loaded frame
df.head(5)

Unnamed: 0,text,sentiment,base_url
0,video i was in my office i was minding my own ...,0,twitter.com
1,the price of lumber lb f is down since hitting...,0,twitter.com
2,who says the american dream is dead,-1,buff.ly
3,barry silbert is extremely optimistic on bitco...,1,twitter.com
4,how satellites avoid attacks and space junk wh...,-1,on.forbes.com


In [None]:
# Remove rows that repeat the same (text, base_url) pair; rebinding `df`
# instead of mutating in place keeps the step explicit.
df = df.drop_duplicates(subset=['text', 'base_url'])

In [None]:
# Renumber the rows from zero and discard the previous index (drop=True)
df = df.reset_index(drop=True)

In [4]:
# Inputs: the text and its source domain; target: the sentiment column
X = df.loc[:, ['text', 'base_url']]
Y = df.loc[:, 'sentiment']

In [6]:
# Join each text with its base_url, separated by the literal ' <EOF> ' tag
X_combined = X['text'].add(' <EOF> ').add(X['base_url'])
X_combined

In [None]:
# Inspect the sentiment labels
Y

In [8]:
# Translate the numeric sentiment codes in df['sentiment'] into string labels
sentiment_mapping = {
    1: 'positive',
    -1: 'negative',
    0: 'neutral',
}
Y = df['sentiment'].map(sentiment_mapping)

In [None]:
# Inspect the labels after mapping them to strings
Y

In [None]:
# Materialise the combined inputs as a plain Python list for the pipeline
# (equivalent to appending every element in a loop).
sentences = list(X_combined)
# Show only a short preview: echoing the whole list would flood the cell output.
sentences[:5]

In [None]:
# Run the sentiment pipeline on the first ten sentences and print its raw output
results = nlp(sentences[:10])
print(results)

In [None]:
# Keep only the 'label' entry of every pipeline result
predicted_labels = [prediction['label'] for prediction in results]

In [None]:
# Inspect the extracted labels
predicted_labels

In [None]:
# Per-class precision / recall / F1 of the predictions against the first ten true labels
print(classification_report(y_true=Y[:10], y_pred=predicted_labels))

              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         2
     neutral       0.62      0.83      0.71         6
    positive       1.00      0.50      0.67         2

    accuracy                           0.60        10
   macro avg       0.54      0.44      0.46        10
weighted avg       0.57      0.60      0.56        10



In [None]:
# Confusion matrix of the first ten true labels against the predictions.
# The class order is fixed explicitly so the matrix is always 3x3 and in
# negative / neutral / positive order, even when a class does not occur in
# this small sample (without `labels`, only the classes present are used).
cm = confusion_matrix(
    Y[0:10],
    predicted_labels,
    labels=['negative', 'neutral', 'positive'],
)
cm

In [None]:
# helper that draws a confusion matrix as a heatmap
def plot_conf_mat(cm, class_names=('Negative', 'Neutral', 'Positive')):
    """Draw a confusion matrix as an annotated heatmap on a new figure.

    Parameters
    ----------
    cm : array-like
        Confusion matrix to display. The y axis is labelled as the actual
        class and the x axis as the predicted class.
    class_names : sequence of str, optional
        Tick labels, in the same order as the rows/columns of ``cm``.
        They are applied only when their count matches the shape of ``cm``;
        otherwise the default positional labels are shown.

    Returns
    -------
    None
        The figure is created as a side effect.
    """
    class_names = list(class_names)
    cm_frame = pd.DataFrame(cm)
    # Attach the names to the frame itself: sns.heatmap takes its tick labels
    # from the index/columns, so ticks set through plt.xticks/plt.yticks
    # before the heatmap call are overwritten and never displayed.
    if cm_frame.shape == (len(class_names), len(class_names)):
        cm_frame.index = class_names
        cm_frame.columns = class_names

    fig, ax = plt.subplots()
    # Draw on the axes created above rather than on the implicit current axes
    sns.heatmap(cm_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
    ax.xaxis.set_label_position("top")
    fig.tight_layout()
    ax.set_title('Confusion matrix', y=1.1)
    ax.set_ylabel('Actual label')
    ax.set_xlabel('Predicted label')

In [None]:
# Render the confusion matrix `cm` as a heatmap
plot_conf_mat(cm)