In [4]:
import pandas as pd
import numpy as np
import spacy

In [6]:
df=pd.read_csv('/content/daily_news_headlines.csv')

In [7]:
!python -m spacy download en_core_web_sm
nlp=spacy.load('en_core_web_sm', disable=['ner','parser'])

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
def tokenize(doc):
    """Return the lower-cased lemmas of the meaningful tokens in ``doc``.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is dropped when it is a stop word, is punctuation, is whitespace, or is
    two characters long or shorter.

    Parameters
    ----------
    doc : str
        Raw text to tokenize.

    Returns
    -------
    list of str
        Lower-cased lemma of every kept token, in document order.
    """
    lemmas = []
    for token in nlp(doc):
        # Discard stop words and punctuation first.
        if token.is_stop or token.is_punct:
            continue
        # Then discard very short tokens and pure-whitespace tokens.
        if len(token) <= 2 or token.is_space:
            continue
        lemmas.append(token.lemma_.lower())
    return lemmas

In [9]:
!pip install -U gensim==4.*



In [10]:
import collections
from gensim.models.keyedvectors import KeyedVectors

In [11]:
!pip install gdown

!gdown "https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku"
embedding_file = './GoogleNews-vectors-negative300.bin.gz'

Downloading...
From (original): https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku
From (redirected): https://drive.google.com/uc?id=1BpfbHu4denceXiv8yfdY3EHgjKIcULku&confirm=t&uuid=a1826891-d71a-45fd-a8b3-2d1d46fba459
To: /content/GoogleNews-vectors-negative300.bin.gz
100% 1.65G/1.65G [00:37<00:00, 43.5MB/s]


In [12]:
# Load the pretrained word2vec vectors from the binary file at `embedding_file`.
# `limit=200000` caps how many word vectors are read from the file, so any word
# past that cap is not present in `word_vectors`.
word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True,limit=200000)

In [13]:
def sentence_vectorizer(doc):
    """Turn each statement into the sum of its tokens' word vectors.

    Every statement is tokenized with ``tokenize`` and the embedding of each
    token found in the module-level ``word_vectors`` is added up.

    Parameters
    ----------
    doc : iterable of str
        Statements to vectorize (for example a pandas Series of headlines).

    Returns
    -------
    list of numpy.ndarray
        One 1-D vector per statement, in input order, each of length
        ``word_vectors.vector_size``. A statement with no tokens, or whose
        tokens are all missing from ``word_vectors``, yields an all-zero vector.
    """
    # Read the dimensionality from the loaded embeddings instead of hard-coding
    # 300, so a different embedding file keeps this function working.
    vector_size = word_vectors.vector_size
    statement_vectors = []

    for statement in doc:
        # Running sum of the word vectors for the current statement.
        sum_vector = np.zeros((vector_size,))
        for word in tokenize(statement):
            try:
                sum_vector += word_vectors[word]
            except KeyError:
                # Out-of-vocabulary word: deliberately contributes nothing.
                continue
        statement_vectors.append(sum_vector)

    return statement_vectors

In [14]:
print(df.Headline)

0       New government policies expected to boost market
1      Market volatility continues amid political unc...
2      Positive market trends benefit tech sector stocks
3      CEO resignation leads to stock price drop for ...
4          Company C reports record profits, stock rises
                             ...                        
360     New government policies expected to boost market
361    CEO resignation leads to stock price drop for ...
362        Economic slowdown expected to hit tech stocks
363     New government policies expected to boost market
364    CEO resignation leads to stock price drop for ...
Name: Headline, Length: 365, dtype: object


In [15]:
sentence_vectorizer(df.Headline)

[array([-4.03015137e-01,  2.43286133e-01, -1.46621704e-01,  2.59765625e-01,
        -3.94409180e-01, -8.22753906e-01,  7.81250000e-03, -1.49047852e-01,
         4.18212891e-01,  1.72607422e-01, -4.61425781e-01, -3.73046875e-01,
        -5.57861328e-02,  1.88232422e-01, -1.45336914e+00,  4.72656250e-01,
         1.31103516e-01,  7.27722168e-01,  9.91210938e-02, -5.76049805e-01,
        -1.93115234e-01,  6.22726440e-01,  3.72802734e-01, -1.59912109e-01,
         3.35937500e-01,  3.74084473e-01, -2.56347656e-02,  6.70074463e-01,
         7.24609375e-01,  7.10449219e-01, -4.72412109e-02, -8.04321289e-01,
        -5.39550781e-01, -2.99377441e-01, -2.00439453e-01, -4.05853271e-01,
         1.55517578e-01,  1.42089844e-01,  4.66552734e-01,  4.32922363e-01,
         3.15551758e-01, -7.59521484e-01,  4.91455078e-01,  5.68847656e-01,
        -4.78507996e-01, -1.14013672e+00,  1.56982422e-01,  4.18090820e-01,
        -4.80224609e-01,  1.52832031e-01,  9.10156250e-01, -4.84619141e-02,
        -2.7

In [16]:
preprocessed_trained_corpus=sentence_vectorizer(df['Headline'])

In [17]:
import spacy
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn import model_selection
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [18]:
# Split the headline vectors and their 'Sentiment' labels into training (80%) and
# validation (remaining 20%) sets. `random_state=1` fixes the shuffle so the split
# is the same on every run.
train_data, val_data, train_labels, val_labels = train_test_split(preprocessed_trained_corpus, df['Sentiment'], train_size=0.80,random_state=1)

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
# One-hot encoder that returns a dense NumPy array rather than a sparse matrix.
# scikit-learn 1.2 renamed the `sparse` argument to `sparse_output` and 1.4 removed
# the old name, so try the new keyword first and fall back for older installs.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

In [24]:
train_labels=np.array(train_labels)

In [25]:
train_labels = train_labels.reshape(-1, 1)

# Fit and transform labels
train_labels = encoder.fit_transform(train_labels)



In [26]:
train_labels

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0

In [27]:
val_labels=np.array(val_labels)

In [28]:
# Reshape the 1-D label array into a single column, as the encoder expects 2-D input.
val_labels = val_labels.reshape(-1, 1)

# Encode the validation labels with the encoder that was already fitted on the
# training labels (transform only; the encoder is not re-fitted here).
val_labels = encoder.transform(val_labels)

In [29]:
train_data=np.array(train_data)

In [30]:
val_data=np.array(val_data)

In [32]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Width of the third hidden Dense layer.
NUM_UNITS = 128

# "set_seed" is called to ensure we get the same weights every time. Comment out this
# line to get different weight initializations.
tf.random.set_seed(0)

# Feed-forward classifier: three ReLU hidden layers (256, 128, NUM_UNITS units)
# followed by a 3-unit softmax output, one unit per one-hot label column.
# "kernel_initializer" is passed to ensure we get the same weights every time. Remove
# the parameter to get different weight initializations.
# Note: every layer builds its initializer with the same seed (seed=1).
model = keras.Sequential([
  layers.Dense(256, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)),
  layers.Dense(128, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)),
  layers.Dense(NUM_UNITS, activation='relu', kernel_initializer=tf.keras.initializers.random_normal(seed=1)),
  layers.Dense(3, activation='softmax', kernel_initializer=tf.keras.initializers.random_normal(seed=1))
]
)

# Compile model with the Adam optimizer and report accuracy during training.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',  # Use categorical_crossentropy for one-hot encoded labels
              metrics=['accuracy'])

# Training schedule: number of passes over the training set and samples per gradient step.
NUM_EPOCHS = 20
BATCH_SIZE = 128

# Train model on the training split and evaluate on the validation split after each epoch.
# The per-epoch metrics are kept in `history`.
# NOTE(review): the commented-out call below is an earlier variant; it refers to
# `one_hot_labels`, which is not defined in the visible notebook. Candidate for removal.
#model.fit(np.array(preprocessed_trained_corpus),np.array(one_hot_labels), epochs=10, batch_size=32,validation_split=0.2)
history = model.fit((train_data),(train_labels), epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=((val_data), (val_labels)))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [33]:
y_pred=model.predict(val_data)



In [42]:
y_pred

array([[0.39352766, 0.24144106, 0.36503124],
       [0.4183187 , 0.39425275, 0.18742853],
       [0.41324458, 0.3579872 , 0.22876807],
       [0.41324458, 0.3579872 , 0.22876807],
       [0.34245345, 0.36084202, 0.2967045 ],
       [0.30754408, 0.17143592, 0.52102005],
       [0.30754408, 0.17143592, 0.52102005],
       [0.34245345, 0.36084202, 0.2967045 ],
       [0.3481382 , 0.3274491 , 0.3244126 ],
       [0.30754408, 0.17143592, 0.52102005],
       [0.4863014 , 0.26095766, 0.2527409 ],
       [0.23067912, 0.49775258, 0.2715684 ],
       [0.3481382 , 0.3274491 , 0.3244126 ],
       [0.37364373, 0.18828548, 0.43807083],
       [0.23067912, 0.49775258, 0.2715684 ],
       [0.4863014 , 0.26095766, 0.2527409 ],
       [0.23067912, 0.49775258, 0.2715684 ],
       [0.4183187 , 0.39425275, 0.18742853],
       [0.34245345, 0.36084202, 0.2967045 ],
       [0.39352766, 0.24144106, 0.36503124],
       [0.30754408, 0.17143592, 0.52102005],
       [0.37364373, 0.18828548, 0.43807083],
       [0.

In [51]:
# Collapse each row of predicted class probabilities into a one-hot row that marks
# the most probable class.
# argmax selects exactly one class per row (the first one on a tie), and indexing the
# identity matrix writes an exact 1.0. The previous `x*(1/x) if x==m else 0` form
# could mark several classes when probabilities tie, could round to a value slightly
# different from 1.0, and would produce NaN if the row maximum were 0.
y_pred = np.eye(y_pred.shape[1], dtype=y_pred.dtype)[np.argmax(y_pred, axis=1)]

In [52]:
y_pred

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0

In [47]:
pip install scikit-learn




In [53]:
from sklearn.metrics import f1_score


In [59]:
f1=f1_score(val_labels, y_pred, average='weighted')

In [60]:
f1

0.408210913448625

In [76]:
from sklearn.metrics import recall_score


In [78]:
recall=recall_score(val_labels,y_pred,average='weighted')

In [79]:
recall

0.410958904109589

In [85]:
from sklearn.metrics import multilabel_confusion_matrix


In [87]:
mcm = multilabel_confusion_matrix(val_labels, y_pred)

In [88]:
# Print confusion matrix for each label
for i, cm in enumerate(mcm):
    print(f"Confusion Matrix for label {i}:")
    print(cm)
    print()

Confusion Matrix for label 0:
[[20 23]
 [15 15]]

Confusion Matrix for label 1:
[[41 11]
 [14  7]]

Confusion Matrix for label 2:
[[42  9]
 [14  8]]



In [94]:
# `multilabel_accuracy_score` is neither defined nor imported in the visible notebook
# and is not a scikit-learn function, so the original call fails on a fresh kernel.
# `accuracy_score` on one-hot (label-indicator) arrays computes subset accuracy: the
# fraction of rows whose predicted one-hot vector equals the true one exactly.
from sklearn.metrics import accuracy_score

accuracy=accuracy_score(val_labels,y_pred)

In [95]:
accuracy

0.410958904109589