<a href="https://colab.research.google.com/github/i-bukhari/Sentiment-Analysis-Lyrics/blob/main/Embedding_GRU_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook demonstrates the implementation of a sentiment analysis model on song lyrics using a combination of Word Embeddings, Gated Recurrent Unit (GRU), and Long short-term memory (LSTM) architectures. It includes data preprocessing steps, model training, and evaluation to classify the lyrics as positive or negative based on valence scores.

## Import libraries

In [1]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=4a5b09d2228c3946c0f4b57aefb26034648acc8334e97aff34b55faa6bd26085
  Stored in directory: /root/.cache/pip/wheels/95/03/7d/59ea870c70ce4e5a370638b5462a7711ab78fba2f655d05106
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [16]:
from langdetect import detect

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score

from textblob import TextBlob

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GRU
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

## Data Preprocessing

###Removing Empty Lyrics and Non-english Lyrics

In [3]:
def is_english(text):
    try:
        lang = detect(text)
        return lang == 'en'
    except:

        return False
def lang(text):
    if is_english(text):
        return 1
    else:
        return
def positivity(valence_score): #9-point scale valence score
        if valence_score<=4.5:
            return 0
        else:
            return 1

original_df = pd.read_csv('muse_v3_with_lyrics_filled.csv') #upload csv manually, data scraped and extracted into csv
data = original_df[['lyric','valence_tags']]
data = data.dropna(subset=['lyric'])
data['lang'] = data['lyric'].apply(lang)
data = data.dropna(subset=['lang'])
data['positivity'] = data['valence_tags'].apply(positivity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['positivity'] = data['valence_tags'].apply(positivity)


In [4]:
def modify_sent(input_string):
    lines = input_string.split('\n')
    output_string = '. '.join(lines)
    return output_string

data['lyrics'] = data['lyric'].apply(modify_sent)

In [5]:
###### BALANCE DATA #######

negative_data = data[data['positivity'] == 0]
num_negative = len(negative_data)

positive_data = data[data['positivity'] == 1]
positive_sampled = positive_data.sample(n=num_negative, replace=False, random_state=42)

balanced_data = pd.concat([negative_data, positive_sampled])
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)
balanced_data = balanced_data.drop(columns=['lyric','lang'])

In [6]:
balanced_data

Unnamed: 0,valence_tags,positivity,lyrics
0,6.184659,1,"If I could make a wish, I think I'd pass. Can'..."
1,1.964286,0,"Magician, magician, take me upon your wings. A..."
2,7.012289,1,[Verse 1]. You wanna know if I know why?. I ca...
3,2.513333,0,"Yes I've been waiting, but you just don't come..."
4,5.802817,1,"[Intro]. Mmmh, pacify. Mmmh, pacify. Mmmh. Cla..."
...,...,...,...
16845,6.843000,1,[Verse 1]. After all these implements and text...
16846,7.345556,1,"[Intro]. Oh, ooh. Erica Kane. . [Verse 1]. She..."
16847,3.940000,0,Open the lid of the chest of Man. Let the drea...
16848,2.530000,0,"Close your eyes, we're coming down. Close your..."


In [7]:
def lyric_sentiment(lyrics):
  sentence = lyrics.split(". ")

  score = []
  threshold = 0

  for i in sentence:
    score.append(TextBlob(i).sentiment.polarity)

  important_score = [i for i in score if i!=0]

  if np.mean(important_score) > threshold :
    return 1

  else :
    return 0

In [8]:
df_copy_b = balanced_data.copy()
df_copy_b['polarity'] = df_copy_b['lyrics'].apply(lyric_sentiment)

  return _methods._mean(a, axis=axis, dtype=dtype,


In [9]:
df_copy_b

Unnamed: 0,valence_tags,positivity,lyrics,polarity
0,6.184659,1,"If I could make a wish, I think I'd pass. Can'...",1
1,1.964286,0,"Magician, magician, take me upon your wings. A...",1
2,7.012289,1,[Verse 1]. You wanna know if I know why?. I ca...,0
3,2.513333,0,"Yes I've been waiting, but you just don't come...",1
4,5.802817,1,"[Intro]. Mmmh, pacify. Mmmh, pacify. Mmmh. Cla...",1
...,...,...,...,...
16845,6.843000,1,[Verse 1]. After all these implements and text...,0
16846,7.345556,1,"[Intro]. Oh, ooh. Erica Kane. . [Verse 1]. She...",0
16847,3.940000,0,Open the lid of the chest of Man. Let the drea...,0
16848,2.530000,0,"Close your eyes, we're coming down. Close your...",0


# Model analysis based on "Positivity" score

"Positivity" score extracted from "Valenence Tags" and Song Lyrics

## TF-IDF

In [None]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(tokens)

balanced_data['processed_lyrics'] = balanced_data['lyrics'].apply(preprocess_text)

In [None]:
# Vectorizing the text with new parameters
tfidf = TfidfVectorizer(max_df=0.7, min_df=5, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(balanced_data['processed_lyrics'])

### Random Forest + TFIDF

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, balanced_data['positivity'], test_size=0.3, random_state=42)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.60      0.62      0.61      2554
           1       0.60      0.58      0.59      2500

    accuracy                           0.60      5054
   macro avg       0.60      0.60      0.60      5054
weighted avg       0.60      0.60      0.60      5054

Accuracy Score: 0.5989315393747526


## GloVe Embedding

### Random Forest

In [None]:
# Load GloVe embeddings
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

def text_to_mean_vector(text, embeddings):
    words = word_tokenize(text.lower())
    vectors = [embeddings.get(word, np.zeros(300)) for word in words]  # Adjusted the dimension to 300
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)


if 'processed_lyrics' not in balanced_data.columns:
    print("Error: 'processed_lyrics' column is missing")
else:

    balanced_data['mean_embedding'] = balanced_data['processed_lyrics'].apply(lambda x: text_to_mean_vector(x, glove_embeddings))

    if balanced_data['mean_embedding'].isnull().any():
        print("Error: Missing embeddings in data")
    if 'positivity' not in balanced_data.columns:
        print("Error: 'positivity' labels are missing")
    else:
        X = np.array(balanced_data['mean_embedding'].tolist())
        y = balanced_data['positivity'].values

        print("Length of X:", len(X))
        print("Length of y:", len(y))

        if len(X) == len(y):

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


            rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
            rf_model.fit(X_train, y_train)


            y_pred = rf_model.predict(X_test)
            report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
            print("Classification Report with Random Forest and GloVe Embeddings: \n", report)
        else:
            print("Error: The lengths of X and y do not match.")


Length of X: 16844
Length of y: 16844
Classification Report with Random Forest and GloVe Embeddings: 
               precision    recall  f1-score   support

    Negative       0.58      0.61      0.60      1688
    Positive       0.59      0.56      0.58      1681

    accuracy                           0.59      3369
   macro avg       0.59      0.59      0.59      3369
weighted avg       0.59      0.59      0.59      3369



### LSTM + GloVe Embeddings

#### Learning Rate = 0.001

In [None]:
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict


glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')


vocab_size = 10000  # Adjust as per your vocabulary
embedding_dim = 300  # GloVe vectors dimension
max_length = 100    # Length of each input sequence


tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(balanced_data['lyrics'])


sequences = tokenizer.texts_to_sequences(balanced_data['lyrics'])


padded_sequences = pad_sequences(sequences, maxlen=max_length)


embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


learning_rate = 0.001  # Change as needed

optimizer = Adam(learning_rate=learning_rate)

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


# Model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


labels = balanced_data['positivity'].values

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f5f143ce320>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype('int32')

from sklearn.metrics import classification_report, f1_score

# Calculate F1 score and print classification report
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print("Detailed Classification Report: \n", report)

Test Accuracy: 55.39%
F1 Score: 0.55
Detailed Classification Report: 
               precision    recall  f1-score   support

    Negative       0.55      0.56      0.56      1688
    Positive       0.55      0.55      0.55      1681

    accuracy                           0.55      3369
   macro avg       0.55      0.55      0.55      3369
weighted avg       0.55      0.55      0.55      3369



#### Learning Rate = 0.01

In [None]:
# Load GloVe embeddings
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

vocab_size = 10000  # Adjust as per your vocabulary
embedding_dim = 300  # GloVe vectors dimension
max_length = 100    # Length of each input sequence


tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(balanced_data['lyrics'])


sequences = tokenizer.texts_to_sequences(balanced_data['lyrics'])


padded_sequences = pad_sequences(sequences, maxlen=max_length)


embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector



learning_rate = 0.01  # Change as needed

optimizer = Adam(learning_rate=learning_rate)

# Model architecture remains the same
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
model.add(LSTM(128, return_sequences=True))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))


model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])


labels = balanced_data['positivity'].values


X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)


model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f5f20ccbe20>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")


y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype('int32')

from sklearn.metrics import classification_report, f1_score

f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print("Detailed Classification Report: \n", report)

Test Accuracy: 55.92%
F1 Score: 0.56
Detailed Classification Report: 
               precision    recall  f1-score   support

    Negative       0.56      0.55      0.56      1688
    Positive       0.56      0.57      0.56      1681

    accuracy                           0.56      3369
   macro avg       0.56      0.56      0.56      3369
weighted avg       0.56      0.56      0.56      3369



### Logistic Regression + GloVe Embeddings

In [None]:
def text_to_mean_vector(text, embeddings):
    words = word_tokenize(text.lower())
    vectors = [embeddings.get(word, np.zeros(300)) for word in words]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

if 'processed_lyrics' not in balanced_data.columns:
    print("Error: 'processed_lyrics' column is missing")
else:
    balanced_data['mean_embedding'] = balanced_data['processed_lyrics'].apply(lambda x: text_to_mean_vector(x, glove_embeddings))

    if balanced_data['mean_embedding'].isnull().any():
        print("Error: Missing embeddings in data")
    if 'positivity' not in balanced_data.columns:
        print("Error: 'positivity' labels are missing")
    else:
        X = np.array(balanced_data['mean_embedding'].tolist())
        y = balanced_data['positivity'].values

        print("Length of X:", len(X))
        print("Length of y:", len(y))

        if len(X) == len(y):
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

            log_reg_model = LogisticRegression(max_iter=1000)
            log_reg_model.fit(X_train, y_train)

            y_pred = log_reg_model.predict(X_test)
            report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
            print("Classification Report with Logistic Regression and GloVe Embeddings: \n", report)
        else:
            print("Error: The lengths of X and y do not match.")


Length of X: 16844
Length of y: 16844
Classification Report with Logistic Regression and GloVe Embeddings: 
               precision    recall  f1-score   support

    Negative       0.63      0.60      0.61      1688
    Positive       0.61      0.65      0.63      1681

    accuracy                           0.62      3369
   macro avg       0.62      0.62      0.62      3369
weighted avg       0.62      0.62      0.62      3369



## GRU Model  - Positivity

In [None]:
# Load GloVe embeddings
def load_glove_embeddings(path):
    embeddings_dict = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
    return embeddings_dict

glove_embeddings = load_glove_embeddings('glove.6B.300d.txt')

# Parameters
vocab_size = 10000  # Choose based on your vocabulary size
max_length = 100    # Length of each padded sequence

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(balanced_data['lyrics'])

sequences = tokenizer.texts_to_sequences(balanced_data['lyrics']) # Convert texts to sequences

padded_sequences = pad_sequences(sequences, maxlen=max_length)

# Create an embedding matrix
embedding_dim = 300  # Dimensionality of GloVe vectors
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < vocab_size:
        embedding_vector = glove_embeddings.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
model.add(GRU(128, return_sequences=True))  # GRU layer with 128 units
model.add(GRU(128))  # Another GRU layer; adjust the number of units as necessary
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

labels = balanced_data['positivity'].values

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 300)          3000000   
                                                                 
 gru_4 (GRU)                 (None, 100, 128)          165120    
                                                                 
 gru_5 (GRU)                 (None, 128)               99072     
                                                                 
 dense_6 (Dense)             (None, 1)                 129       
                                                                 
Total params: 3264321 (12.45 MB)
Trainable params: 264321 (1.01 MB)
Non-trainable params: 3000000 (11.44 MB)
_________________________________________________________________
None
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epo

<keras.src.callbacks.History at 0x7f5ec1edd330>

In [None]:
# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

from sklearn.metrics import classification_report, f1_score

# Generate predictions
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype('int32')  # Convert probabilities to binary labels

# Calculate F1 score
f1 = f1_score(y_test, y_pred)  # Default is binary F1
print(f"F1 Score: {f1:.2f}")

# Classification report
report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print("Detailed Classification Report: \n", report)

Test Accuracy: 54.70%
F1 Score: 0.53
Detailed Classification Report: 
               precision    recall  f1-score   support

    Negative       0.54      0.59      0.57      1688
    Positive       0.55      0.50      0.53      1681

    accuracy                           0.55      3369
   macro avg       0.55      0.55      0.55      3369
weighted avg       0.55      0.55      0.55      3369



# Using Polararity from TextBlob

##GRU Model - Polarity

In [10]:
vocab_size = 10000  # Choose based on your vocabulary size
max_length = 100    # Length of each padded sequence

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df_copy_b['lyrics'])

sequences = tokenizer.texts_to_sequences(df_copy_b['lyrics'])

padded_sequences = pad_sequences(sequences, maxlen=max_length)

In [18]:
# Model architecture
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, embeddings_initializer=Constant(embedding_matrix), input_length=max_length, trainable=False))
model.add(GRU(128, return_sequences=True))  # GRU layer with 128 units
model.add(GRU(128))  # Another GRU layer; adjust the number of units as necessary
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

labels = df_copy_b['polarity'].values

# Split
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Train
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 300)          3000000   
                                                                 
 gru_2 (GRU)                 (None, 100, 128)          165120    
                                                                 
 gru_3 (GRU)                 (None, 128)               99072     
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 3264321 (12.45 MB)
Trainable params: 264321 (1.01 MB)
Non-trainable params: 3000000 (11.44 MB)
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fc227d95390>

In [19]:
# Evaluate on the test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy*100:.2f}%")

Test Accuracy: 64.01%


In [17]:
# Generate predictions
y_pred_probs = model.predict(X_test)
y_pred = (y_pred_probs > 0.5).astype('int32')  # Convert probabilities to binary labels

# Calculate F1 score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1:.2f}")

report = classification_report(y_test, y_pred, target_names=['Negative', 'Positive'])
print("Detailed Classification Report: \n", report)

F1 Score: 0.78
Detailed Classification Report: 
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      1213
    Positive       0.64      1.00      0.78      2157

    accuracy                           0.64      3370
   macro avg       0.32      0.50      0.39      3370
weighted avg       0.41      0.64      0.50      3370



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
