In [23]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:
# load and read text file
df = pd.read_csv('/content/drive/MyDrive/Data Science/Advanced topic Machine Learning/Datasets/dev.tsv',delimiter= '\t')
df

Unnamed: 0,doc_id,review,rating
0,227093,i got my nexplanon in december 2013. two weeks...,1
1,219510,prescribed before and after cataract surgery w...,1
2,199407,i'm just finishing up my first month on the sh...,1
3,160884,took this medication for 4 months it didn't do...,0
4,33397,i think if you already had some underlying iss...,1
...,...,...,...
1995,220390,i have been on loestrin for about 2 months now...,0
1996,192099,i hate this pill its the second birth control ...,0
1997,39334,i am young and considered obese. i am a health...,0
1998,133858,i have used restasis for about a year now and ...,0


## Data Cleaning:
removing the duplicate

In [26]:
len(df)

2000

In [27]:
df = df.drop_duplicates()
X=df['review'].values
Y = df['rating'].values
# X=df.drop_duplicates() #drop duplicate reviews
len(X),len(Y)

(2000, 2000)

In [28]:
# Split Dataset into training and text set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


In [29]:
# Tokenize text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [30]:
# Pad sequences to ensure uniform input size
maxlen = 100
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

## Build the Bidirectional LSTM Model
Bidirectional LSTM model can capture patterns from both the beginning and the end of the sequences, providing a richer representation of the data.

In [31]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

input_dim = 10000 + 1
output_dim = 256
input_length = 100

model = Sequential()
model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
model.add(Bidirectional(LSTM(256, dropout=0.3, recurrent_dropout=0.3)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.001)  # Use Adam optimizer with a specified learning rate
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()



Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 100, 256)          2560256   
                                                                 
 bidirectional_4 (Bidirecti  (None, 512)               1050624   
 onal)                                                           
                                                                 
 dense_12 (Dense)            (None, 128)               65664     
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 dense_13 (Dense)            (None, 64)                8256      
                                                                 
 dropout_9 (Dropout)         (None, 64)                0         
                                                      

### Train the Model

In [32]:
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), batch_size=64)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Evaluating Model's Performance
After training, evaluate the performance on the test set


In [33]:
import numpy as np
from sklearn.metrics import f1_score

# Assuming model.predict(X_test) returns probabilities
y_pred_prob = model.predict(X_test)

# Convert probabilities to class labels based on a 0.5 threshold
y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# Calculate F1 Score
# If y_pred_prob is 2D (e.g., shape (n_samples, 2) for binary classification with keras), use y_pred_prob[:, 1]
if y_pred_prob.ndim > 1 and y_pred_prob.shape[1] == 2:
    y_pred = np.argmax(y_pred_prob, axis=1)

f1 = f1_score(y_test, y_pred,average='weighted')
print(f"F1 Score: {f1}")


F1 Score: 0.6993233082706769


## Saving model

In [34]:
# Save the entire model to a HDF5 file
model.save('my_sentiment_model.h5')


  saving_api.save_model(


## Loading model

In [35]:
# Load the model
from tensorflow.keras.models import load_model
model_sentiment= load_model('my_sentiment_model.h5')

## Using the traning dataset to train model

In [36]:
train_data = pd.read_csv('/content/drive/MyDrive/Data Science/Advanced topic Machine Learning/Datasets/train.tsv',delimiter= '\t')
test_data = pd.read_csv('/content/drive/MyDrive/Data Science/Advanced topic Machine Learning/Datasets/test.tsv',delimiter='\t')

train_data.head(5)
test_data.head(10)



Unnamed: 0,doc_id,review,rating
0,37451,we switched on the advice of the then new adhd...,-10
1,193385,"helps me sleep, but i don't notice much relaxi...",-10
2,216485,"sometimes use it. really good effect, my wife ...",-10
3,84065,i first started this birth control two years a...,-10
4,146801,i was prescribed these for an onset of panic a...,-10
5,125177,i decided to take dulcolax last night because ...,-10
6,213875,omg this stuff burns 100x worse than the yi!! ...,-10
7,84859,i only took this medication for the first mont...,-10
8,29397,it has been 1 year and i'm still suffering fro...,-10
9,207232,i would never take the time to do this if this...,-10


In [37]:
# replace the rating -10 to 0 in the test dataset
import pandas as pd

# Assuming 'test_data' is your DataFrame and 'rating' is the column with -10 placeholders
test_data['rating'] = test_data['rating'].replace(-10, 0)

test_data.head(5)

Unnamed: 0,doc_id,review,rating
0,37451,we switched on the advice of the then new adhd...,0
1,193385,"helps me sleep, but i don't notice much relaxi...",0
2,216485,"sometimes use it. really good effect, my wife ...",0
3,84065,i first started this birth control two years a...,0
4,146801,i was prescribed these for an onset of panic a...,0


In [38]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

x_train,y_train = train_data['review'], train_data['rating']

# tokenizer the vocabulary in review
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(x_train)

# Convert texts to sequences of integers
x_train_sequences = tokenizer.texts_to_sequences(x_train)

# Pad sequences to have the same length

x_train_padded = pad_sequences(x_train_sequences, maxlen=100, padding='post')

# traing my model
train_history = model_sentiment.fit(x_train_padded,y_train, epochs=20, batch_size=64)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Evaluation
Using the test dataset to evaluate my model's performance

In [41]:
x_text_data, y_test_data = test_data['review'], test_data['rating']

# Tokenize evaluation data
x_eval_sequences = tokenizer.texts_to_sequences(x_text_data)

# Pad sequences to have the same length
x_eval_padded = pad_sequences(x_eval_sequences, maxlen=100, padding='post')

# Predict probabilities
y_pred_prob = model.predict(x_eval_padded)

# Convert probabilities to binary labels based on a threshold
y_pred = (y_pred_prob > 0.5).astype(int)

# Calculate F1 Score
# If y_pred_prob is 2D (e.g., shape (n_samples, 2) for binary classification with keras), use y_pred_prob[:, 1]
if y_pred_prob.ndim > 1 and y_pred_prob.shape[1] == 2:
    y_pred = np.argmax(y_pred_prob, axis=1)

from sklearn.metrics import f1_score

# Calculate F1-weighted Score
f1_weighted = f1_score(y_test_data, y_pred, average='weighted')
print(f"F1-weighted Score: {f1_weighted}")



F1-weighted Score: 0.61735222951953
