<a href="https://colab.research.google.com/github/lilylusvardii/Final_Project_Code/blob/main/practical_work_w21010751_KV6003.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Upgrade transformers and tf_keras in the notebook environment.
!pip install --upgrade transformers
!pip install --upgrade tf_keras
import os
# Sets the TF_USE_LEGACY_KERAS flag for this process.
# NOTE(review): presumably this has to happen before tensorflow is first imported — confirm.
os.environ['TF_USE_LEGACY_KERAS'] = '1'
# Print the installed package details so the environment used for a run is recorded.
!pip show tensorflow transformers tf_keras

In [None]:
# langdetect plus the Italian and English spaCy pipelines that the cells below load by name.
!pip install langdetect
!python -m spacy download it_core_news_sm
!python -m spacy download en_core_web_sm

Pre-trained DistilBERT model fine-tuned on the reviews dataset. Unit tests for the pipeline are defined in this cell as well.

In [None]:
import time
import pandas as pd
import tensorflow as tf
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
import unittest
import sys

#unit testing
class unitTesting(unittest.TestCase):
    def setUp(self):
        self.ds = pd.DataFrame({
            'text': ['Awful experience!!!', 'Molto Bello.', 'It was okay, I Guess?'],
            'rating': [1, 5, 3]
        })
    @staticmethod
    def toSent(rating):
        if rating <= 2:
            return 0  #negative
        elif rating == 3:
            return 1  #neutral
        else:
            return 2  #positive

    def preprocessing(self, train_texts, test_texts, train_labels, test_labels, tokeniser=None):
        if tokeniser:
            MAX_LENGTH = 512
            train_encodings = tokeniser(train_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="tf")
            test_encodings = tokeniser(test_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="tf")
            train_labels = tf.convert_to_tensor(train_labels.tolist())
            test_labels = tf.convert_to_tensor(test_labels.tolist())
            return train_encodings, test_encodings, train_labels, test_labels
        else:
            return train_texts, test_texts, train_labels, test_labels

    def test_mapping(self):
        self.assertEqual(unitTesting.toSent(1), 0)  #negative
        self.assertEqual(unitTesting.toSent(5), 2)  #positive
        self.assertEqual(unitTesting.toSent(3), 1)  #neutral

    def test_preprocess(self):
        self.ds['sentiment'] = self.ds['rating'].apply(unitTesting.toSent)
        train_texts, test_texts, train_labels, test_labels = train_test_split(self.ds['text'], self.ds['sentiment'], test_size=0.3)
        train_texts, test_texts, train_labels, test_labels = self.preprocessing(train_texts, test_texts, train_labels, test_labels)
        self.assertEqual(len(train_texts), 2)
        self.assertEqual(len(test_texts), 1)

    def test_model(self):
        self.ds['sentiment'] = self.ds['rating'].apply(unitTesting.toSent)
        train_texts, test_texts, train_labels, test_labels = train_test_split(self.ds['text'], self.ds['sentiment'], test_size=0.3)
        tokeniser = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        train_encodings, test_encodings, train_labels, test_labels = self.preprocessing(train_texts, test_texts, train_labels, test_labels, tokeniser=tokeniser)
        classes = 3
        BERTmodel = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=classes)
        optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
        loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
        metrics = ['accuracy']
        BERTmodel.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        trained = BERTmodel.fit(
            {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
            train_labels,
            validation_data=(
                {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
                test_labels
            ),
            batch_size=32,
            epochs=3
        )
        self.assertIsNotNone(trained)


def main_model():
    """Fine-tune DistilBERT on reviews.csv, report test-set metrics and save the model to Google Drive.

    Reads the 'text' and 'rating' columns of reviews.csv, maps ratings to the
    class ids 0/1/2 with unitTesting.toSent, splits 70/30, fine-tunes for three
    epochs, prints a classification report, shows a confusion matrix and writes
    the model to the mounted Drive. Returns nothing.

    The elapsed time printed near the end covers everything from the start of
    the function up to that point, including evaluation and plotting.
    """
    start = time.time() #timing training time
    print("started timer!")

    #loading dataset
    ds = pd.read_csv('reviews.csv', encoding='utf-8')
    #rows with no review or no rating are unusable: a missing text cannot be tokenised and
    #a NaN rating fails both comparisons in toSent, so it would be labelled positive
    ds = ds.dropna(subset=['text', 'rating'])
    print("dataset loaded!")
    #mapping ratings to sentiments
    ds['sentiment'] = ds['rating'].apply(unitTesting.toSent)
    print("sentiments mapped!")

    #data split into training/testing 70/30
    train_texts, test_texts, train_labels, test_labels = train_test_split(ds['text'].astype(str), ds['sentiment'], test_size=0.3, random_state=42)

    #loading bert tokeniser
    tokeniser = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    #preprocessing
    MAX_LENGTH = 512
    train_encodings = tokeniser(train_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="tf")
    test_encodings = tokeniser(test_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="tf")
    #converting to tensor
    train_labels = tf.convert_to_tensor(train_labels.tolist())
    test_labels = tf.convert_to_tensor(test_labels.tolist())

    #loading bert model
    classes = 3
    BERTmodel = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=classes)
    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metrics = ['accuracy']
    BERTmodel.compile(optimizer=optimizer, loss=loss, metrics=metrics)

    #training/finetuning model
    BERTmodel.fit(
        {'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask']},
        train_labels,
        validation_data=(
            {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
            test_labels
        ),
        batch_size=16,
        epochs=3
    )
    print("training complete!")

    #making predictions
    predictions = BERTmodel.predict(test_encodings)
    print("predictions have been made!")
    #the first element of the model output is arg-maxed once and reused for both metrics
    predicted_labels = predictions[0].argmax(axis=1)
    #plain numpy labels for scikit-learn
    true_labels = test_labels.numpy()
    # Printing evaluation metrics
    print(classification_report(true_labels, predicted_labels))
    #confusion matrix - the label order is fixed so the rows/columns always match the tick labels below
    confusion = confusion_matrix(true_labels, predicted_labels, labels=[0, 1, 2])
    plt.figure(figsize=(8,8))
    sns.heatmap(confusion, annot=True, fmt="d", cmap='PiYG', xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.title('confusion matrix')
    plt.show()

    end = time.time()  #ending training timer
    print(f"training time in secs= {end - start}") #outputs in seconds

    #saving model for future use using .keras
    from google.colab import drive
    drive.mount('/content/drive') #mounting google drive
    #save path
    #NOTE(review): Hugging Face documents save_pretrained()/from_pretrained() for persisting its
    #models - confirm that a .keras file written here can be loaded back as expected
    filepath = '/content/drive/My Drive/DistilBERT_FineTuned.keras'
    BERTmodel.save(filepath)
    print("model saved successfully in google drive!")

#entry point for this cell: the unit tests have already been run, so only main_model() is executed
#the commented-out lines keep the alternative that runs the unit tests; uncomment them to restore it
if __name__ == '__main__':
    #if len(sys.argv) > 1 and sys.argv[1] == 'run-main':
        main_model()
    #else:
        #unittest.main(argv=['first-arg-is-ignored'], exit=False)

LSTM model trained and saved to Google Drive. Unit tests for the pipeline are defined in this cell as well.

In [None]:
import time
import numpy as np
import pandas as pd
import spacy
from langdetect import detect
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from keras.utils import to_categorical
from tensorflow.keras.models import save_model
import unittest
import sys


#unit testing
class unitTesting(unittest.TestCase):
    def setUp(self):
        #setup DataFrame with sample data
        self.ds = pd.DataFrame({
            'text': ['Awful experience!!!', 'Molto Bello.', 'It was okay, I Guess?'],
            'rating': [1, 5, 3]
        })

        #loading spacy models
        self.ENspacy = spacy.load('en_core_web_sm')
        self.ITspacy = spacy.load('it_core_news_sm')

        #applying preprocessing
        self.ds['preprocessed'] = self.ds['text'].apply(self.preprocess)

        #tokensing and padding
        tokeniser = Tokenizer(num_words=5000)
        tokeniser.fit_on_texts(self.ds['preprocessed'])
        sequences = tokeniser.texts_to_sequences(self.ds['preprocessed'])
        self.data = pad_sequences(sequences, maxlen=100)
        self.labels = np.array([self.toSent(rating) for rating in self.ds['rating']])

    def preprocess(self, text):
        try:
            #detecting language and choosing model based on this
            doc = self.ENspacy(text) if detect(text) == 'en' else self.ITspacy(text)
            tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop]
            #joining tokens back together
            return ' '.join(tokens)
        except Exception as e:
            print(f"error with preprocessing: {e}")

    @staticmethod
    def toSent(rating):
        #mapping ratings to sentiments
        if rating <= 2:
            return 0  #negative
        elif rating == 3:
            return 1  #neutral
        else:
            return 2  #positive


    def test_mapping(self):
        #testing mapping function
        self.assertEqual(self.toSent(1), 0)  #negative
        self.assertEqual(self.toSent(5), 2)  #positive
        self.assertEqual(self.toSent(3), 1)  #neutral

    def test_preprocessing(self):
        #testing preprocessing
        expected_outputs = ['awful experience', 'bello', 'okay guess']
        processed_texts = self.ds['preprocessed'].tolist()
        self.assertEqual(processed_texts, expected_outputs)

    def test_tokenisation(self):
        #testing tokenisation
        self.assertEqual(len(self.data), len(self.ds))  #data length check
        self.assertEqual(self.data.shape[1], 100)  #sequence length check

    def test_training(self):
        X_train, X_test, y_train, y_test = train_test_split(self.data, self.labels, test_size=0.3, random_state=42)
        #building LSTM model
        model = Sequential([
            Embedding(input_dim=5000, output_dim=64, input_length=100),
            LSTM(64),
            Dense(3, activation='softmax')
        ])
        #compiling model
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        #training model
        history = model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=16, epochs=3)#smaller for testing
        self.assertIsNotNone(history)  #making sure model has actually been trained
        #testing if training ran sucessfully
        results = model.evaluate(X_test, y_test)


def main_model():
    """Train the LSTM sentiment model on reviews.csv, report test-set metrics and save it to Google Drive.

    Reads the 'text' and 'rating' columns, maps ratings to negative/neutral/
    positive, cleans the text with spaCy, turns it into padded id sequences,
    trains for five epochs, prints a classification report and shows a confusion
    matrix. The model and the fitted tokeniser are both written to the mounted
    Drive. Returns nothing.

    The elapsed time printed near the end covers everything from the start of
    the function up to that point, including evaluation and plotting.
    """
    #using this time import to time how long training takes
    start = time.time()
    print ("started timer!")

    #importing data
    print ("importing data from dataset!")
    ds = pd.read_csv('reviews.csv', encoding='utf-8')
    #rows with no review or no rating are unusable: a NaN rating fails both
    #comparisons in toSent below and would be labelled positive
    ds = ds.dropna(subset=['text', 'rating'])

    #mapping ratings to sentiments
    print ("mapping sentiments")
    def toSent (rating):
        if rating <= 2:
            return 'negative'
        if rating == 3:
            return 'neutral'
        else:
            return 'positive'

    #sentiment values now in new column
    ds['sentiment'] = ds['rating'].apply(toSent)

    #preprocesing for both english and italian - using spaCy
    #loading the spacy models
    ENspacy = spacy.load('en_core_web_sm')
    ITspacy = spacy.load('it_core_news_sm')

    print("preprocessing started!")
    def preprocess(text):
        try:
            #detecting language and choosing spacy model based off this
            doc = ENspacy(text) if detect(text) == 'en' else ITspacy(text)
        except Exception as e:
            print(f"error with preprocessing= {e}")
            #there is no parsed doc to read tokens from, so the review becomes empty text
            return ''

        tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop]
        #joining the text back together
        return ' '.join(tokens)

    #applying the spacy preprocessing to dataset
    ds['preprocessed'] = ds['text'].apply(preprocess)

    #tokenising
    tokeniser = Tokenizer(num_words=5000)
    tokeniser.fit_on_texts(ds['preprocessed'])
    sequences = tokeniser.texts_to_sequences(ds['preprocessed'])
    data = pad_sequences(sequences, maxlen=100)


    #coverting labels back to numeric values again for use in model
    labels = ds['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

    #splitting training/testing data 70/30
    X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.3, random_state=42, stratify=labels)

    #one-hot encode labels
    y_train = to_categorical(y_train, num_classes=3)
    y_test = to_categorical(y_test, num_classes=3)

    #building model
    print("model is now training, nearly done!")
    model = Sequential([
        Embedding(input_dim=5000, output_dim=128, input_length=100),
        SpatialDropout1D(0.3),
        LSTM(100, dropout=0.3, recurrent_dropout=0.3),
        Dense(3, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    #training model
    model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.1)

    #evaluation metrics for LSTM
    print("evaluating model! Results soon...")
    y_pred = model.predict(X_test, batch_size=64)
    y_pred_classes = np.argmax(y_pred, axis=1)
    #the one-hot test labels are turned back into class ids once and reused
    y_true_classes = np.argmax(y_test, axis=1)
    print(classification_report(y_true_classes, y_pred_classes,  labels=[0, 1, 2], target_names=['negative', 'neutral', 'positive']))#results outputted
    #confusion matrix - the label order is fixed so the rows/columns always match the tick labels below
    confusion = confusion_matrix(y_true_classes, y_pred_classes, labels=[0, 1, 2])
    plt.figure(figsize=(8,8))
    sns.heatmap(confusion, annot=True, fmt="d", cmap='PiYG', xticklabels=['negative', 'neutral', 'positive'], yticklabels=['negative', 'neutral', 'positive'])
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.title('confusion matrix')
    plt.show()

    end = time.time()#ending training timer
    trainingTime = end - start #outputs in seconds
    print(f"training time in secs= {trainingTime}")

    #saving model for future use using .keras
    from google.colab import drive
    drive.mount('/content/drive') #mounting google drive
    #save path
    filepath = '/content/drive/My Drive/ML_LSTM.keras'
    model.save(filepath)
    print("model saved successfully in google drive!")

    #the model was trained on the word ids this tokeniser assigned, so new text has to go
    #through the same tokeniser - it is saved next to the model for that purpose
    import pickle  #local import: pickle is not in this cell's import list
    tokeniserpath = '/content/drive/My Drive/ML_LSTM_tokeniser.pkl'
    with open(tokeniserpath, 'wb') as file:
        pickle.dump(tokeniser, file)
    print("tokeniser saved successfully in google drive!")

#entry point for this cell: the unit tests have already been run, so only main_model() is executed
#the commented-out lines keep the alternative that runs the unit tests; uncomment them to restore it
if __name__ == '__main__':
    #if len(sys.argv) > 1 and sys.argv[1] == 'run-main':
        main_model()
    #else:
        #unittest.main(argv=['first-arg-is-ignored'], exit=False)

SVM model trained and saved to Google Drive. Unit tests for the pipeline are defined in this cell as well.

In [None]:
import time
import pandas as pd
import numpy as np
import spacy
from langdetect import detect
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import unittest
import sys


class unitTesting(unittest.TestCase):
    def setUp(self):
        #setup with sample data
        self.ds = pd.DataFrame({
            'text': ['Awful experience!!!', 'Molto Bello.', 'It was okay, I Guess?'],
            'rating': [1, 5, 3]
        })
        #mapping ratings to sentiments
        self.ds['sentiment'] = self.ds['rating'].apply(self.toSent)
        #loading spacy models
        self.ENspacy = spacy.load('en_core_web_sm')
        self.ITspacy = spacy.load('it_core_news_sm')

        #preprocesssing and feature extraction
        self.ds['processedText'] = self.ds['text'].apply(self.preprocess)
        self.vectorizer = TfidfVectorizer(max_features=3000)
        self.features = self.vectorizer.fit_transform(self.ds['processedText'])
        self.labels = self.ds['sentiment']

    @staticmethod
    def toSent(rating):
        if rating <= 2:
            return 'negative'
        elif rating == 3:
            return 'neutral'
        else:
            return 'positive'

    def preprocess(self, text): #doing preprocessing on the test set of data
         try:
            doc = self.ENspacy(text) if detect(text) == 'en' else self.ITspacy(text)
            return ' '.join([token.text.lower() for token in doc if not token.is_punct and not token.is_stop])
         except Exception as e:
            return f"error with preprocessing= {str(e)}"

    def test_preprocessing(self): #preprocessing test
        expected_outputs = ['awful experience', 'bello', 'okay guess']
        processed_texts = self.ds['processedText'].tolist()
        for processed, expected in zip(processed_texts, expected_outputs):
            self.assertEqual(processed, expected, f"expected {expected} but got {processed}")
        actual_features = self.features.shape[1]
        expected_features = min(3000, len(self.vectorizer.get_feature_names_out()))
        self.assertEqual(actual_features, expected_features, f"expected {expected_features} features, but got {actual_features}")

    def test_training(self): #model training test
        X_train, X_test, y_train, y_test = train_test_split(self.features, self.labels, test_size=0.3, random_state=42)
        SVMmodel = SVC(kernel='linear', class_weight='balanced')
        SVMmodel.fit(X_train, y_train)
        predictions = SVMmodel.predict(X_test)
        self.assertIsNotNone(predictions, "model failed to make predictions")


def main_model():
    """Train the linear SVM sentiment model on reviews.csv, report test-set metrics and save it to Google Drive.

    Reads the 'text' and 'rating' columns, maps ratings to negative/neutral/
    positive, cleans the text with spaCy, extracts TF-IDF features, trains the
    SVM, prints a classification report and shows a confusion matrix. The model
    and the fitted vectoriser are both pickled to the mounted Drive. Returns
    nothing.

    The elapsed time printed near the end covers everything from the start of
    the function up to that point, including evaluation and plotting.
    """
    #using this time import to time how long training takes
    start = time.time()
    print ("started timer!")

    #importing data
    print ("importing dataset!")
    ds = pd.read_csv('reviews.csv', encoding='utf-8')
    #rows with no review or no rating are unusable: a NaN rating fails both
    #comparisons in toSent below and would be labelled positive
    ds = ds.dropna(subset=['text', 'rating'])

    #mapping ratings to sentiments
    print ("mapping sentiments")
    def toSent (rating):
        if rating <= 2:
            return 'negative'
        if rating == 3:
            return 'neutral'
        else:
            return 'positive'

    #sentiment values now in new column
    ds['sentiment'] = ds['rating'].apply(toSent)

    #preprocesing for both english and italian - using spaCy
    #loading the spacy models
    ENspacy = spacy.load('en_core_web_sm')
    ITspacy = spacy.load('it_core_news_sm')

    print("preprocessing started!")
    def preprocess(text):
        try:
            #detecting language and choosing spacy model based off this
            doc = ENspacy(text) if detect(text) == 'en' else ITspacy(text)
        except Exception as e:
            print(f"error with preprocessing= {e}")
            #empty text adds no tokens; returning a placeholder string here would be vectorised as if it were review text
            return ''

        #tokenisation and getting rid of stop words and punctuation
        tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop]
        #joining the text back together
        return ' '.join(tokens)

    print("applying preprocesing to dataset!")
    #applying preprocessing to the dataset
    ds['processedText'] = ds['text'].apply(preprocess)

    print ("extracting feautres!")
    #feature extraction of preprocessed text
    vectoriser = TfidfVectorizer(max_features=3000)
    x = vectoriser.fit_transform(ds['processedText'])
    y = ds['sentiment']

    print("starting training! Nearly done")
    #splitting training/testing data 70/30
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)


    #training svm model - intially usedlinear kernel
    SVMmodel = SVC(kernel='linear', class_weight='balanced')
    SVMmodel.fit(X_train, y_train)
    #forming predictions
    y_pred = SVMmodel.predict(X_test)

    #evaluation metrics - outputted
    print(classification_report(y_test, y_pred, zero_division=0))#addition based on warnings in terminal
    #confusion matrix - the label order is fixed so the rows/columns always match the tick labels below
    sentiment_names = ['negative', 'neutral', 'positive']
    confusion = confusion_matrix(y_test, y_pred, labels=sentiment_names)
    plt.figure(figsize=(8,8))
    sns.heatmap(confusion, annot=True, fmt="d", cmap='PiYG', xticklabels=sentiment_names, yticklabels=sentiment_names)
    plt.xlabel('predicted')
    plt.ylabel('actual')
    plt.title('confusion matrix')
    plt.show()

    end = time.time()#ending training timer
    trainingTime = end - start #outputs in seconds
    print(f"training time in secs= {trainingTime}")

    #saving model for future use using pickle
    from google.colab import drive
    drive.mount('/content/drive')
    filepath='/content/drive/My Drive/ML_SVM'
    #the with-block closes the file even if pickling fails
    with open(filepath, 'wb') as file:
        pickle.dump(SVMmodel, file)
    print("the model has been saved sucessfully")

    #the model's feature columns are defined by this fitted vectoriser, so new text has to go
    #through the same vectoriser - it is saved next to the model for that purpose
    vectoriserpath = '/content/drive/My Drive/ML_SVM_vectoriser'
    with open(vectoriserpath, 'wb') as file:
        pickle.dump(vectoriser, file)
    print("the vectoriser has been saved sucessfully")

#entry point for this cell: the unit tests have already been run, so only main_model() is executed
#the commented-out lines keep the alternative that runs the unit tests; uncomment them to restore it
if __name__ == '__main__':
    #if len(sys.argv) > 1 and sys.argv[1] == 'run-main':
        main_model()
    #else:
        #unittest.main(argv=['first-arg-is-ignored'], exit=False)


Loading the saved SVM model and evaluating it on a balanced sample of the dataset (demo).

In [None]:
import pandas as pd
import spacy
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from langdetect import detect
from sklearn.metrics import classification_report
from google.colab import drive
#mounting google drive
drive.mount('/content/drive')
#loading saved SVM model
#NOTE: unpickling can execute arbitrary code, so only load files that you saved yourself
modelpath = '/content/drive/My Drive/deliverables_w21010751_KV6003/ML_SVM'
with open(modelpath, 'rb') as file:
    SVMmodel = pickle.load(file)
print("model loaded sucessfully!")
#importing data
print ("importing sample of dataset!")
ds = pd.read_csv('reviews.csv', encoding='utf-8')
#rows with no review or no rating are unusable: a NaN rating fails both
#comparisons in toSent below and would be labelled positive
ds = ds.dropna(subset=['text', 'rating'])
sample_size = 1000 #size of sample dataset

#mapping ratings to sentiments
print ("mapping sentiments")
def toSent (rating):
    """Map a rating to 'negative' (<= 2), 'neutral' (== 3) or 'positive' (otherwise)."""
    if rating <= 2:
        return 'negative'
    if rating == 3:
        return 'neutral'
    else:
        return 'positive'

#sentiment values now in new column
ds['sentiment'] = ds['rating'].apply(toSent)
#equal number of reviews per sentiment, capped by the smallest class so sampling cannot fail;
#the fixed random_state makes the demo sample (and so the report) repeatable
per_class = min(sample_size // ds['sentiment'].nunique(), ds['sentiment'].value_counts().min())
demo_data = ds.groupby('sentiment').sample(n=per_class, random_state=42).reset_index(drop=True)

#preprocesing for both english and italian - using spaCy
#loading the spacy models
ENspacy = spacy.load('en_core_web_sm')
ITspacy = spacy.load('it_core_news_sm')

print("preprocessing started!")
def preprocess(text):
    """Lower-case the text and drop punctuation and stop words; returns '' if detection or parsing fails."""
    try:
        #detecting language and choosing spacy model based off this
        doc = ENspacy(text) if detect(text) == 'en' else ITspacy(text)
    except Exception as e:
        print(f"error with preprocessing= {e}")
        #empty text adds no tokens; returning a placeholder string here would be vectorised as if it were review text
        return ''

    #tokenisation and getting rid of stop words and punctuation
    tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop]
    #joining the text back together
    return ' '.join(tokens)

print("applying preprocesing to dataset!")
#applying preprocessing to the dataset
demo_data['processedText'] = demo_data['text'].apply(preprocess)

print ("extracting feautres!")
#feature extraction of preprocessed text
#the SVM can only interpret features produced by the vectoriser it was trained with, so a
#pickled copy of that vectoriser is expected next to the model and is used to transform the text
vectoriserpath = '/content/drive/My Drive/deliverables_w21010751_KV6003/ML_SVM_vectoriser'
try:
    with open(vectoriserpath, 'rb') as file:
        vectoriser = pickle.load(file)
    x = vectoriser.transform(demo_data['processedText'])
except FileNotFoundError:
    #a vectoriser fitted on the demo sample has different columns from the one used in training
    print("saved vectoriser not found - fitting a new one on the sample, so the scores below will not reflect the trained model")
    vectoriser = TfidfVectorizer(max_features=3000)
    x = vectoriser.fit_transform(demo_data['processedText'])
y = demo_data['sentiment']

predictions = SVMmodel.predict(x)
demo_data['predictions'] = predictions
#classification report on the loaded model
print (classification_report(demo_data['sentiment'], demo_data['predictions']))

Loading the saved LSTM model and evaluating it on a balanced sample of the dataset (demo).

In [None]:
import pickle
import pandas as pd
import numpy as np
import spacy
from langdetect import detect
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
from google.colab import drive
#mounting google drive
drive.mount('/content/drive')
#loading saved LSTM model
modelpath = '/content/drive/My Drive/deliverables_w21010751_KV6003/ML_LSTM.keras'
LSTMmodel = load_model(modelpath)
print("model loaded successfully!")
#importing data
print ("importing sample of dataset!")
ds = pd.read_csv('reviews.csv', encoding='utf-8')
#rows with no review or no rating are unusable: a NaN rating fails both
#comparisons in toSent below and would be labelled positive
ds = ds.dropna(subset=['text', 'rating'])
sample_size = 1000 #size of sample dataset

#mapping ratings to sentiments
print ("mapping sentiments")
def toSent (rating):
    """Map a rating to 'negative' (<= 2), 'neutral' (== 3) or 'positive' (otherwise)."""
    if rating <= 2:
        return 'negative'
    if rating == 3:
        return 'neutral'
    else:
        return 'positive'

#sentiment values now in new column
ds['sentiment'] = ds['rating'].apply(toSent)

#equal number of reviews per sentiment, capped by the smallest class so sampling cannot fail;
#the fixed random_state makes the demo sample (and so the report) repeatable
per_class = min(sample_size // ds['sentiment'].nunique(), ds['sentiment'].value_counts().min())
demo_data = ds.groupby('sentiment').sample(n=per_class, random_state=42).reset_index(drop=True)
#preprocesing for both english and italian - using spaCy
#loading the spacy models
ENspacy = spacy.load('en_core_web_sm')
ITspacy = spacy.load('it_core_news_sm')

print("preprocessing started!")
def preprocess(text):
    """Lower-case the text and drop punctuation and stop words; returns '' if detection or parsing fails."""
    try:
        #detecting language and choosing spacy model based off this
        doc = ENspacy(text) if detect(text) == 'en' else ITspacy(text)
    except Exception as e:
        print(f"error with preprocessing= {e}")
        #there is no parsed doc to read tokens from, so the review becomes empty text
        return ''

    tokens = [token.text.lower() for token in doc if not token.is_punct and not token.is_stop]
    #joining the text back together
    return ' '.join(tokens)

#applying the spacy preprocessing to dataset
demo_data['preprocessed'] = demo_data['text'].apply(preprocess)

#tokenising
#the model was trained on the word ids of one particular fitted tokeniser, so a pickled copy
#of that tokeniser is expected next to the model and is used to encode the text
#NOTE: unpickling can execute arbitrary code, so only load files that you saved yourself
tokeniserpath = '/content/drive/My Drive/deliverables_w21010751_KV6003/ML_LSTM_tokeniser.pkl'
try:
    with open(tokeniserpath, 'rb') as file:
        tokeniser = pickle.load(file)
except FileNotFoundError:
    #a tokeniser fitted on the demo sample assigns different ids to words than the one used in training
    print("saved tokeniser not found - fitting a new one on the sample, so the scores below will not reflect the trained model")
    tokeniser = Tokenizer(num_words=5000)
    tokeniser.fit_on_texts(demo_data['preprocessed'])
sequences = tokeniser.texts_to_sequences(demo_data['preprocessed'])
data = pad_sequences(sequences, maxlen=100)


#coverting labels back to numeric values again for use in model
labels = demo_data['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

#making predictions
predictions = LSTMmodel.predict(data)
predicted_classes = np.argmax(predictions, axis=1)
#classification report - the label order is fixed so each name is always paired with the right class id
print(classification_report(labels, predicted_classes, labels=[0, 1, 2], target_names=['negative', 'neutral', 'positive']))




Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
model loaded successfully!
importing sample of dataset!
mapping sentiments




preprocessing started!
              precision    recall  f1-score   support

    negative       0.35      0.34      0.34       333
     neutral       0.35      0.37      0.36       333
    positive       0.28      0.27      0.28       333

    accuracy                           0.33       999
   macro avg       0.33      0.33      0.33       999
weighted avg       0.33      0.33      0.33       999



Loading the saved DistilBERT model and evaluating it on a balanced sample of the dataset (demo).

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from google.colab import drive

#mounting google drive
drive.mount('/content/drive')
#loading saved BERT model
#load_model is reached through tf.keras because this cell does not import it by name
#NOTE(review): loading a Hugging Face model from a .keras file may need custom_objects - confirm this reloads correctly
modelpath = '/content/drive/My Drive/deliverables_w21010751_KV6003/DistilBERT_FineTuned.keras'
BERTmodel = tf.keras.models.load_model(modelpath)
print("model loaded successfully!")

#importing data
print("importing sample of dataset!")
ds = pd.read_csv('reviews.csv', encoding='utf-8')
#rows with no review or no rating are unusable: a missing text cannot be tokenised and
#a NaN rating fails both comparisons in toSent below, so it would be labelled positive
ds = ds.dropna(subset=['text', 'rating'])
sample_size = 1000

#mapping ratings to sentiments
print("mapping sentiments")
def toSent(rating):
    """Map a rating to 'negative' (<= 2), 'neutral' (== 3) or 'positive' (otherwise)."""
    if rating <= 2:
        return 'negative'
    if rating == 3:
        return 'neutral'
    else:
        return 'positive'

ds['sentiment'] = ds['rating'].apply(toSent)
#sampling data evenly so equal negative, neutral, positive reviews in small sample
#capped by the smallest class so sampling cannot fail; the fixed random_state makes the sample repeatable
per_class = min(sample_size // ds['sentiment'].nunique(), ds['sentiment'].value_counts().min())
demo_data = ds.groupby('sentiment').sample(n=per_class, random_state=42).reset_index(drop=True)
#splitting into test train - only the test part is evaluated, nothing is trained in this cell
train_texts, test_texts, train_labels, test_labels = train_test_split(demo_data['text'].astype(str), demo_data['sentiment'], test_size=0.3, random_state=42)

#encoding labels with an explicit mapping (the same ids an alphabetical label encoding gives)
sentiment_ids = {'negative': 0, 'neutral': 1, 'positive': 2}
test_labels = test_labels.map(sentiment_ids).to_numpy()

#loading tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
#preprocessing - only the test texts are tokenised because only they are passed to the model
MAX_LENGTH = 512
test_encodings = tokenizer(test_texts.tolist(), truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="tf")

#compiling model
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = ['accuracy']
BERTmodel.compile(optimizer=optimizer, loss=loss, metrics=metrics)

#making predictions
predictions = BERTmodel.predict(test_encodings)
predicted_classes = np.argmax(predictions.logits, axis=1)
print(classification_report(test_labels, predicted_classes))


