### Model Deployment - Test the Model



In [None]:
# Basic imports for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn utilities for data splitting, preprocessing and evaluation metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Joblib or Pickle for loading your trained model
import joblib






## Load the Model

In [None]:
import os
# Sanity check before loading artifacts: show which directory the kernel is running in
print("Current Working Directory: ", os.getcwd())
# List files in the current directory
print("Files in Current Directory: ", os.listdir('.'))


Current Working Directory:  /content
Files in Current Directory:  ['.config', 'model.joblib', 'tfidf_vectorizer.joblib', 'sample_data']


In [None]:
from joblib import load
# Loading the combined model and vectorizer
# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code;
# only load artifacts that come from a trusted source.
model = load('model.joblib')


In [None]:
import nltk
# NLTK resources needed by the preprocessing helpers (tokenizer data, WordNet, stop words).
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize look up 'punkt_tab' instead of the
# pickled 'punkt' models; fetching both keeps tokenization working on old and new versions.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
# Kept last so the cell still displays the boolean result of the final download.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# Preprocessing

In [None]:
# Function to remove punctuations from text
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

def remove_punctuation(text):
    """Return ``str(text)`` with every character of ``string.punctuation`` removed.

    The input is coerced with ``str`` first, so non-string values (``None``,
    numbers, ...) are accepted and their string form is cleaned.

    A translation table is used instead of a hand-built regex character class:
    concatenating ``string.punctuation`` into ``[...]`` unescaped makes the regex
    engine read ``\\]`` as an escaped bracket, so backslashes were silently left
    in the text.
    """
    # Third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    return str(text).translate(deletion_table)

# Function to remove URLs from text
def remove_urls(text):
    """Strip every ``http://`` / ``https://`` URL from ``text``.

    A URL is taken to run from the scheme up to the next whitespace character;
    the surrounding whitespace itself is left untouched.
    """
    url_pattern = re.compile(r'http[s]?://\S+')
    return url_pattern.sub('', text)

# Function to convert the text into lower case
def lower_case(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

# Function to lemmatize text
def lemmatize(text):
    """Lemmatize ``text`` token by token with NLTK's WordNet lemmatizer.

    The text is split with ``nltk.word_tokenize``; each token is passed through
    ``WordNetLemmatizer.lemmatize`` and written back followed by a single space,
    so any non-empty result ends with a trailing space.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = (lemmatizer.lemmatize(token) for token in nltk.word_tokenize(text))
    return ''.join(lemma + ' ' for lemma in lemmas)

In [None]:
import torch
from transformers import AutoTokenizer

# Load the BERT tokenizer
# Module-level object: tokenize_data calls it and convert_to_tensors reads its pad_token_id.
# NOTE(review): predict_class lower-cases the text before it reaches this *cased*
# tokenizer - confirm that combination is intended.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize the text data
def tokenize_data(data):
    """Encode ``data`` with the module-level ``tokenizer``.

    The tokenizer is asked for PyTorch tensors (``return_tensors='pt'``) with
    padding and truncation switched on; its result is returned unchanged.
    """
    encoded = tokenizer(
        data,
        return_tensors='pt',
        padding=True,
        truncation=True,
    )
    return encoded

# Convert tokenized data into tensors
def convert_to_tensors(data, max_length=512, pad_token_id=None):
    """Right-pad (or crop) tokenizer output to a fixed sequence length.

    Parameters
    ----------
    data : mapping
        Must provide 2-D tensors under ``'input_ids'`` and ``'attention_mask'``;
        their second dimension is the one that gets resized.
    max_length : int, optional
        Target size of the second dimension. Defaults to 512.
    pad_token_id : int, optional
        Fill value for the padded ``input_ids`` positions. When ``None`` the
        module-level ``tokenizer.pad_token_id`` is used.

    Returns
    -------
    dict
        ``{'input_ids': ..., 'attention_mask': ...}`` holding fresh tensors whose
        second dimension is ``max_length``. Padded mask positions are 0.
    """
    if pad_token_id is None:
        pad_token_id = tokenizer.pad_token_id

    input_ids = data['input_ids']
    attention_mask = data['attention_mask']

    # The pad amount is negative when a sequence is longer than max_length; the
    # constant-mode pad then crops from the right instead of padding.
    padded_input_ids = torch.nn.functional.pad(
        input_ids, (0, max_length - input_ids.size(1)), value=pad_token_id
    )
    padded_attention_mask = torch.nn.functional.pad(
        attention_mask, (0, max_length - attention_mask.size(1)), value=0
    )

    # detach().clone() gives the same independent copy torch.tensor(t) produced,
    # without the "To copy construct from a tensor..." UserWarning.
    return {
        'input_ids': padded_input_ids.detach().clone(),
        'attention_mask': padded_attention_mask.detach().clone(),
    }

def predict_class(input_text, model):
    """Clean a raw text, encode it as padded BERT token ids and classify it.

    Parameters
    ----------
    input_text : object
        Raw text to classify; coerced with ``str`` before cleaning.
    model : object
        Fitted estimator exposing ``predict``. It is called with a 2-D NumPy
        array of token ids (one row per input text).

    Returns
    -------
    Whatever ``model.predict`` returns for the encoded text.
    """
    # Preprocess input text.
    # URLs are removed first: remove_urls matches on "://", which
    # remove_punctuation strips, so the previous order never removed any URL.
    # NOTE(review): if the model was trained on text cleaned in the old order,
    # squashed URL remnants were part of its input - confirm against training.
    input_text = remove_urls(str(input_text))
    input_text = remove_punctuation(input_text)
    input_text = lower_case(input_text)
    input_text = lemmatize(input_text)

    # Tokenize input text
    tokenized_text = tokenize_data(str(input_text))

    # Pad the encoded text to the fixed length used by convert_to_tensors
    data_tensors = convert_to_tensors(tokenized_text)

    # Convert PyTorch tensors to NumPy arrays (only the token ids are fed to the model)
    data_numpy = data_tensors['input_ids'].numpy()

    # Keep one row per text and flatten any remaining dimensions into the feature axis
    data_flattened = data_numpy.reshape(data_numpy.shape[0], -1)

    # Make predictions using the model
    predicted_class = model.predict(data_flattened)
    return predicted_class

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [None]:
def map_prediction_to_label(prediction):
    """Translate a numeric class id into its sentiment label.

    ``0`` maps to ``"Negative"`` and ``1`` to ``"Positive"`` (assumed encoding);
    any other value yields ``"Unknown"``.
    """
    label_by_class = {0: "Negative", 1: "Positive"}
    if prediction in label_by_class:
        return label_by_class[prediction]
    return "Unknown"


## Predictions through examples

In [None]:
from joblib import load


# Making a prediction
sample_text = "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story"
# Use `model`, the estimator loaded from model.joblib above; the `pipeline` name used
# here before is never defined in this notebook, so the cell failed on a fresh kernel.
# Assumes model.joblib holds the combined vectorizer + classifier (as its loading
# comment states), so raw strings can be passed directly - confirm.
prediction = model.predict([sample_text])[0]  # one input text, so take the first (only) result

# Map the numeric prediction to a class name
prediction_label = map_prediction_to_label(prediction)
print(f"Prediction: {prediction_label}")


Prediction: Negative


In [None]:
# Making a prediction
sample_text = '''nation used in the film, creates an eerie and unforgettable cinematic experience.<br /><br />The participation of Barbara Suwkowa and Eddie Constantine in the cast are two guilty pleasures that should be seen and enjoyed.
Max Von Sydow provides his great voice as the narrator.<br /><br />A one of a kind movie! Four stars (highest rating).
Before Dogma 95: when Lars used movies as art, not just a story. A beautiful painting about love and death. This is one of my favorite movies of all time. The color... The music... Just perfect.'''
# Use `model` (loaded from model.joblib above); `pipeline` is never defined in this
# notebook, so the original call raised NameError on a fresh kernel.
prediction = model.predict([sample_text])[0]  # one input text, so take the first (only) result

# Map the numeric prediction to a class name
prediction_label = map_prediction_to_label(prediction)
print(f"Prediction: {prediction_label}")

Prediction: Positive


In [None]:
# Making a prediction
sample_text = "I did not like the movie at all"
# Use `model` (loaded from model.joblib above); `pipeline` is never defined in this
# notebook, so the original call raised NameError on a fresh kernel.
prediction = model.predict([sample_text])[0]  # one input text, so take the first (only) result

# Map the numeric prediction to a class name
prediction_label = map_prediction_to_label(prediction)
print(f"Prediction: {prediction_label}")


Prediction: Negative


In [None]:
# Making a prediction
sample_text = "I loved the movie"
# Use `model` (loaded from model.joblib above); `pipeline` is never defined in this
# notebook, so the original call raised NameError on a fresh kernel.
prediction = model.predict([sample_text])[0]  # one input text, so take the first (only) result

# Map the numeric prediction to a class name
prediction_label = map_prediction_to_label(prediction)
print(f"Prediction: {prediction_label}")


Prediction: Positive
