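This notebook builds on an earlier notebook in which we fine-tuned a BERT model for sentiment analysis.

The link to that notebook was truncated in this copy; the fine-tuned model used below is loaded from the Kaggle input `preprocessing-pipeline-and-modeling-amazon-reviews`.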

In [2]:
import torch
import re
import spacy
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from tqdm import tqdm
from joblib import Parallel, delayed

# Load Spacy model
nlp = spacy.load('en_core_web_lg')

# Regexes are compiled once at import time and reused for every text.
# - url_pattern: anything starting with http / www / https up to the next whitespace
#   (the `https` alternative can never win because `http\S+` already matches it).
# - repeated_char_pattern: a character followed by 3+ copies of itself (4+ in a row).
# - repeated_word_pattern: a word immediately repeated, separated by single spaces.
url_pattern = re.compile(r'http\S+|www\S+|https\S+')
repeated_char_pattern = re.compile(r'(.)\1{3,}')
repeated_word_pattern = re.compile(r'\b(\w+)( \1\b)+')

# Stop-word list = spaCy's defaults minus the negations, which carry sentiment
# and therefore must survive preprocessing.
stopwords_to_keep = {"not", "no", "nor"}
custom_stopwords = nlp.Defaults.stop_words.difference(stopwords_to_keep)

# Function to remove repeated characters and words
def remove_gibberish(text):
    """Collapse character runs and immediately repeated words.

    First every match of ``repeated_char_pattern`` is replaced by its first
    captured character, then every match of ``repeated_word_pattern`` is
    replaced by its first captured word.

    Args:
        text: Input string.

    Returns:
        The string with both substitutions applied, in that order.
    """
    collapsed_chars = repeated_char_pattern.sub(r'\1', text)
    return repeated_word_pattern.sub(r'\1', collapsed_chars)

# Main text preprocessing function
def preprocess_text(text):
    """Clean a single text: strip URLs and repetitions, then lemmatize and filter.

    Steps:
        1. Remove URLs via ``url_pattern``.
        2. Collapse repeated characters/words via ``remove_gibberish``.
        3. Run the spaCy pipeline ``nlp`` and keep the lower-cased lemma of every
           token that is not punctuation, not a digit token, not longer than
           20 characters and not in ``custom_stopwords``.

    Args:
        text: Raw input string.

    Returns:
        The kept lemmas joined by single spaces.
    """
    without_urls = url_pattern.sub('', text)
    doc = nlp(remove_gibberish(without_urls))

    kept_lemmas = []
    for token in doc:
        if token.is_punct or token.is_digit:
            continue
        # NOTE(review): the stop-word test uses the token text as written, so it is
        # case-sensitive; presumably capitalised stop words ("This") slip through.
        # Left unchanged to stay in sync with the training preprocessing - confirm.
        if len(token.text) > 20 or token.text in custom_stopwords:
            continue
        kept_lemmas.append(token.lemma_.lower())

    return " ".join(kept_lemmas)

# Apply preprocessing to a series of texts in parallel
def combined_preprocessing(text_series):
    """Run ``preprocess_text`` over every text using all available cores.

    Args:
        text_series: Iterable of raw strings (typically a pandas Series).

    Returns:
        A new ``pd.Series`` holding the cleaned texts in input order. It is built
        from a plain list, so it carries a fresh default index rather than the
        index of ``text_series``.
    """
    runner = Parallel(n_jobs=-1, backend="multiprocessing")
    with_progress = tqdm(text_series, desc="Processing texts")
    jobs = (delayed(preprocess_text)(raw_text) for raw_text in with_progress)
    cleaned = runner(jobs)
    return pd.Series(cleaned)

# Define a pipeline with custom preprocessing step
# Single-step scikit-learn pipeline wrapping the parallel text cleaner.
# validate=False so the transformer hands the texts to the function as-is.
preprocess_pipe = Pipeline(
    steps=[
        (
            'preprocess',
            FunctionTransformer(func=combined_preprocessing, validate=False),
        ),
    ]
)

# --- BERT Tokenization and Inference ---

# Load the fine-tuned BERT model and the matching tokenizer.
#
# The checkpoint is a fully pickled model object (``.eval()`` is called on the
# result directly), not a state_dict. ``weights_only=False`` is therefore passed
# explicitly: newer PyTorch releases default to ``weights_only=True``, which
# refuses to unpickle arbitrary model classes.
# SECURITY: unpickling can execute arbitrary code - only load checkpoints that
# come from a trusted source.
model = torch.load(
    '/kaggle/input/preprocessing-pipeline-and-modeling-amazon-reviews/bert_model',
    map_location=torch.device('cpu'),  # always deserialize onto CPU first
    weights_only=False,
)
model.eval()  # Set model to evaluation mode
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Function to preprocess text for BERT tokenization
def preprocess_for_bert(text_list, max_len):
    """Tokenize texts into fixed-length BERT input tensors.

    Uses the module-level ``tokenizer``. All texts are encoded in one batched
    call; every sequence is padded or truncated to exactly ``max_len`` tokens,
    including the '[CLS]' and '[SEP]' special tokens.

    Args:
        text_list: Iterable of strings (list, tuple, pandas Series, ...).
        max_len: Target sequence length for every example.

    Returns:
        Tuple ``(input_ids, attention_masks)`` of integer tensors, each of shape
        ``(number_of_texts, max_len)``. For empty input both tensors have
        shape ``(0, max_len)``.
    """
    texts = list(text_list)

    # torch.cat / the tokenizer cannot handle an empty batch; return empty
    # tensors of the right width so callers can treat this case uniformly.
    if not texts:
        return (
            torch.empty((0, max_len), dtype=torch.long),
            torch.empty((0, max_len), dtype=torch.long),
        )

    encoded = tokenizer(
        texts,
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=max_len,
        padding='max_length',        # replaces the deprecated pad_to_max_length=True
        truncation=True,             # make truncation to max_len explicit
        return_attention_mask=True,  # Create attention mask
        return_tensors='pt',         # Return PyTorch tensors
    )

    return encoded['input_ids'], encoded['attention_mask']

# Function to perform inference using the BERT model
def predict(text_list, max_len, device, batch_size=32):
    """Predict a class index for every input text with the fine-tuned BERT model.

    The texts are cleaned with ``preprocess_pipe``, tokenized with
    ``preprocess_for_bert`` and pushed through the module-level ``model`` in
    mini-batches so large inputs do not have to fit on the device at once.
    The model must already live on ``device``.

    Args:
        text_list: A string or an iterable of strings to classify.
        max_len: Token length used for padding/truncation (should match the
            value used during fine-tuning).
        device: ``torch.device`` (or device string) the model is on.
        batch_size: Number of texts per forward pass. Defaults to 32.

    Returns:
        1-D NumPy array with the argmax class index for each text, in input
        order. Empty input yields an empty ``int64`` array.
    """
    # A bare string is treated as a single text rather than a sequence of chars.
    if isinstance(text_list, str):
        text_list = [text_list]
    texts = list(text_list)

    # Nothing to classify: avoid spinning up workers / concatenating empty lists.
    if not texts:
        return np.array([], dtype=np.int64)

    # Step 1: Apply custom preprocessing (URL removal, lemmatization, stop words)
    preprocessed_texts = preprocess_pipe.transform(pd.Series(texts))

    # Step 2: Convert preprocessed text into BERT input format
    input_ids, attention_masks = preprocess_for_bert(preprocessed_texts, max_len)

    # Step 3: Batched inference without gradient tracking
    batch_predictions = []
    with torch.no_grad():
        for start in range(0, input_ids.size(0), batch_size):
            stop = start + batch_size
            batch_ids = input_ids[start:stop].to(device)
            batch_masks = attention_masks[start:stop].to(device)
            outputs = model(batch_ids, token_type_ids=None, attention_mask=batch_masks)
            batch_predictions.append(torch.argmax(outputs.logits, dim=1).cpu())

    return torch.cat(batch_predictions, dim=0).numpy()




  model = torch.load('/kaggle/input/preprocessing-pipeline-and-modeling-amazon-reviews/bert_model', map_location=torch.device('cpu'))  # Load saved model


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [4]:
# Example usage
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Class index -> human-readable label.
# NOTE(review): 0 -> Positive / 1 -> Negative is what the recorded output of this
# cell is consistent with, but it is the reverse of the common "1 = positive"
# convention - confirm against the label encoding used in the training notebook.
predictions_mapping = {
    0: "Positive",
    1: "Negative"
}

# Example list of sentences for inference
texts = [
    "This is a Greate Product, i can't wait to try it ",
    "This product is terrible and I am not happy with it."
]

# Define the maximum length for tokenizing (use the same max_len as training)
max_len = 200

# Get predictions for the input sentences
predictions = predict(texts, max_len, device)

for text, label_id in zip(texts, predictions):
    print(f"Text: {text}")
    # label_id is a NumPy integer; cast to a plain int for the dict lookup.
    print(f"Prediction: {predictions_mapping[int(label_id)]}")  # 0 -> Positive, 1 -> Negative
    print("")

Processing texts: 100%|██████████| 2/2 [00:00<00:00,  5.92it/s]


Text: This is a Greate Product, i can't wait to try it 
Prediction: Positive

Text: This product is terrible and I am not happy with it.
Prediction: Negative

