In [15]:
#!pip install torch
#!pip install sentence_transformers
#!pip install tf-keras
#!pip install datasets
#!pip install nltk
pip install tensorflow[and-cuda]

pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [37]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset
from nltk.tokenize import word_tokenize

In [17]:
# Load dataset
# Fetch the "imdb" dataset through `datasets.load_dataset`; the resulting
# object is indexed by split name ("train", "test") further down.
dataset = load_dataset("imdb")

Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 207626.07 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 266629.37 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 325257.34 examples/s]


In [18]:
# Display the loaded object to inspect its splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [38]:
# Convert dataset to DataFrame
def dataset_to_dataframe(dataset):
    """Wrap ``dataset`` in a pandas DataFrame.

    The argument is passed unchanged to the ``pd.DataFrame`` constructor, so
    any input that constructor accepts (for example an iterable of
    ``{"text": ..., "label": ...}`` records) is valid.

    Returns
    -------
    pandas.DataFrame
        The frame built from ``dataset``.
    """
    return pd.DataFrame(dataset)


# Materialise the "train" and "test" splits as pandas DataFrames.
train_df = dataset_to_dataframe(dataset["train"])
test_df = dataset_to_dataframe(dataset["test"])


# Extract text and labels
def preprocess_data(df, sample_size=5000, random_state=42):
    """Draw a reproducible random sample of rows and return texts and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``"text"`` column and a ``"label"`` column.
    sample_size : int or None, default 5000
        Number of rows to draw without replacement. ``None`` skips sampling
        and uses every row of ``df`` in its existing order. (Forwarding
        ``None`` to ``DataFrame.sample`` would silently yield a single row,
        because pandas falls back to ``n=1``.)
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so repeated calls select the
        same rows.

    Returns
    -------
    tuple[list, list]
        ``(texts, labels)``; position ``i`` of both lists comes from the same
        row.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``sample_size`` is larger
        than ``len(df)``.
    """
    if sample_size is None:
        # Full-data run: keep every row, no shuffling.
        df_sample = df
    else:
        df_sample = df.sample(n=sample_size, random_state=random_state)
    texts = df_sample["text"].tolist()
    labels = df_sample["label"].tolist()
    return texts, labels

# Sample 5000 training rows and 1000 test rows; each call returns a list of
# texts and a list of labels.
train_texts, train_labels = preprocess_data(train_df, sample_size=5000)
test_texts, test_labels = preprocess_data(test_df, sample_size=1000)

# Preview the first rows of the full training DataFrame (not the sample).
train_df.head()

Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


Task:

1. Now create embeddings using TF-IDF & the Transformer model as used in the example notebook
2. Try some other transformer models from HuggingFace to create embeddings, e.g. 'microsoft/deberta-v3-large' or 'roberta-large-mnli'
3. Train and evaluate models with each embedding separately
4. Train a model with all embeddings and evaluate
5. Which one performs best, and why do you think it is the best?

### 1. Now create embeddings using TFIDF & Transformer model as used in example notebook

In [39]:
# Author's notes on checkpoints:
#   all-MiniLM-L6-v2        - the best performance
#   paraphrase-MiniLM-L3-v2 - the fastest, but comparatively very low performance
def model_creater(sentence_transformer):
    """Embed the notebook's sampled train and test texts.

    Parameters
    ----------
    sentence_transformer : str
        Value handed to ``SentenceTransformer(...)``, e.g. "all-MiniLM-L6-v2".

    Returns
    -------
    tuple
        ``(train_embeddings, test_embeddings)`` as produced by
        ``encode(..., convert_to_numpy=True)``.

    Notes
    -----
    Reads the module-level ``train_texts`` and ``test_texts``; both must be
    defined before this function is called.
    """
    encoder = SentenceTransformer(sentence_transformer)
    print("Generating transformer embeddings...")
    # Train texts are encoded first, then test texts.
    train_embeddings, test_embeddings = (
        encoder.encode(corpus, convert_to_numpy=True)
        for corpus in (train_texts, test_texts)
    )
    return train_embeddings, test_embeddings

In [40]:
# 2. TF-IDF Features
def tf_idf(_max_features=5000):
    """Build dense TF-IDF matrices for the notebook's train and test texts.

    Parameters
    ----------
    _max_features : int, default 5000
        Forwarded to ``TfidfVectorizer(max_features=...)``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)``, each converted with ``.toarray()``.

    Notes
    -----
    Reads the module-level ``train_texts`` and ``test_texts``. The vectorizer
    is fitted on the training texts only and then applied to the test texts.
    """
    tfidf = TfidfVectorizer(max_features=_max_features)
    train_matrix = tfidf.fit_transform(train_texts)
    test_matrix = tfidf.transform(test_texts)
    return train_matrix.toarray(), test_matrix.toarray()

In [41]:
### Sentence transformers: try different ones, see online
### Play with the max_features value (5000)

### 2. Try some other transformer models from HuggingFace to do embedding - 'microsoft/deberta-v3-large' or 'roberta-large-mnli'

In [45]:
# Embed the sampled train/test texts with the "all-MiniLM-L6-v2" checkpoint.
X_train_transformer, X_test_transformer = model_creater("all-MiniLM-L6-v2")

Generating transformer embeddings...


In [None]:
# Embed with "paraphrase-MiniLM-L3-v2".
# NOTE: this rebinds X_train_transformer / X_test_transformer, replacing
# whatever embeddings were previously stored under those names.
X_train_transformer, X_test_transformer = model_creater("paraphrase-MiniLM-L3-v2")

In [43]:
# TF-IDF features with max_features capped at 5000.
X_train_tfidf, X_test_tfidf = tf_idf(5000)

In [None]:
# TF-IDF features with max_features capped at 1000.
# NOTE: this rebinds X_train_tfidf / X_test_tfidf, replacing whatever
# matrices were previously stored under those names.
X_train_tfidf, X_test_tfidf = tf_idf(1000)

### 3. Train and evaluate models with each embedding separately

In [47]:
# Train and Evaluate Models
def train_and_evaluate(X_train, X_test, train_labels, test_labels, method):
    """Fit a random forest on one feature set and print its test metrics.

    Parameters
    ----------
    X_train, X_test
        Feature matrices used for fitting and for prediction respectively.
    train_labels, test_labels
        Targets matching ``X_train`` and ``X_test``.
    method : str
        Label prefixed to every printed line to identify the feature set.

    Returns
    -------
    None
        Accuracy and the classification report are printed, not returned.
    """
    # 100 trees and a fixed seed, so repeated runs on the same features
    # give the same forest.
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, train_labels)
    predictions = forest.predict(X_test)

    # Compute both metrics before printing anything.
    accuracy = accuracy_score(test_labels, predictions)
    report = classification_report(test_labels, predictions)

    print(f"{method} - Accuracy: {accuracy:.4f}")
    print(f"{method} - Classification Report:\n{report}\n")

In [48]:
# Experiment: "all-MiniLM-L6-v2" embeddings vs. TF-IDF with max_features=5000.
X_train_transformer, X_test_transformer = model_creater("all-MiniLM-L6-v2")
X_train_tfidf, X_test_tfidf = tf_idf(5000)

print("Training and evaluating models...")
# One classifier per feature set, transformer embeddings first.
for features_train, features_test, label in (
    (X_train_transformer, X_test_transformer, "Transformer Embeddings"),
    (X_train_tfidf, X_test_tfidf, "TF-IDF"),
):
    train_and_evaluate(features_train, features_test, train_labels, test_labels, label)

Generating transformer embeddings...
Training and evaluating models...
Transformer Embeddings - Accuracy: 0.7520
Transformer Embeddings - Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.74      0.75       511
           1       0.74      0.77      0.75       489

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000


TF-IDF - Accuracy: 0.8200
TF-IDF - Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.83      0.82       511
           1       0.82      0.81      0.81       489

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      0.82      0.82      1000




In [50]:
# Experiment: "all-MiniLM-L6-v2" embeddings vs. TF-IDF with max_features=1000.
X_train_transformer, X_test_transformer = model_creater("all-MiniLM-L6-v2")
X_train_tfidf, X_test_tfidf = tf_idf(1000)

print("Training and evaluating models...")
# One classifier per feature set, transformer embeddings first.
for features_train, features_test, label in (
    (X_train_transformer, X_test_transformer, "Transformer Embeddings"),
    (X_train_tfidf, X_test_tfidf, "TF-IDF"),
):
    train_and_evaluate(features_train, features_test, train_labels, test_labels, label)

Generating transformer embeddings...
Training and evaluating models...
Transformer Embeddings - Accuracy: 0.7520
Transformer Embeddings - Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.74      0.75       511
           1       0.74      0.77      0.75       489

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000


TF-IDF - Accuracy: 0.8110
TF-IDF - Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.81      0.81       511
           1       0.81      0.81      0.81       489

    accuracy                           0.81      1000
   macro avg       0.81      0.81      0.81      1000
weighted avg       0.81      0.81      0.81      1000




In [None]:
# Experiment: "paraphrase-MiniLM-L3-v2" embeddings vs. TF-IDF with max_features=5000.
X_train_transformer, X_test_transformer = model_creater("paraphrase-MiniLM-L3-v2")
X_train_tfidf, X_test_tfidf = tf_idf(5000)

print("Training and evaluating models...")
# One classifier per feature set, transformer embeddings first.
for features_train, features_test, label in (
    (X_train_transformer, X_test_transformer, "Transformer Embeddings"),
    (X_train_tfidf, X_test_tfidf, "TF-IDF"),
):
    train_and_evaluate(features_train, features_test, train_labels, test_labels, label)

In [None]:
# Experiment: "paraphrase-MiniLM-L3-v2" embeddings vs. TF-IDF with max_features=1000.
X_train_transformer, X_test_transformer = model_creater("paraphrase-MiniLM-L3-v2")
X_train_tfidf, X_test_tfidf = tf_idf(1000)

print("Training and evaluating models...")
# One classifier per feature set, transformer embeddings first.
for features_train, features_test, label in (
    (X_train_transformer, X_test_transformer, "Transformer Embeddings"),
    (X_train_tfidf, X_test_tfidf, "TF-IDF"),
):
    train_and_evaluate(features_train, features_test, train_labels, test_labels, label)

### 4. Train a model with all embeddings and evaluate

### 5. Which one is performing the best, why do you think it is the best?

In [None]:
# Note: Azure OpenAI