# Task 3: Car Review Classification

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy as np

#### Data preparation


In [16]:
import pandas as pd

# Read the Excel file
train_df = pd.read_excel('/Users/kaylahoffman/Downloads/Train_data.xlsx')

# Display the first few rows to verify the data
print(train_df.head())

# Split the data into X_train and y_train
X = train_df['Review']
y = train_df['Target']

# Display the first few entries of X_train and y_train
print(X.head())
print(y.head())

from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

                                              Review  Target
0  Bought 2017 Optima Hybrid in November 17. \nIt...       2
1  You get a lot for your money and great perform...       2
2  This car is amazing and have no complaints. Yo...       2
3  At 11k now in a lease for 39 months and it onl...       0
4  I've owned BMW, Lexus, Mercedes-Benz in the la...       2
0    Bought 2017 Optima Hybrid in November 17. \nIt...
1    You get a lot for your money and great perform...
2    This car is amazing and have no complaints. Yo...
3    At 11k now in a lease for 39 months and it onl...
4    I've owned BMW, Lexus, Mercedes-Benz in the la...
Name: Review, dtype: object
0    2
1    2
2    2
3    0
4    2
Name: Target, dtype: int64


### Tokenization

In [62]:
# Import and install necessary libaries 
from transformers import BertForSequenceClassification
import torch

# !pip install transformers
from transformers import BertTokenizer

In [20]:
# Load the BERT tokenizier
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [21]:
# Tokenize the input texts
X_train_tokenized = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')
X_val_tokenized = tokenizer(X_val.tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

### Model Setup 

In [22]:
from transformers import BertForSequenceClassification
import torch

In [24]:
# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(y.unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Training 

In [25]:
# Set up the training parameters 
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader

batch_size = 32
epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)

In [26]:
train_dataset = TensorDataset(X_train_tokenized['input_ids'], X_train_tokenized['attention_mask'], torch.tensor(y_train.tolist()))
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [27]:
for epoch in range(epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{epochs} completed")

Epoch 1/3 completed
Epoch 2/3 completed
Epoch 3/3 completed


In [28]:
val_dataset = TensorDataset(X_val_tokenized['input_ids'], X_val_tokenized['attention_mask'], torch.tensor(y_val.tolist()))
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

## Evaluate the model

In [29]:
model.eval()
predictions = []
actual_labels = []

with torch.no_grad():
    for batch in val_dataloader:
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.tolist())
        actual_labels.extend(labels.tolist())

from sklearn.metrics import accuracy_score, classification_report
print(f"Accuracy: {accuracy_score(actual_labels, predictions)}")
print(classification_report(actual_labels, predictions))

Accuracy: 0.7073170731707317
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.00      0.00      0.00         4
           2       0.71      1.00      0.83        29

    accuracy                           0.71        41
   macro avg       0.24      0.33      0.28        41
weighted avg       0.50      0.71      0.59        41



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [37]:
param_grid = [
    {'learning_rate': 1e-5, 'batch_size': 16, 'num_epochs': 3, 'dropout': 0.1},
    {'learning_rate': 2e-5, 'batch_size': 32, 'num_epochs': 4, 'dropout': 0.2},
    # Add more parameter combinations as needed
]

In [45]:
import torch
from transformers import BertForSequenceClassification, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [53]:
# Function to train and evaluate model
def train_evaluate_model(params):
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(set(y)))
    model.dropout = torch.nn.Dropout(params['dropout'])

    train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=params['batch_size'])

    optimizer = AdamW(model.parameters(), lr=params['learning_rate'])
    
    # Learning rate scheduler with warmup
    total_steps = len(train_loader) * params['num_epochs']
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    for epoch in range(params['num_epochs']):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()

# Evaluation
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            val_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()

    accuracy = correct / len(val_dataset)
    return accuracy
print(accuracy)

0.6829268292682927


In [54]:
# Grid search
best_accuracy = 0
best_params = None

for params in param_grid:
    accuracy = train_evaluate_model(params)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print(f"Best parameters: {best_params}")
print(f"Best validation accuracy: {best_accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Best parameters: {'learning_rate': 1e-05, 'batch_size': 16, 'num_epochs': 3, 'dropout': 0.1}
Best validation accuracy: 0.7073170731707317


In [60]:
import pandas as pd
from transformers import BertTokenizer

# Load the test dataset
test_df = pd.read_excel('/Users/kaylahoffman/Downloads/test_features.xlsx')

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize and encode the test data
encoded_test = tokenizer(test_df['Review'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

# Create a TensorDataset for the test data
test_dataset = torch.utils.data.TensorDataset(encoded_test['input_ids'], encoded_test['attention_mask'])

# Create a DataLoader for batch processing
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

In [63]:
# Load the pre-trained BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(y.unique()))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
import torch

all_predictions = []

with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask = batch
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=1)
        all_predictions.extend(predictions.cpu().numpy())

# Convert predictions to labels if needed
# predicted_labels = [label_map[pred] for pred in all_predictions]

In [66]:
# Add predictions to the test dataframe
test_df['Predicted_Target'] = all_predictions

# Save predictions to a CSV file
test_df.to_csv('test_predictions.csv', index=False)

print("Predictions have been added to the test dataset and saved to 'test_predictions.csv'")

Predictions have been added to the test dataset and saved to 'test_predictions.csv'


In [72]:
import pandas as pd
import nltk
from nltk.corpus import wordnet
import random
import numpy as np

# Download required NLTK data
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def augment_data(text, num_augmentations=2):
    augmented_texts = []
    
    # 1. Synonym Replacement
    def synonym_replacement(sentence):
        words = sentence.split()
        new_words = words.copy()
        random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
        random.shuffle(random_word_list)
        num_replaced = 0
        for random_word in random_word_list:
            synonyms = wordnet.synsets(random_word)
            if len(synonyms) >= 1:
                synonym = random.choice(synonyms).lemmas()[0].name()
                new_words = [synonym if word == random_word else word for word in new_words]
                num_replaced += 1
            if num_replaced >= 2:  # Replace up to 2 words
                break
        return ' '.join(new_words)
    
    # 2. Random Deletion
    def random_deletion(sentence, p=0.1):
        words = sentence.split()
        if len(words) == 1:
            return sentence
        new_words = []
        for word in words:
            if random.uniform(0, 1) > p:
                new_words.append(word)
        if len(new_words) == 0:
            rand_int = random.randint(0, len(words)-1)
            new_words.append(words[rand_int])
        return ' '.join(new_words)
    
    # 3. Random Swap
    def random_swap(sentence, n=2):
        words = sentence.split()
        if len(words) < 2:
            return sentence
        for _ in range(n):
            idx1, idx2 = random.sample(range(len(words)), 2)
            words[idx1], words[idx2] = words[idx2], words[idx1]
        return ' '.join(words)
    
    # Apply augmentations
    augmented_texts.append(synonym_replacement(text))
    augmented_texts.append(random_deletion(text))
    augmented_texts.append(random_swap(text))
    
    return augmented_texts[:num_augmentations]

# Apply augmentation to the minority classes (0 and 1)
def augment_dataset(df):
    augmented_data = []
    
    # Augment class 0 and 1 samples
    for idx, row in df.iterrows():
        if row['Target'] in [0, 1]:  # Minority classes
            augmented_texts = augment_data(row['Review'])
            for aug_text in augmented_texts:
                augmented_data.append({
                    'Review': aug_text,
                    'Target': row['Target']
                })
    
    # Create DataFrame with augmented data
    augmented_df = pd.DataFrame(augmented_data)
    
    # Combine original and augmented data
    final_df = pd.concat([df, augmented_df], ignore_index=True)
    
    return final_df

# Apply augmentation to your training data
augmented_train_df = augment_dataset(train_df)

# Shuffle the final dataset
augmented_train_df = augmented_train_df.sample(frac=1).reset_index(drop=True)

print(f"Original dataset size: {len(train_df)}")
print(f"Augmented dataset size: {len(augmented_train_df)}")
print("\nClass distribution in augmented dataset:")
print(augmented_train_df['Target'].value_counts())

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/kaylahoffman/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/kaylahoffman/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


KeyError: 'Target'