<a href="https://colab.research.google.com/github/kaanygl/WeatherScraping/blob/master/ftmodel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab runtime; later cells read from and write to
# paths below this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)

In [None]:
# Read the complaints CSV from the mounted Drive folder into a DataFrame.
RAW_CSV_PATH = '/content/drive/MyDrive/prj/pr2.csv'
df1 = pd.read_csv(RAW_CSV_PATH)

In [None]:
from sklearn.utils import shuffle

# Shuffle with a fixed seed: without it a different subset of rows is picked on
# every run, so the seeded train/test split further down is not reproducible.
df2 = shuffle(df1, random_state=42)

# Keep at most the first 1,000,000 shuffled rows. `.copy()` makes `df` an
# independent frame, so the column assignment below never writes into a slice
# of `df2`.
df = df2.head(1000000).copy()

# Replace every literal "[redacted]" placeholder with a single space.
df['processed_complaint'] = df['processed_complaint'].str.replace(r'\[redacted\]', ' ', regex=True)

In [None]:
# Print the per-value row counts of the two categorical columns, each under its
# own heading.
distribution_sections = (
    ("Issue Distribution:", 'Issue'),
    ("\nProduct Distribution:", 'Product'),
)
for heading, column in distribution_sections:
    print(heading)
    print(df[column].value_counts())

Issue Distribution:
Issue
Incorrect information on your report                                                230034
Improper use of your report                                                         125140
Problem with a credit reporting company's investigation into an existing problem     89671
Attempts to collect debt not owed                                                    60374
Problem with a company's investigation into an existing problem                      39146
                                                                                     ...  
Property was damaged or destroyed property                                               3
Lender damaged or destroyed vehicle                                                      2
Lost or stolen refund                                                                    1
Lender damaged or destroyed property                                                     1
Problem with an overdraft                                       

In [None]:
# The 'Product' column is the classification target.
labels = df.loc[:, 'Product']

In [None]:
# Turn the label values into integer class ids and store them in a new column.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
df['label'] = label_encoder.transform(labels)

# Token length every complaint is truncated/padded to; smaller values such as
# 64 or 128 train faster, longer sequences take more time.
max_length = 256

In [None]:
# 2. Hold out 20% of the rows for testing; the fixed random_state pins the split.
split_options = {"test_size": 0.2, "random_state": 42}
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed_complaint'].tolist(),
    df['label'].tolist(),
    **split_options,
)

In [None]:
# 3. Load the tokenizer that belongs to the checkpoint being fine-tuned; the
#    same checkpoint name is reused when the model itself is loaded.
MODEL_CHECKPOINT = "bert-base-multilingual-cased"
model_name = MODEL_CHECKPOINT
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
class ComplaintDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: Indexable, sized collection of input strings.
        labels: Indexable, sized collection of integer class ids, aligned
            with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` that
            returns a mapping of tensors with a leading batch axis of size 1.
        max_length: Length every sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail early on misaligned inputs instead of raising an IndexError (or
        # silently ignoring surplus labels) somewhere in the middle of training.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus a ``labels`` tensor."""
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        # Drop only the leading batch axis added by return_tensors='pt'.
        # A bare squeeze() would also remove the sequence axis when it has
        # size 1 and hand back a 0-d tensor.
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

In [None]:
# Wrap each split in a ComplaintDataset that shares the tokenizer and max_length.
train_dataset, test_dataset = (
    ComplaintDataset(split_texts, split_labels, tokenizer, max_length)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

In [None]:
# 4. Load the pre-trained checkpoint with a classification head that has one
#    output per class known to the label encoder.
num_labels = label_encoder.classes_.shape[0]
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Increase dropout probabilities for more regularization.
hidden_dropout = 0.5
attention_dropout = 0.5

# Keep the config in sync so the values travel with any saved checkpoint.
model.config.hidden_dropout_prob = hidden_dropout
model.config.attention_probs_dropout_prob = attention_dropout

# `model` has already been constructed at this point and its dropout layers
# carry their own probability, so editing the config alone does not change the
# forward pass. Push the values into the live modules as well.
# NOTE(review): any run that only edited the config likely trained with the
# checkpoint's default dropout; 0.5 is aggressive for a transformer encoder, so
# confirm it is really wanted before a long training run.
for module_name, module in model.named_modules():
    if isinstance(module, torch.nn.Dropout):
        # Presumably the attention-probability dropout is the one named
        # "...attention.self.dropout" in BERT; all others use the hidden rate.
        on_attention_probs = module_name.endswith("attention.self.dropout")
        module.p = attention_dropout if on_attention_probs else hidden_dropout
    elif isinstance(getattr(module, "dropout_prob", None), float):
        # NOTE(review): some attention implementations appear to keep the
        # probability as a plain float attribute instead of an nn.Dropout;
        # verify against the installed transformers version.
        module.dropout_prob = attention_dropout

In [None]:
# 5. Training configuration, collected in one mapping and handed to
#    TrainingArguments in a single call.
training_config = {
    "output_dir": './results',
    "num_train_epochs": 10,                  # upper bound on epochs
    "per_device_train_batch_size": 128,      # raise if GPU memory allows
    "per_device_eval_batch_size": 128,
    "evaluation_strategy": "epoch",          # evaluate after every epoch
    "save_strategy": "epoch",                # checkpoint after every epoch
    "load_best_model_at_end": True,          # restore the best checkpoint when done
    "learning_rate": 2e-5,                   # tweak if necessary
    "lr_scheduler_type": "linear",
    "warmup_steps": 500,                     # warm up over the first 500 steps
    "fp16": True,                            # mixed precision for faster training
    "logging_dir": './logs',
    "weight_decay": 0.01,                    # regularization
}
training_args = TrainingArguments(**training_config)




In [None]:
# Stop training once the monitored metric has failed to improve for two
# consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Bundle the model, the training arguments and both datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    callbacks=[early_stopping],
)


In [None]:
# Initialise Weights & Biases in "disabled" mode before training starts —
# presumably so the Trainer's W&B integration neither prompts for a login nor
# uploads anything (TODO confirm).
import wandb
wandb.init(mode="disabled")
# 6. Fine-tune the model
# Kept as the last expression so the returned TrainOutput is shown as the
# cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.7208,0.697142
2,0.6464,0.655087
3,0.5987,0.648548
4,0.5417,0.638926
5,0.4967,0.650462


Epoch,Training Loss,Validation Loss
1,0.7208,0.697142
2,0.6464,0.655087
3,0.5987,0.648548
4,0.5417,0.638926
5,0.4967,0.650462
6,0.446,0.664099


TrainOutput(global_step=37500, training_loss=0.6031292834472656, metrics={'train_runtime': 15285.8246, 'train_samples_per_second': 523.361, 'train_steps_per_second': 4.089, 'total_flos': 6.315742568448e+17, 'train_loss': 0.6031292834472656, 'epoch': 6.0})

In [None]:
# NOTE(review): the value returned by files.upload() is not assigned to
# anything, and the output recorded for this cell is a KeyboardInterrupt, so
# this looks like a leftover interactive step. It presumably waits for a manual
# file selection, which would stall a "Run all" — confirm whether the cell can
# be removed.
from google.colab import files
files.upload()

KeyboardInterrupt: 

In [None]:
# Save the fine-tuned model and tokenizer to a specified directory
output_dir = "/content/drive/MyDrive/prj/fine_tuned_model"

# Record the class-id <-> label-name mapping in the model config before saving.
# Otherwise the mapping exists only in the in-memory `label_encoder`, and a
# checkpoint reloaded in a new session could not turn predicted ids back into
# label names.
class_names = [str(name) for name in label_encoder.classes_]
trainer.model.config.id2label = dict(enumerate(class_names))
trainer.model.config.label2id = {name: idx for idx, name in enumerate(class_names)}

trainer.save_model(output_dir)           # Saves the model's weights and configuration
# Kept as the last expression so the list of written tokenizer files is shown.
tokenizer.save_pretrained(output_dir)      # Saves the tokenizer configuration and vocabulary


('/content/drive/MyDrive/prj/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/prj/fine_tuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/prj/fine_tuned_model/vocab.txt',
 '/content/drive/MyDrive/prj/fine_tuned_model/added_tokens.json',
 '/content/drive/MyDrive/prj/fine_tuned_model/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Reload both artifacts from the directory they were just saved to. This
# rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained(output_dir)
model = AutoModelForSequenceClassification.from_pretrained(output_dir)

In [None]:
# Evaluate the model on the test set
# NOTE(review): this is called on `trainer`, so it scores the model object the
# Trainer was built with, not the `model` name rebound by the reload cell.
# With no arguments it presumably falls back to the Trainer's eval_dataset
# (test_dataset) — confirm. No compute_metrics was passed to the Trainer, which
# is presumably why only the loss and timing figures are reported.
results = trainer.evaluate()
print(results)

{'eval_loss': 0.6389257311820984, 'eval_runtime': 349.6286, 'eval_samples_per_second': 572.036, 'eval_steps_per_second': 4.47, 'epoch': 6.0}


In [None]:
sample_text = "I am having a difficult time identifying directives in the manual"

# Tokenize the sample text exactly as the training examples were tokenized.
# The notebook-level `max_length` is reused instead of repeating the literal,
# so inference cannot drift from the training configuration.
inputs = tokenizer(
    sample_text,
    truncation=True,
    padding='max_length',
    max_length=max_length,
    return_tensors='pt'
)

In [None]:
# Choose the inference device here: no earlier cell defines `device`, so on a
# fresh kernel the lookup below would raise a NameError.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Put the model and its inputs on the same device before the forward pass.
model.to(device)
inputs = {key: value.to(device) for key, value in inputs.items()}

# Set model to evaluation mode and disable gradient computation
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Get logits and find the predicted class id (a single example, so .item()
# yields one Python int).
logits = outputs.logits
predicted_class_id = torch.argmax(logits, dim=1).item()

# Convert the class id back to the original label using the label encoder
predicted_label = label_encoder.inverse_transform([predicted_class_id])[0]

print("Predicted label:", predicted_label)

Predicted label: Credit reporting or other personal consumer reports
