In [1]:
import transformers
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline

import tensorflow as tf
import pandas as pd
import json
import gc

from sklearn.model_selection import train_test_split

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stopw = stopwords.words('english')

import seaborn as sns
import matplotlib.pyplot as plt
from plotly.offline import iplot

from tqdm import tqdm


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielwang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
csv = pd.read_csv("cleaned_slayer.csv")

In [3]:
csv

Unnamed: 0.1,Unnamed: 0,section_id,content,file_name
0,0,##87151,43 ripsnorter unplayable watson is gone and h...,5
1,1,##97951,welcome to the world confederation for physic...,5
2,2,##209450,just over a week ago australia were bowled ou...,5
3,3,##239752,release title 345612toolong 2007 release date...,5
4,4,##411252,lte both radio and core network evolution is ...,5
...,...,...,...,...
1645,1645,##1151552,study could advantageously be carried on by s...,16
1646,1646,##1153651,the official bit the concept of organic expre...,16
1647,1647,##1154050,nietzsche hegel and schopenhauer hegel and th...,16
1648,1648,##1154350,the forex market the foreign exchange market ...,16


In [4]:
data_texts = csv['content'].to_list()

data_labels = csv['file_name'].to_list()

In [5]:
#Train Test Split
train_texts, val_texts, train_labels, val_labels = train_test_split(data_texts, data_labels, test_size = 0.2, random_state = 0 )


train_texts, test_texts, train_labels, test_labels = train_test_split(train_texts, train_labels, test_size = 0.01, random_state = 0 )


In [6]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
import torch

from sklearn.metrics import accuracy_score, precision_score, f1_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = predictions.argmax(axis=-1)  # Get the index of the highest logit as the predicted class
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')
    return {
        "accuracy": acc,
        "precision": precision,
        "f1": f1
    }

# Model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

# Custom Dataset class for PyTorch
class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Convert to PyTorch Datasets
train_dataset = TextDataset(train_encodings, train_labels)
val_dataset = TextDataset(val_encodings, val_labels)

# Model definition
trainer_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=5)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    num_train_epochs=7,              
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=64,   
    warmup_steps=500,                
    weight_decay=1e-5,               
    logging_dir='./logs',            
    evaluation_strategy="steps",     
    eval_steps=100                   
)

# Trainer setup
trainer = Trainer(
    model=trainer_model,                 
    args=training_args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.

`evaluation_strategy` is deprecated and will be removed in version 4.46 of 🤗 Transformers. Use `eval_strategy` instead



Step,Training Loss,Validation Loss,Accuracy,Precision,F1
100,No log,0.915425,0.269697,0.113353,0.15131
200,No log,0.629339,0.421212,0.301976,0.329891
300,No log,0.575003,0.421212,0.305032,0.330303
400,No log,0.65877,0.427273,0.284557,0.328103
500,0.550100,0.847353,0.415152,0.281724,0.319383



Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



TrainOutput(global_step=574, training_loss=0.4972773960658482, metrics={'train_runtime': 2346.8163, 'train_samples_per_second': 3.895, 'train_steps_per_second': 0.245, 'total_flos': 1211081748817920.0, 'train_loss': 0.4972773960658482, 'epoch': 7.0})

In [7]:

trainer.save_model("file:///Users/danielwang/Ling%20380/")

In [20]:
test = pd.read_csv("test.csv")

In [25]:
# Convert to numeric with errors='coerce' (turns invalid values into NaN)
test['country'] = pd.to_numeric(test['country'], errors='coerce')

# Drop rows with NaN values (those that couldn't be converted)
test = test.dropna(subset=['country'])



In [26]:
test

Unnamed: 0.1,Unnamed: 0,fileID,text,country
0,0,176501,Either you 're flat or I am . I beg your pardo...,1.0
1,1,5992934,"Good morning , kitty ! Come in here , baby ! I...",1.0
2,2,4613561,There 's a place called Penguin island In the ...,1.0
3,3,3639561,"- Thank you . That will be all , Moses . - Yes...",1.0
4,4,4613564,Hands up ! Your money or your life . Gim me th...,1.0
...,...,...,...,...
676,676,4930051,"Come inside . Come on in , boys . - The crowd ...",1.0
677,677,4535604,"NARRATOR : In May of 2012 , Marshall and Lily ...",1.0
678,678,5809034,"Previously on "" Missing "" ... Something 's hap...",1.0
679,679,4833491,"HANNAH : Fuck , I 'm crushed . MARNIE : No , y...",1.0


In [27]:
test_texts = test['text'].to_list()

test_labels = test['country'].to_list()

In [28]:
from transformers import DistilBertTokenizer

# Load the tokenizer used during training
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the test data
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)


In [29]:
class NewTestDataset(Dataset):
    def __init__(self, encodings, labels = None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

# Instantiate dataset
test_dataset = TextDataset(test_encodings, test_labels)


In [30]:
test_results = trainer.predict(test_dataset)


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [31]:
metrics = test_results.metrics
print(f"Test Accuracy: {metrics['test_accuracy']}")
print(f"Test F1: {metrics['test_f1']}")
print(f"Test Precision: {metrics['test_precision']}")

Test Accuracy: 0.7558823529411764
Test F1: 0.7686626742281454
Test Precision: 0.7833735596130005


In [8]:
# Evaluate the model on the validation dataset
# Evaluate the model
eval_results = trainer.evaluate()

# Output validation loss and accuracy
validation_loss = eval_results.get("eval_loss", None)
accuracy = eval_results.get("eval_accuracy", None)

print(f"Validation Loss: {validation_loss}")
print(f"Validation Accuracy: {accuracy}")



Validation Loss: 0.8993900418281555
Validation Accuracy: 0.4212121212121212


In [1]:
from sklearn.metrics import precision_score, f1_score

In [2]:
import torch
#Inference
# Tokenize the test input
test_text = test_texts
test_encoding = tokenizer(test_text, truncation=True, padding=True, max_length=512, return_tensors='pt')

# Set the model to evaluation mode
trainer_model.eval()

# Move tensors and model to the same device (CPU or GPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
trainer_model.to(device)
test_encoding = {key: val.to(device) for key, val in test_encoding.items()}

# Make predictions
with torch.no_grad():  # Disable gradient computation for inference
    output = trainer_model(**test_encoding)

# Get the predicted class index
logits = output.logits  # Access logits from the output
predicted_class = torch.argmax(logits, dim=1).item()

# Print the result
print(f"Predicted class: {predicted_class}")



NameError: name 'test_texts' is not defined