In [None]:
pip install pandas numpy scikit-learn transformers[torch] torch accelerate


In [None]:
pip install ipykernel


In [None]:
pip install tensorflow

In [1]:
import time
import torch
import pandas as pd
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from nltk.stem import WordNetLemmatizer
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import nltk

nltk.download('wordnet')




[nltk_data] Downloading package wordnet to C:\Users\Happy
[nltk_data]     Iguare\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
test_data = pd.read_csv('test.csv', encoding='ISO-8859-1')
train_data = pd.read_csv('train.csv', encoding='ISO-8859-1')

print(len(train_data))
print(len(test_data))
train_data.dropna(subset=['text'], inplace=True)
test_data.dropna(subset=['text'], inplace=True)

sentiment_mapping = {
    'negative': 0,
    'neutral': 1,
    'positive': 2
}

# Apply the mapping to the sentiment column in both DataFrames
train_data['sentimentLabel'] = train_data['sentiment'].map(sentiment_mapping)
test_data['sentimentLabel'] = test_data['sentiment'].map(sentiment_mapping)

# Drop any rows that failed to map (i.e., if any original 'sentiment' values didn't match the keys in `sentiment_mapping`)
train_data.dropna(subset=['sentimentLabel'], inplace=True)
test_data.dropna(subset=['sentimentLabel'], inplace=True)

# Simplify the datasets to contain only 'text' and 'encoded sentiment'
train = train_data[['text', 'sentimentLabel']]
test = test_data[['text', 'sentimentLabel']]

print(len(train))
print(len(test))

X_train = train['text']
y_train = train['sentimentLabel']

X_test = test['text']
y_test = test['sentimentLabel']

27481
4815
27480
3534


In [None]:
train

Unnamed: 0,text,sentimentLabel
0,"I`d have responded, if I were going",1
1,Sooo SAD I will miss you here in San Diego!!!,0
2,my boss is bullying me...,0
3,what interview! leave me alone,0
4,"Sons of ****, why couldn`t they put them on t...",0
...,...,...
27476,wish we could come see u on Denver husband l...,0
27477,I`ve wondered about rake to. The client has ...,0
27478,Yay good for both of you. Enjoy the break - y...,2
27479,But it was worth it ****.,2


In [11]:
def lemmatize_text(texts):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in text.split()]) for text in texts]

# Enhanced pipeline with VotingClassifier
start_time = time.time()
pipeline = Pipeline([
    ('lemmatize', FunctionTransformer(lambda x: lemmatize_text(x), validate=False)),
    ('tfidf', TfidfVectorizer(lowercase=True, max_features=5000)),
    ('clf', VotingClassifier(estimators=[
        ('svc', SVC(probability=True, kernel='linear', C=1.0)),
        ('mnb', MultinomialNB(alpha=1.0))
    ], voting='soft')),
])

param_grid = {
    'tfidf__ngram_range': [(1, 2)],
    'tfidf__min_df': [5],
    'tfidf__max_df': [0.75],
    'clf__svc__C': [1.0],
    'clf__mnb__alpha': [1.0,10.0],
}

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Grid search with the updated pipeline and StratifiedKFold
grid_search = GridSearchCV(pipeline, param_grid, cv=cv_strategy, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Predictions and classification report
SVM_MNB_pred = best_model.predict(X_test)
print("Enhanced Classification Report")
print(classification_report(y_test, SVM_MNB_pred, target_names=['negative', 'neutral', 'positive']))
end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds.")

Enhanced Classification Report
              precision    recall  f1-score   support

    negative       0.75      0.62      0.68      1001
     neutral       0.63      0.79      0.70      1430
    positive       0.83      0.70      0.76      1103

    accuracy                           0.71      3534
   macro avg       0.74      0.70      0.71      3534
weighted avg       0.73      0.71      0.71      3534

Training completed in 1462.72 seconds.


In [3]:
from sklearn.metrics import confusion_matrix
class PerformanceMetrics:
    def __init__(self, y_true, y_pred, class_labels):
        self.y_true = y_true
        self.y_pred = y_pred
        self.class_labels = class_labels
        self.conf_matrix = confusion_matrix(y_true, y_pred)
        self.total_positive = self.conf_matrix.sum(axis=1)  
        self.total_negative = self.conf_matrix.sum(axis=0)  
        self.total_predictions = len(y_pred)

    def calculate_tpr(self):
        tpr = {}
        for i, class_name in enumerate(self.class_labels):
            TP = self.conf_matrix[i, i]  
            FN = self.total_positive[i] - TP  
            tpr[class_name] = TP / (TP + FN) if (TP + FN) != 0 else 0
        return tpr

    def calculate_fpr(self):
        fpr = {}
        total = self.conf_matrix.sum()
        for i, class_name in enumerate(self.class_labels):
            FP = self.total_negative[i] - self.conf_matrix[i, i]
            TN = total - (self.total_positive[i] + FP)  
            fpr[class_name] = FP / (FP + TN) if (FP + TN) != 0 else 0
        return fpr

    def calculate_statistical_parity(self):
        statistical_parity = {}
        for i, class_name in enumerate(self.class_labels):
            statistical_parity[class_name] = self.total_negative[i] / self.total_predictions
        return statistical_parity

    def compute_all_metrics(self):
        return {
            'TPR': self.calculate_tpr(),
            'FPR': self.calculate_fpr(),
            'Statistical Parity': self.calculate_statistical_parity()
        }
    def results_as_dataframe(self):
        metrics = self.compute_all_metrics()
        df = pd.DataFrame(metrics)
        return df

In [10]:
if __name__ == "__main__":
    y_true = y_test 
    y_pred = SVM_MNB_pred 
    class_labels = ['negative', 'neutral', 'positive']  

    metrics = PerformanceMetrics(y_true, y_pred, class_labels)
    results = metrics.compute_all_metrics()
    print(results)
    results_df = metrics.results_as_dataframe()
    print(results_df)


{'TPR': {'negative': 0.6173826173826173, 'neutral': 0.7902097902097902, 'positive': 0.6980961015412511}, 'FPR': {'negative': 0.08093170153967627, 'neutral': 0.3127376425855513, 'positive': 0.06293706293706294}, 'Statistical Parity': {'negative': 0.23288058856819469, 'neutral': 0.5059422750424448, 'positive': 0.2611771363893605}}
               TPR       FPR  Statistical Parity
negative  0.617383  0.080932            0.232881
neutral   0.790210  0.312738            0.505942
positive  0.698096  0.062937            0.261177


In [10]:
def encode_reviews(tokenizer, reviews, max_length=64):
    return tokenizer(list(reviews), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = encode_reviews(tokenizer, X_train)
test_encodings = encode_reviews(tokenizer, X_test)

train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

start_time = time.time()
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3  
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

scaler = GradScaler()

model.train()
for epoch in range(3):  
    for batch in train_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        optimizer.zero_grad()
        with autocast():
            outputs = model(**inputs)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

model.eval()
bert_predictions, bert_true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }
        outputs = model(**inputs)
        logits = outputs.logits
        bert_predictions.extend(torch.argmax(logits, dim=-1).tolist())
        bert_true_labels.extend(batch[2].tolist())

end_time = time.time()
total_time = end_time - start_time
#accuracy = accuracy_score(bert_true_labels, bert_predictions)
print(f"Total execution time: {total_time:.2f} seconds")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.35207176208496094
Epoch 2, Loss: 0.4614095687866211
Epoch 3, Loss: 0.1747008115053177
Total execution time: 599.21 seconds


In [5]:
bert_report = classification_report(bert_true_labels, bert_predictions, target_names=['negative','neutral','positive'], digits=4)
print("Classification Report:\n", bert_report)

Classification Report:
               precision    recall  f1-score   support

    negative     0.7669    0.8252    0.7950      1001
     neutral     0.7678    0.7469    0.7572      1430
    positive     0.8574    0.8286    0.8428      1103

    accuracy                         0.7946      3534
   macro avg     0.7974    0.8002    0.7983      3534
weighted avg     0.7955    0.7946    0.7946      3534



In [6]:
if __name__ == "__main__":
    y_true = bert_true_labels 
    y_pred = bert_predictions 
    class_labels = ['negative', 'neutral', 'positive']  

    metrics = PerformanceMetrics(y_true, y_pred, class_labels)
    results = metrics.compute_all_metrics()
    print(results)
    results_df = metrics.results_as_dataframe()
    print(results_df)

{'TPR': {'negative': 0.8251748251748252, 'neutral': 0.7468531468531469, 'positive': 0.828649138712602}, 'FPR': {'negative': 0.09909198578760363, 'neutral': 0.1535171102661597, 'positive': 0.06252570958453312}, 'Statistical Parity': {'negative': 0.30475382003395585, 'neutral': 0.3936049801924165, 'positive': 0.30164119977362763}}
               TPR       FPR  Statistical Parity
negative  0.825175  0.099092            0.304754
neutral   0.746853  0.153517            0.393605
positive  0.828649  0.062526            0.301641


In [7]:
def encode_reviews(tokenizer, reviews, max_length=64):
    return tokenizer(list(reviews), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")

# Initialize GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  

train_encodings = encode_reviews(tokenizer, X_train)
test_encodings = encode_reviews(tokenizer, X_test)

train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

start_time = time.time()

model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=3)
model.config.pad_token_id = tokenizer.eos_token_id
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3 
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
scaler = GradScaler()

model.train()
for epoch in range(3):  
    for batch in train_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        optimizer.zero_grad()
        with autocast():
            outputs = model(**inputs)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

model.eval()
gpt_predictions, gpt_true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }
        outputs = model(**inputs)
        logits = outputs.logits
        gpt_predictions.extend(torch.argmax(logits, dim=-1).tolist())
        gpt_true_labels.extend(batch[2].tolist())

end_time = time.time()
total_time = end_time - start_time
#accuracy = accuracy_score(gpt_true_labels, gpt_predictions)
print(f"Total execution time: {total_time:.2f} seconds")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.5870822072029114
Epoch 2, Loss: 0.3996003568172455
Epoch 3, Loss: 0.288114994764328
Total execution time: 1240.44 seconds


In [8]:
gpt_report = classification_report(gpt_true_labels, gpt_predictions, target_names=['negative','neutral', 'positive'], digits=4)
print("Classification Report:\n", gpt_report)

Classification Report:
               precision    recall  f1-score   support

    negative     0.7927    0.7982    0.7954      1001
     neutral     0.7618    0.7650    0.7634      1430
    positive     0.8394    0.8296    0.8345      1103

    accuracy                         0.7946      3534
   macro avg     0.7980    0.7976    0.7978      3534
weighted avg     0.7948    0.7946    0.7947      3534



In [9]:
if __name__ == "__main__":
    y_true = gpt_true_labels 
    y_pred = gpt_predictions 
    class_labels = ['negative', 'neutral', 'positive']  

    metrics = PerformanceMetrics(y_true, y_pred, class_labels)
    results = metrics.compute_all_metrics()
    print(results)
    results_df = metrics.results_as_dataframe()
    print(results_df)

{'TPR': {'negative': 0.7982017982017982, 'neutral': 0.765034965034965, 'positive': 0.8295557570262919}, 'FPR': {'negative': 0.08251085669166995, 'neutral': 0.16254752851711027, 'positive': 0.07198683669271905}, 'Statistical Parity': {'negative': 0.28522920203735147, 'neutral': 0.40633842671194115, 'positive': 0.30843237125070744}}
               TPR       FPR  Statistical Parity
negative  0.798202  0.082511            0.285229
neutral   0.765035  0.162548            0.406338
positive  0.829556  0.071987            0.308432


**Additional Analysis**

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast

# BERT Tokenization and Encoding Function
def encode_reviews(tokenizer, reviews, max_length=64):
    return tokenizer(list(reviews), padding='max_length', truncation=True, max_length=max_length, return_tensors="pt")


tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encoding the training and testing data
train_encodings = encode_reviews(tokenizer, X_train)
test_encodings = encode_reviews(tokenizer, X_test)

# Convert labels to tensors
train_labels = torch.tensor(y_train.values)
test_labels = torch.tensor(y_test.values)

# Prepare datasets for DataLoader
train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
test_dataset = TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'], test_labels)

# Data Loaders
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))
test_loader = DataLoader(test_dataset, batch_size=batch_size)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * 3
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Mixed precision setup
scaler = GradScaler()

# Training loop with mixed precision
model.train()
for epoch in range(3):  # Adjust epochs based on your model's performance and computational resources
    for batch in train_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }
        optimizer.zero_grad()
        with autocast():
            outputs = model(**inputs)
            loss = outputs.loss
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()  # Update the learning rate
    print(f"Epoch {epoch + 1}, Loss: {loss.item()}")

# Model evaluation
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        batch = tuple(t.to(model.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1]
        }
        outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=-1).tolist())

accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1, Loss: 0.4790515899658203
Epoch 2, Loss: 0.38086065649986267
Epoch 3, Loss: 0.27932611107826233
Test Accuracy: 79.97%


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import time

# Advanced text preprocessing
class TextPreprocessor:
    def __init__(self, vectorizer):
        self.vectorizer = vectorizer

    def fit(self, X, y=None):
        return self.vectorizer.fit(X)

    def transform(self, X):
        X_transformed = self.vectorizer.transform(X)
        # Further processing like stemming/lemmatization could be added here
        return X_transformed

# Updated pipeline with dimensionality reduction
svm_pipeline = Pipeline([
    ('tfidf', TextPreprocessor(TfidfVectorizer(stop_words=ENGLISH_STOP_WORDS, lowercase=True))),
    ('svd', TruncatedSVD(n_components=100)),  # Reducing dimensions
    ('clf', SVC(probability=True, class_weight='balanced')),
])

svm_param_grid = {
    'tfidf__vectorizer__ngram_range': [(1, 2), (1, 3)],
    'tfidf__vectorizer__min_df': [3, 5],
    'tfidf__vectorizer__max_df': [0.5, 0.75],
    'svd__n_components': [100, 200],  # Experiment with the number of components
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf'],
    'clf__gamma': ['scale'],
}

kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

start_time = time.time()

svm_grid_search = GridSearchCV(svm_pipeline, svm_param_grid, cv=kfolds, scoring='accuracy', n_jobs=-1, verbose=3)
svm_grid_search.fit(X_train, y_train)

end_time = time.time()
print(f"Training completed in {end_time - start_time:.2f} seconds.")

svm_best_model = svm_grid_search.best_estimator_
svm_pred = svm_best_model.predict(X_test)

print("Optimized SVM Classification Report:")
print(classification_report(y_test, svm_pred, target_names=['negative', 'neutral', 'positive']))


In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # Move the model to the GPU


In [None]:
!python -m ipykernel install --user --name=newenv --display-name="Python (newenv)"


In [3]:
class SDataset:
    def __init__(self, encodings, labels):
        self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}  # Move tensor conversion here
        self.labels = torch.tensor(labels, dtype=torch.int64)  # Move tensor conversion here

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)



def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    report = classification_report(labels, preds, target_names=list(sentiment_mapping.keys()), output_dict=True)
    return {k: v for k, v in report.items() if k in sentiment_mapping.keys()}

In [4]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config,TrainingArguments, Trainer

In [2]:
!pip install accelerate -U --force-reinstall
!pip install transformers[torch] --force-reinstall

Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.17 (from accelerate)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting packaging>=20.0 (from accelerate)
  Downloading packaging-24.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.5/53.5 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting psutil (from accelerate)
  Downloading psutil-5.9.8-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (288 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.2/288.2 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCo

Collecting transformers[torch]
  Downloading transformers-4.39.3-py3-none-any.whl (8.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting filelock (from transformers[torch])
  Using cached filelock-3.13.4-py3-none-any.whl (11 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers[torch])
  Using cached huggingface_hub-0.22.2-py3-none-any.whl (388 kB)
Collecting numpy>=1.17 (from transformers[torch])
  Using cached numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting packaging>=20.0 (from transformers[torch])
  Using cached packaging-24.0-py3-none-any.whl (53 kB)
Collecting pyyaml>=5.1 (from transformers[torch])
  Using cached PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (705 kB)
Collecting regex!=2019.12.17 (from transformers[torch])
  Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7

In [5]:
import accelerate
import transformers
print("Accelerate version:", accelerate.__version__)
print("Transformers version:", transformers.__version__)
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config, TrainingArguments, Trainer
from sklearn.metrics import classification_report


In [18]:

class SDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
        self.labels = torch.tensor(labels, dtype=torch.int64)

    def __getitem__(self, idx):
        item = {key: self.encodings[key][idx] for key in self.encodings}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

#def compute_metrics(pred):
  #  labels = pred.label_ids
   # preds = np.argmax(pred.predictions, axis=1)
    #report = classification_report(labels, preds, target_names=['negative', 'neutral', 'positive'], output_dict=False)
    #return {"classification_report": report}

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    accuracy = (preds == labels).mean()
    return {"accuracy": accuracy}

# Tokenize data
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
#train_encodings = tokenizer(train['text'].tolist(), truncation=True, padding=True, max_length=128)
#test_encodings = tokenizer(test['text'].tolist(), truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train['text'].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")
test_encodings = tokenizer(test['text'].tolist(), truncation=True, padding=True, max_length=128, return_tensors="pt")

# Prepare datasets
train_dataset = SDataset(train_encodings, train['sentimentLabel'].tolist())
test_dataset = SDataset(test_encodings, test['sentimentLabel'].tolist())

# Configure the model
config = GPT2Config.from_pretrained('gpt2', num_labels=3)
config.pad_token_id = tokenizer.eos_token_id

model = GPT2ForSequenceClassification.from_pretrained('gpt2', config=config)

from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    fp16=True  
    
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate the model
trainer.train()
results = trainer.evaluate()
print("Evaluation results:", results)



  self.encodings = {key: torch.tensor(val) for key, val in encodings.items()}
Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None)


Step,Training Loss,Validation Loss,Accuracy
500,0.9134,0.726535,0.732598
1000,0.6758,0.570133,0.764007
1500,0.5899,0.582142,0.760611
2000,0.5997,0.652298,0.753254
2500,0.579,0.59458,0.751839
3000,0.5575,0.547618,0.788059
3500,0.5415,0.552083,0.795699
4000,0.4626,0.578681,0.791737
4500,0.4637,0.556078,0.799095
5000,0.4434,0.53118,0.799943


Evaluation results: {'eval_loss': 0.5311800241470337, 'eval_accuracy': 0.7999434069043577, 'eval_runtime': 51.4646, 'eval_samples_per_second': 68.669, 'eval_steps_per_second': 8.588, 'epoch': 2.0}


In [19]:
predictions = trainer.predict(test_dataset)

# Process the predictions
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

#classification report
report = classification_report(true_labels, pred_labels, target_names=['negative', 'neutral', 'positive'], digits=4)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

    negative     0.8146    0.7812    0.7976      1001
     neutral     0.7569    0.7902    0.7732      1430
    positive     0.8464    0.8296    0.8379      1103

    accuracy                         0.7999      3534
   macro avg     0.8060    0.8003    0.8029      3534
weighted avg     0.8012    0.7999    0.8003      3534



In [6]:
from torch.utils.data import DataLoader, Dataset
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.cuda.amp import GradScaler, autocast
import time

# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenization and dataset preparation
def tokenize_data(text):
    return tokenizer(text, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

class SentimentDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = [tokenize_data(text) for text in texts]
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val.squeeze(0) for key, val in self.encodings[idx].items()}  # Squeeze is necessary because tokenize_data adds an extra dimension
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Prepare datasets
train_dataset = SentimentDataset(train['text'].tolist(), train['sentimentLabel'].tolist())
test_dataset = SentimentDataset(test['text'].tolist(), test['sentimentLabel'].tolist())

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

# Model initialization
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=3)
model.config.pad_token_id = tokenizer.pad_token_id
model.to('cuda')

start_time = time.time()
# Optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
total_steps = len(train_loader) * 3  # Assuming 3 epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

scaler = GradScaler()   
model.train()
for epoch in range(3):
    for batch_index, batch in enumerate(train_loader):
        optimizer.zero_grad()
        with autocast():
            inputs = {k: v.to('cuda') for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
        
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()


    print(f"Epoch {epoch+1} completed. Loss: {loss.item()}")

# Evaluation
model.eval()
total_eval_accuracy = 0
for batch in test_loader:
    with torch.no_grad(), autocast():
        inputs = {k: v.to('cuda') for k, v in batch.items()}
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == inputs['labels']).sum().item()

# Calculate accuracy
total_correct = total_eval_accuracy
total = len(test_dataset)
accuracy = total_correct / total
print(f"Accuracy on test set: {accuracy:.2f}")

end_time = time.time()
total_time = end_time - start_time
print(f"Total execution time: {total_time:.2f} seconds")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed. Loss: 0.7008514404296875
Epoch 2 completed. Loss: 0.6493625640869141
Epoch 3 completed. Loss: 0.003504633903503418
Accuracy on test set: 0.79
Total execution time: 3540.52 seconds
