In [None]:
# libs needed that aren't on the huggingface container 
# !pip install datasets --user
# !pip install evaluate --user
# !pip install optuna --user

In [None]:
import csv
import re
import string

import evaluate
import nltk
import numpy as np
import pandas as pd
import torch
from datasets import Dataset, load_dataset, DatasetDict
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from transformers import BertTokenizerFast, TrainingArguments, Trainer, BertForSequenceClassification, \
    DataCollatorWithPadding, BertModel, BertweetTokenizer, RobertaForSequenceClassification, AutoConfig

In [None]:
# Load the tweet sentiment CSV, keep only the tweet text and its sentiment
# label, and drop rows with missing values.
SEED = 42  # fixed so the train/test split below is reproducible across runs
LABEL_TO_ID = {"negative": 0, "neutral": 1, "positive": 2}

# read_csv opens and closes the file itself, so no explicit open() is needed.
df = (
    pd.read_csv('Tweets.csv', header=0)
    .drop(columns=['textID', 'selected_text'])
    .rename(columns={"text": "texts", "sentiment": "label"})
    .dropna()
)

# Encode the sentiment strings as integer class ids. Unlike replace(), map()
# turns any unexpected value into NaN, so fail loudly instead of training on it.
label_ids = df['label'].map(LABEL_TO_ID)
if label_ids.isna().any():
    unknown = sorted(df.loc[label_ids.isna(), 'label'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels in Tweets.csv: {unknown}")
df['label'] = label_ids.astype(int)

# preserve_index=False: dropna() leaves gaps in the index, which would otherwise
# be carried into the dataset as an extra "__index_level_0__" column.
dataset = Dataset.from_pandas(df, preserve_index=False)
# train_test_split shuffles by default; the seed makes that shuffle repeatable.
dataset = dataset.train_test_split(test_size=0.2, seed=SEED)

print(dataset)
print(dataset["train"][0])

# Model 1:

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import seaborn as sns

def tokenize_function(example):
    """
    Runs the module-level ``tokenizer`` over the "texts" field of ``example``,
    with truncation enabled, and returns whatever the tokenizer returns."""
    texts = example["texts"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Precision and recall metric objects loaded through the `evaluate` library;
# compute_metrics calls their .compute() with weighted averaging.
metric1 = evaluate.load("precision")
metric2 = evaluate.load("recall")
    
def compute_metrics(eval_preds):
    """
    Computes weighted precision/recall and plots a normalized confusion matrix.

    Args:
        eval_preds: a ``(logits, labels)`` pair; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        dict with keys "precision" and "recall", each holding the object
        returned by the corresponding metric's ``compute`` call (unchanged
        from what the metric produces).
    """
    # Display names in class-id order (0, 1, 2).
    class_names = ["negative", "neutral", "positive"]

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    precision = metric1.compute(predictions=predictions, references=labels, average="weighted")
    recall = metric2.compute(predictions=predictions, references=labels, average="weighted")

    # sklearn's signature is confusion_matrix(y_true, y_pred): rows are the
    # actual classes and columns the predicted ones, matching the axis labels
    # set below. labels= pins a 3x3 matrix even when a class is missing from
    # this evaluation, and normalize="true" row-normalizes without dividing
    # by zero on an empty row.
    cmn = confusion_matrix(
        labels,
        predictions,
        labels=list(range(len(class_names))),
        normalize="true",
    )

    fig, ax = plt.subplots(figsize=(10, 10))
    sns.heatmap(cmn, annot=True, fmt='.2f', xticklabels=class_names, yticklabels=class_names, ax=ax)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    plt.show(block=False)
    # This function runs at every evaluation; close the figure so open
    # figures do not accumulate in memory.
    plt.close(fig)
    return {"precision": precision, "recall": recall}

# Tokenizer for the "bert-base-uncased" checkpoint; splits text into sub-word ids.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Run tokenize_function over the dataset, a batch of rows per call.
tokenized_tweets = dataset.map(function=tokenize_function, batched=True)

# Collator used when batching; built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
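# Evaluation-only run. To score a fine-tuned model, pass its checkpoint directory to
# from_pretrained below instead of the base model name. The Trainer class handles
# both training and evaluation; it is essentially a general-purpose model wrapper.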
# import torch
# seed=1
# np.random.seed(seed)
# if torch.cuda.is_available():
#   generator = torch.Generator('cuda').manual_seed(seed)
# else:
#   generator = torch.Generator().manual_seed(seed)

# BERT with a 3-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Only the output directory and evaluation batch size are set; everything
# else is left at the TrainingArguments defaults.
args = TrainingArguments(output_dir="./results_custom_model", per_device_eval_batch_size=20)

# No train_dataset is given: this Trainer is only used for evaluation.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_tweets["test"],
)

# Score the "test" split and show the resulting metrics.
eval_results = trainer.evaluate()
print(eval_results)

In [None]:
# to train model; see https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertForSequenceClassification
import torch

# Report the dataset layout and which compute device is visible.
print(tokenized_tweets)
cuda_available = torch.cuda.is_available()
print(cuda_available)
device = torch.device("cuda" if cuda_available else "cpu")
print(device)
print(torch.cuda.device_count())

# init model:
# start from the published "bert-base-uncased" configuration, then override
# both dropout rates and the number of output classes before loading weights.
configuration = AutoConfig.from_pretrained('bert-base-uncased')
config_overrides = {
    "hidden_dropout_prob": 0.2,
    "attention_probs_dropout_prob": 0.2,
    "num_labels": 3,
}
for option_name, option_value in config_overrides.items():
    setattr(configuration, option_name, option_value)
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=configuration)
# (disabled experiment: re-initialising the last two encoder layers)
# model.bert.encoder.layer[-1].apply(model._init_weights)
# model.bert.encoder.layer[-2].apply(model._init_weights)

# Training hyperparameters; evaluation runs every `eval_steps` optimizer steps.
training_options = dict(
    output_dir="./results_base_bert",
    evaluation_strategy="steps",
    eval_steps=500,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=1e-5,
    warmup_steps=10,
    weight_decay=.1,
)
args = TrainingArguments(**training_options)

# train
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_tweets["train"],
    eval_dataset=tokenized_tweets["test"],
)
trainer.train()
# Writes the trained model to args.output_dir.
trainer.save_model()

In [None]:
# Evaluate the current trainer on the "test" split with gradient tracking disabled.
with torch.no_grad():
    eval_results = trainer.evaluate(eval_dataset=tokenized_tweets["test"])
print(eval_results)

In [None]:
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from ``trial``.

    Draws a log-scaled learning rate between 2e-5 and 3e-5 and a per-device
    training batch size from [16, 32, 64, 128].

    Returns:
        dict keyed by "learning_rate" and "per_device_train_batch_size".
    """
    sampled_lr = trial.suggest_float("learning_rate", 2e-5, 3e-5, log=True)
    sampled_batch_size = trial.suggest_categorical(
        "per_device_train_batch_size", [16, 32, 64, 128]
    )
    return {
        "learning_rate": sampled_lr,
        "per_device_train_batch_size": sampled_batch_size,
    }

def model_init():
    """Build a fresh classifier for each trial.

    hyperparameter_search re-creates the model per trial, so the Trainer must
    be given a factory (model_init) rather than an already-built model.
    Uses the same `configuration` as the training cell.
    """
    return BertForSequenceClassification.from_pretrained("bert-base-uncased", config=configuration)


def _as_float(metric_value):
    """Return a metric as a float, unwrapping a one-entry dict such as {"precision": 0.9}."""
    if isinstance(metric_value, dict):
        metric_value = next(iter(metric_value.values()))
    return float(metric_value)


def compute_objective(metrics):
    """Return the two objectives for the search, in the order of `direction` below.

    NOTE(review): the original cell referenced compute_objective without
    defining it. Loss (minimized) and precision (maximized) are an assumed
    choice to match direction=["minimize", "maximize"]; confirm the intended
    second objective.
    """
    return [_as_float(metrics["eval_loss"]), _as_float(metrics["eval_precision"])]


# A separate Trainer built with model_init; the already-trained `trainer` is left untouched.
hp_trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_tweets["train"],
    eval_dataset=tokenized_tweets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Multi-objective Optuna search: one direction per value returned by compute_objective.
best_trials = hp_trainer.hyperparameter_search(
    direction=["minimize", "maximize"],
    backend="optuna",
    hp_space=optuna_hp_space,
    n_trials=10,
    compute_objective=compute_objective,
)

# Model 2:

In [None]:
def remove_punct(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # A translation table that maps each punctuation character to "deleted".
    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)

def tokenize(text):
    """Split ``text`` into tokens on runs of non-word characters.

    re.split yields an empty string when the text starts or ends with a
    separator (or is empty); those empty tokens are dropped, so "" gives [].

    Returns:
        list of non-empty token strings, in their original order.
    """
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = [token for token in re.split(r'\W+', text) if token]
    return tokens

# Stop-word sets already loaded, keyed by language name.
_STOPWORD_SETS = {}


def _stopword_set(language):
    """Load the NLTK stop-word list for ``language`` once and return it as a frozenset."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(nltk.corpus.stopwords.words(language))
    return _STOPWORD_SETS[language]


def remove_stopwords(tokenized_text):
    """Return the tokens of ``tokenized_text`` that are not English stop words.

    The stop-word list is read from NLTK on the first call only and kept as a
    set, so applying this function to every row does not reload the corpus or
    scan a list for each token.

    Args:
        tokenized_text: iterable of token strings.

    Returns:
        list of the tokens that are not in the stop-word set, order preserved.
    """
    stopwords = _stopword_set('english')
    text = [word for word in tokenized_text if word not in stopwords]
    return text

def lemmatize(tokenized_text):
    """Return a list with each token passed through an NLTK WordNetLemmatizer."""
    lemmatizer = nltk.WordNetLemmatizer()
    lemmas = []
    for token in tokenized_text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Clean every tweet: strip punctuation, lowercase, tokenize, drop stop words,
# lemmatize, then join the tokens back into one space-separated string.
df['texts'] = (
    df['texts']
    .apply(remove_punct)
    .apply(str.lower)
    .apply(tokenize)
    .apply(remove_stopwords)
    .apply(lemmatize)
    .apply(' '.join)
)

In [None]:
# Turn the cleaned tweets into word-level TF-IDF features.
tfidf_vect = TfidfVectorizer(analyzer='word')
X_tfidf = tfidf_vect.fit_transform(df['texts'])
# NOTE(review): the vectorizer is fitted on all rows before cross-validation, so
# vocabulary/IDF statistics are shared across folds; putting both steps in a
# Pipeline would fit them per fold instead.

# Fixed random_state so the forests, and therefore the ranking below, are reproducible.
rf = RandomForestClassifier(random_state=42)
param = {'n_estimators': [10, 150, 300], 'max_depth': [30, 60, 90, None]}

# Exhaustive search over `param` with 5-fold cross-validation on all available cores.
gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf, df['label'])
# Show the five parameter combinations with the best mean cross-validated score.
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False).head()