In [1]:
##########################################################################
##### machine learning script for constructivity ######
## based upon the chatGPT_test_script2 script #########
##########################################################################
### Step 1: install packages - 
##### packages are necessary to install and load, given that they have the built in functions necessary to run complex tasks. 
## They effectively act as one of the most crucial time saving activities that would otherwise lead to overly long and 
## duplicative scripts. 

### note: you are not expected to remember all of these; just for the best to copy and paste these sections 

## read in pkgs 
import sys
import os
# !{sys.executable} -m pip install xgboost==1.7.5 # note: needed since it looks like anaconda installs an earlier version 
# of the package, which is not helpful. 1.7.5 allows for the categorical data of interest to be used. 

# !{sys.executable} -m pip install requests #; this code here can be used to install packages on anaconda/jupyter notebook 
### I believe the below should be installed by default 
import requests # web scraping 
from bs4 import BeautifulSoup # for web scraping 
import itertools # for efficient operation of loops 
import pandas as pd # necessary for reading in, creating, and manipulating data frames 
import csv ## for importing/exporting csvs 
import glob ## for finding files in path
import re
import numpy as np
import tiktoken
import openai

In [2]:
### import torch packages and such 
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
### these were loaded in successfully 

In [3]:
# Define your labeled dataset class
class LabeledDataset(Dataset):
    def __init__(self, tokenizer, comments, labels):
        self.tokenizer = tokenizer
        self.comments = comments
        self.labels = labels

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, index):
        comment = self.comments[index]
        label = self.labels[index]
        inputs = self.tokenizer.encode_plus(comment, add_special_tokens=True, padding='max_length', max_length=128, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [4]:
# Tokenizer and model configuration
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#config = GPT2Config.from_pretrained('gpt2', num_labels=2)  # 2 classes: toxic and non-toxic; can expand as needed 
config = GPT2Config.from_pretrained('gpt2')

In [5]:
#### import the data from OSU comments 
rmp_df = pd.read_csv('coding/text_cleaning_data/scored_rmp_data_osu_final.csv')
rmp_df

Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective,outrage_agg,personal_attack_agg,prejudice_agg
0,1,2.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Only graded on 4 assignments (30% Midterm, 30%...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
1,2,3.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Final grade is only based on two exams, readin...",0,0,...,0.0,0.0,0,0.0,1,0,0,0,0,0.0
2,3,4.0,1,POLITSC1100,OHIO STATE UNIVERSITY,ALEX,ACS,Class was super easy. One reading quiz a week ...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
3,4,4.0,2,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"Lecture could be dry at times, but I still lik...",0,0,...,0.0,0.0,0,0.0,0,0,1,0,0,0.0
4,5,5.0,3,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"was an excellent lecturer. Insightful, even h...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,16,5.0,3,PSYCH1100H,OHIO STATE UNIVERSITY,ANNA,YOCOM,"I had Dr. for PSYCH 1100H last semester, and ...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2796,17,5.0,3,PSYCH2200,OHIO STATE UNIVERSITY,ANNA,YOCOM,A great professor for this class. You only nee...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2797,18,5.0,3,PSY1100H,OHIO STATE UNIVERSITY,ANNA,YOCOM,LOVED this class! Prof made me want to attend...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2798,19,5.0,3,PSY2220,OHIO STATE UNIVERSITY,ANNA,YOCOM,Dr. made this class much better than I expect...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0


In [6]:
## with that read in, let's get the comments and labels specified 
comments = rmp_df['comment']
labels_const = rmp_df['constructive']

In [7]:
# Create the labeled dataset object
tokenizer.add_special_tokens({'pad_token': '0'})
dataset = LabeledDataset(tokenizer, comments, labels_const) # tokenizer above (from gpt2), 
#with comments and labels from the RMP data set 
# and the labels the 0s and 1s 

# Data loader
batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [8]:
# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning parameters
num_epochs = 5
learning_rate = 2e-5
dataloader

<torch.utils.data.dataloader.DataLoader at 0x2178331e160>

In [9]:
### chek the labels just to be safe 
labels_const.unique() #good
# the script below taking way too long; lets winnow down the data set 

array([0, 2, 1], dtype=int64)

In [12]:
# The model itself. Let's see if we can't get this working 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCEWithLogitsLoss()


for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    #print("got here")
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        print(input_ids)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        print(labels)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
save_path = 'coding/models'

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

tensor([[ 3347,   373,   416,  1290,   262,  1266,  1534,    13,   329,   428,
          1781,    13,  1375,  1718,  7685, 14262,  2587,   290,   925,   340,
          3499,    13,  2332, 48258,   274,   389,  9389,   475,  8005,    33,
         33076,   815,   423,  2761,   611,   484,  1100,   262,  1492,    13,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,    15,    15,    15,    15,
            15,    15,    15,    15,    15,    15,  

KeyboardInterrupt: 

In [17]:
#dataloader.__getitem__('input_ids')

AttributeError: 'DataLoader' object has no attribute '__getitem__'

In [19]:
len(dataset) ## length is 1400 ; is half of what it is in dataset; doesn't seem to be driving issues here. 

2800

In [20]:
type(labels_const) # so this is a pandas object, yet it did not break the laptop script... though it did run A LOT w/out
# doing anything. So maybe it is the pandas nature messing things up? What if we could recreate the error in the 
# chatGPT script? 

pandas.core.series.Series