In [1]:
##########################################################################
##### machine learning script for constructivity ######
## based upon the chatGPT_test_script2 script #########
##########################################################################
### Step 1: install packages - 
##### packages are necessary to install and load, given that they have the built in functions necessary to run complex tasks. 
## They effectively act as one of the most crucial time saving activities that would otherwise lead to overly long and 
## duplicative scripts. 
!{sys.executable} -m pip install tiktoken
!{sys.executable} -m pip install openai

### note: you are not expected to remember all of these; just for the best to copy and paste these sections 

## read in pkgs 
import sys
import os
# !{sys.executable} -m pip install xgboost==1.7.5 # note: needed since it looks like anaconda installs an earlier version 
# of the package, which is not helpful. 1.7.5 allows for the categorical data of interest to be used. 

# !{sys.executable} -m pip install requests #; this code here can be used to install packages on anaconda/jupyter notebook 
### I believe the below should be installed by default 
import requests # web scraping 
from bs4 import BeautifulSoup # for web scraping 
import itertools # for efficient operation of loops 
import pandas as pd # necessary for reading in, creating, and manipulating data frames 
import csv ## for importing/exporting csvs 
import glob ## for finding files in path
import re
import numpy as np
import tiktoken
import openai

'{sys.executable}' is not recognized as an internal or external command,
operable program or batch file.
'{sys.executable}' is not recognized as an internal or external command,
operable program or batch file.


In [2]:
## install the pkgs unlikely to be pre-installed 
!{sys.executable} -m pip install torch
!{sys.executable} -m pip install transformers
### import torch packages and such 

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
### these were loaded in successfully 



In [27]:
# Define your labeled dataset class
class LabeledDataset(Dataset):
    def __init__(self, tokenizer, comments, labels):
        self.tokenizer = tokenizer
        self.comments = comments
        self.labels = labels

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, index):
        comment = self.comments[index]
        label = self.labels[index]
        inputs = self.tokenizer.encode_plus(comment, add_special_tokens=True, padding='max_length', max_length=128, truncation=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']
        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [28]:
# Tokenizer and model configuration
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#config = GPT2Config.from_pretrained('gpt2', num_labels=2)  # 2 classes: toxic and non-toxic; can expand as needed 
config = GPT2Config.from_pretrained('gpt2')

In [29]:
#### import the data from OSU comments 
rmp_df = pd.read_csv('coding/text_cleaning_data/scored_rmp_data_osu_final.csv')
rmp_df

Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective,outrage_agg,personal_attack_agg,prejudice_agg
0,1,2.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Only graded on 4 assignments (30% Midterm, 30%...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
1,2,3.0,4,POLITSC3500,OHIO STATE UNIVERSITY,ALEX,ACS,"Final grade is only based on two exams, readin...",0,0,...,0.0,0.0,0,0.0,1,0,0,0,0,0.0
2,3,4.0,1,POLITSC1100,OHIO STATE UNIVERSITY,ALEX,ACS,Class was super easy. One reading quiz a week ...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
3,4,4.0,2,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"Lecture could be dry at times, but I still lik...",0,0,...,0.0,0.0,0,0.0,0,0,1,0,0,0.0
4,5,5.0,3,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,"was an excellent lecturer. Insightful, even h...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2795,16,5.0,3,PSYCH1100H,OHIO STATE UNIVERSITY,ANNA,YOCOM,"I had Dr. for PSYCH 1100H last semester, and ...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2796,17,5.0,3,PSYCH2200,OHIO STATE UNIVERSITY,ANNA,YOCOM,A great professor for this class. You only nee...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2797,18,5.0,3,PSY1100H,OHIO STATE UNIVERSITY,ANNA,YOCOM,LOVED this class! Prof made me want to attend...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
2798,19,5.0,3,PSY2220,OHIO STATE UNIVERSITY,ANNA,YOCOM,Dr. made this class much better than I expect...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0


In [31]:
### winnow down in order to create more balanced data 
constructive_df = rmp_df[rmp_df['constructive']>0]
unconstructive_df = rmp_df[rmp_df['constructive']==0]
constructive_df

Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective,outrage_agg,personal_attack_agg,prejudice_agg
7,8,2.0,5,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,Many questions are completely ambiguous or hav...,1,0,...,0.0,0.0,0,0.0,2,2,0,1,0,0.0
38,1,3.5,5,H508,OHIO STATE UNIVERSITY,HAL,ARKES,Dr. was a very good professor. I really had n...,0,0,...,0.0,0.0,0,0.0,0,2,0,0,0,0.0
64,16,3.0,3,POLI502,OHIO STATE UNIVERSITY,HERB,ASHER,"decent professor and a funny guy, although you...",0,0,...,0.0,0.0,0,0.0,1,2,0,0,0,0.0
94,18,2.0,4,POLITSC4138,OHIO STATE UNIVERSITY,LAWRENCE,BAUM,He does not use any kind of slides or another ...,0,0,...,0.0,0.0,0,0.0,1,2,1,0,0,0.0
137,1,5.0,3,SOC5925,OHIO STATE UNIVERSITY,PAUL,BELLAIR,Dr. is an excellent professor. He really care...,0,0,...,0.0,0.0,0,0.0,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2634,1,3.0,2,PS3280,OHIO STATE UNIVERSITY,SARA,WATSON,"Very kind, took 4285 with her last semester, d...",0,0,...,0.0,0.0,0,0.0,0,1,1,0,0,0.0
2635,2,3.0,3,POLITSC4285,OHIO STATE UNIVERSITY,SARA,WATSON,Extremely nice & knowledgeable yet unorganized...,0,0,...,0.0,0.0,0,0.0,1,1,0,0,0,0.0
2710,2,1.0,5,PHYS367,OHIO STATE UNIVERSITY,JOHN,WILKINS,His class was the most infuriating class I've ...,0,0,...,0.0,0.0,0,0.0,0,1,0,0,0,0.0
2712,1,1.0,4,SOCIOL3549,OHIO STATE UNIVERSITY,KRISTI,WILLIAMS,If you need to take this course and struggle w...,1,0,...,0.0,0.0,0,0.0,1,1,0,1,0,0.0


In [32]:
### lets test replace vals in column 
constructive_df['constructive'].values[constructive_df['constructive']>1] = 1
#constructive_df['constructive'].unique()

In [33]:
### now we will want to get 77 randomly sampled unconstructive comments 
import random
random.seed(1337)
unconstructive_df = unconstructive_df.sample(n=77)
unconstructive_df

Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective,outrage_agg,personal_attack_agg,prejudice_agg
2640,7,3.0,2,PS3290,OHIO STATE UNIVERSITY,SARA,WATSON,Prof is very kind and approachable. The class...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
1917,14,4.5,5,PSYCH3550,OHIO STATE UNIVERSITY,STEPHEN,PETRILL,"The class was definitely a challenge, but Prof...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
41,4,5.0,5,PSY508,OHIO STATE UNIVERSITY,HAL,ARKES,Simply awesome - best professor I ever had.,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
1310,1,2.5,3,POLSC101,OHIO STATE UNIVERSITY,R. WILLIAM,LIDDLE,"Nice guy, so not sure what all of the horrible...",0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
542,20,1.5,4,PSYCHH320,OHIO STATE UNIVERSITY,ROBERT,CUDECK,Classes are completely unorganized. He can be ...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,11,5.0,1,PSYCH4475,OHIO STATE UNIVERSITY,KRISTY,BOYCE,This is my second class was Dr. . Not only are...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
566,11,3.0,4,PSY508,OHIO STATE UNIVERSITY,MIKE,DEKAY,"Professor DeKay is a great professor, he reall...",0,0,...,0.0,0.0,0,0.0,0,0,2,0,0,0.0
1474,11,5.0,4,POLSC2400,OHIO STATE UNIVERSITY,ERIC,MACGILVRAY,Fantastic!!!!,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
96,20,5.0,4,POLITSC4138,OHIO STATE UNIVERSITY,LAWRENCE,BAUM,might be the best professor I've had at OSU....,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0


In [34]:
### good, no let's rebind 
rmp_df_test = constructive_df.append(unconstructive_df)
rmp_df_test.reset_index()
rmp_df_test ## works 

  rmp_df_test = constructive_df.append(unconstructive_df)


Unnamed: 0,row,quality_of_class,difficulty_of_class,class_code,college,prof_firstname,prof_lastname,comment,out_misrep,out_emo_lang,...,pb_origin,pb_nuero_div,pb_phys_able,pb_pol_affil,complex,constructive,reflective,outrage_agg,personal_attack_agg,prejudice_agg
7,8,2.0,5,POLITSC3115,OHIO STATE UNIVERSITY,ALEX,ACS,Many questions are completely ambiguous or hav...,1,0,...,0.0,0.0,0,0.0,2,1,0,1,0,0.0
38,1,3.5,5,H508,OHIO STATE UNIVERSITY,HAL,ARKES,Dr. was a very good professor. I really had n...,0,0,...,0.0,0.0,0,0.0,0,1,0,0,0,0.0
64,16,3.0,3,POLI502,OHIO STATE UNIVERSITY,HERB,ASHER,"decent professor and a funny guy, although you...",0,0,...,0.0,0.0,0,0.0,1,1,0,0,0,0.0
94,18,2.0,4,POLITSC4138,OHIO STATE UNIVERSITY,LAWRENCE,BAUM,He does not use any kind of slides or another ...,0,0,...,0.0,0.0,0,0.0,1,1,1,0,0,0.0
137,1,5.0,3,SOC5925,OHIO STATE UNIVERSITY,PAUL,BELLAIR,Dr. is an excellent professor. He really care...,0,0,...,0.0,0.0,0,0.0,0,1,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282,11,5.0,1,PSYCH4475,OHIO STATE UNIVERSITY,KRISTY,BOYCE,This is my second class was Dr. . Not only are...,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
566,11,3.0,4,PSY508,OHIO STATE UNIVERSITY,MIKE,DEKAY,"Professor DeKay is a great professor, he reall...",0,0,...,0.0,0.0,0,0.0,0,0,2,0,0,0.0
1474,11,5.0,4,POLSC2400,OHIO STATE UNIVERSITY,ERIC,MACGILVRAY,Fantastic!!!!,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0
96,20,5.0,4,POLITSC4138,OHIO STATE UNIVERSITY,LAWRENCE,BAUM,might be the best professor I've had at OSU....,0,0,...,0.0,0.0,0,0.0,0,0,0,0,0,0.0


In [35]:
## with that read in, let's get the comments and labels specified 
comments = rmp_df_test['comment']
labels_const = rmp_df_test['constructive']
comments = comments.tolist()
labels_const = labels_const.tolist()
type(labels_const) ## want to make just a list. 

list

In [36]:
### let's see if it is th emutliple cats causing issues; let's make into binary
labels_const

## when I changed a value to 2 in the chatGPT_test_script2, everything still worked. This implies that there is some other 
# error going on. 

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [37]:
# Create the labeled dataset object
tokenizer.add_special_tokens({'pad_token': '0'})
dataset = LabeledDataset(tokenizer, comments, labels_const) # tokenizer above (from gpt2), 
#with comments and labels from the RMP data set 
# and the labels the 0s and 1s 

# Data loader
batch_size = 2
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
len(dataloader) # lrngth now mismatch; diff than dataset 

77

In [40]:
# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tuning parameters
num_epochs = 5
learning_rate = 2e-5
dataloader

<torch.utils.data.dataloader.DataLoader at 0x1e37e581b50>

In [39]:
### chek the labels just to be safe 
# labels_const.unique() #good
# the script below taking way too long; lets winnow down the data set
#labels_const

#dataloader[[0]]

In [42]:
# The model itself. Let's see if we can't get this working 
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
loss_fn = torch.nn.BCEWithLogitsLoss()
### problem remains even if we change data solely to 0 or 1 
### managed to get the same error. Therefore, we should be able to work here. 
## we reset the index, then switched to 0 and 1, and it got to the model part. Let's see what happens.
#IF we just leave it like so 
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
        ## debug print here 
    print("got here")
    for batch in dataloader:

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        print(labels)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
        # print(outputs)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {avg_loss:.4f}")

# Save the fine-tuned model
save_path = 'coding/models'

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

got here
tensor([0, 0])
tensor([0, 1])


KeyboardInterrupt: 

In [50]:
len(dataloader)## only 77 

77