In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
!pip install torch
!pip install transformers
!pip install transformers[torch]




[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
# import the dataset
import pandas as pd
import numpy as np
import os

# Directory path
#CHRIS directory_path = '/content/drive/MyDrive/all_lectures.csv'
directory_path = '../data/all_lectures.csv'

# Initialize an empty DataFrame
df = pd.DataFrame(columns=['Week Number', 'Lesson Number', 'Lesson Title', 'Transcript'])

# Read in csv to dataframe
df = pd.read_csv(directory_path)

# Display the resulting DataFrame
df.head()

Unnamed: 0,Week Number,Lesson Number,Lesson Title,Transcript
0,1,1,Natural Language Content Analysis,This lecture is about Natural Language of Cont...
1,1,2,Text Access,"In this lecture,\r\nwe're going to talk about ..."
2,1,3,Text Retrieval Problem,This lecture is about\r\nthe text retrieval pr...
3,1,4,Overview of Text Retrieval Methods,This lecture is a overview of\r\ntext retrieva...
4,1,5,Vector Space Model - Basic Idea,This lecture is about the\r\nvector space retr...


In [4]:
# clean up words in dataset -- this includes removing stopwords
import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words, brown

nltk.download("stopwords")
nltk.download("words")
nltk.download("brown")
nltk.download("punkt")

lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# initialize dictionary
global_dictionary  = set(words.words()) | set(brown.words())
global_dictionary = {word.lower() for word in global_dictionary}
remove_words = list(stop_words) # might need to use word_tokenize
remove_words.extend(['Play', 'video', 'starting', 'at', '::', 'follow', 'transcript', 'natural', 'language', 'lecture', 'processing']) # remove the common words that are included in transcript

# Now start actually cleaning the text
def clean_text(text):
    text = text.lower() # lowercase
    text = text.replace('\n', ' ') # remove newline indicator
    text = re.sub(r'[^a-zA-Z\s]', '', text) # case
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+|www\S+', '', text) # website
    text = re.sub(r'(\b\w+\b)(?: \1)+', r'\1', text) # remove duplicate next word after space
    text = re.sub(r'\b(?![aI]\b)\w\b', '', text)

    return text

# Remove stopwords and only keep words in dictionary
def remove_terms(text):
    text = clean_text(text)
    words = text.split()
    # filtered_words = [word for word in words if word not in remove_words] # remove stopwords
    filtered_words = [word for word in words if word in global_dictionary] # remove if not in global dictionary
    return " ".join(filtered_words)

# Tokenize reviews + remove stop words + filter only nouns
def tokenize_and_filter(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words] # if word.lower() not in stop_words and len(word) > 2]
    # print(words)
    # pos_comment = nltk.pos_tag(words)
    # filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    return words #filtered

def lower_text(text):
    words = text.lower()
    return words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
pd.options.display.max_rows = 500

df['Transcript_Cleaned'] = df['Transcript'].apply(remove_terms)
df['Transcript_Cleaned'] = df['Transcript_Cleaned'].apply(lower_text)
# Skipping this in order to tokenize later
# df['Transcript_Cleaned'] = df['Transcript_Cleaned'].map(tokenize_and_filter)
# df['Transcript_Cleaned']

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from transformers import TrainingArguments, Trainer
from transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, GPT2Config

model_choice = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_choice)
tokenizer.add_special_tokens({'pad_token': '[PAD]'}) # added padding token

class CustomGPT2Model(GPT2LMHeadModel):
    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = super().forward(input_ids, attention_mask=attention_mask, labels=labels)

        # Extract the loss from the outputs if labels are provided
        loss = outputs.loss if labels is not None else None

        return {"loss": loss, "logits": outputs.logits}

# Replace "gpt2" with your desired model configuration
config = GPT2Config.from_pretrained(model_choice)

# Instantiate your custom GPT-2 model
model = CustomGPT2Model(config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# modify data to be used as a list
data = list(df['Transcript_Cleaned'])

# create tokenizer function to allow arguemnts in apply method to df columns
def tokenize_text(tokens):
    return tokenizer(tokens, padding=True, truncation=True, max_length=512, return_tensors="pt")

# df['Tokenized_Text'] = df['Transcript_Cleaned'].apply(lambda x: tokenizer.encode(" ".join(x), return_tensors="pt"))
train, val = train_test_split(data, test_size=0.2)
train_tokenized = tokenizer(train, padding=True, truncation=True, max_length=512)
val_tokenized = tokenizer(val, padding=True, truncation=True, max_length=512)

In [17]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
        self.labels = [ids[:-1] for ids in encodings["input_ids"]]  # Shift labels by one position
        self.labels = [torch.tensor(ids + [0]) for ids in self.labels]  # Set the last token to 0 or another appropriate value

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [18]:
train_dataset = Dataset(train_tokenized)
val_dataset = Dataset(val_tokenized)

In [19]:
# Define Trainer
args = TrainingArguments(
    output_dir="/content/drive/MyDrive/output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=10,
    gradient_accumulation_steps=6,
    learning_rate=1e-5,
    evaluation_strategy="steps",
    eval_steps=10,  # Set an appropriate evaluation frequency
    save_total_limit=2,  # Adjust as needed
    load_best_model_at_end=True,
    metric_for_best_model="perplexity",
    greater_is_better=False,  # Lower perplexity is better

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

  item["labels"] = torch.tensor(self.labels[idx])


Step,Training Loss


  item["labels"] = torch.tensor(self.labels[idx])
  item["labels"] = torch.tensor(self.labels[idx])


Step,Training Loss


TrainOutput(global_step=30, training_loss=9.493714396158854, metrics={'train_runtime': 4095.8194, 'train_samples_per_second': 0.093, 'train_steps_per_second': 0.007, 'total_flos': 94065131520000.0, 'train_loss': 9.493714396158854, 'epoch': 4.74})

In [21]:
trainer.evaluate()

  item["labels"] = torch.tensor(self.labels[idx])


{'eval_loss': 9.17955493927002,
 'eval_runtime': 70.4454,
 'eval_samples_per_second': 0.284,
 'eval_steps_per_second': 0.043,
 'epoch': 4.74}

In [None]:
trainer.save_model('/content/drive/MyDrive/Finetuned_Model_01')

In [23]:
tokenizer.decode([1])

'"'

In [None]:
text = "The best ways to retrieve text are"
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(device)
outputs = model(**inputs)
# print(outputs)

# Access the logits from the dictionary
logits = outputs["logits"]
# print(logits)

# Apply softmax to the logits
# predictions = torch.nn.functional.softmax(logits, dim=-1)
temperature = 0.85
predictions = torch.nn.functional.softmax(logits / temperature, dim=-1)
# print(predictions)

# Convert predictions to NumPy array
predictions = predictions.cpu().detach().numpy()
print(predictions)

# Decode predictions
predicted_token_ids = torch.argmax(torch.from_numpy(predictions), dim=-1)
print(predicted_token_ids)

# Convert tensor to a Python list
predicted_token_ids = predicted_token_ids.tolist()

# Decode token IDs to words
predicted_tokens = [tokenizer.decode(ids) for ids in predicted_token_ids[0]]
print(predicted_tokens)
