In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install transformers
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [3]:
# import the dataset
import pandas as pd
import numpy as np
import os

# Directory path
MAIN_DIR = "/content/drive/MyDrive/all_lectures.csv"

# Initialize an empty DataFrame
df = pd.DataFrame(columns=['Week Number', 'Lesson Number', 'Lesson Title', 'Transcript'])

# Read in csv to dataframe
df = pd.read_csv(MAIN_DIR)

# Display the resulting DataFrame
df.head()

Unnamed: 0,Week Number,Lesson Number,Lesson Title,Transcript
0,1,1,Natural Language Content Analysis,This lecture is about Natural Language of Cont...
1,1,2,Text Access,"In this lecture,\nwe're going to talk about th..."
2,1,3,Text Retrieval Problem,This lecture is about\nthe text retrieval prob...
3,1,4,Overview of Text Retrieval Methods,This lecture is a overview of\ntext retrieval ...
4,1,5,Vector Space Model - Basic Idea,This lecture is about the\nvector space retrie...


In [4]:
# clean up words in dataset -- this includes removing stopwords
import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words, brown

nltk.download("stopwords")
nltk.download("words")
nltk.download("brown")
nltk.download("punkt")

lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# initialize dictionary
global_dictionary  = set(words.words()) | set(brown.words())
global_dictionary = {word.lower() for word in global_dictionary}
remove_words = list(stop_words) # might need to use word_tokenize
remove_words.extend(['Play', 'video', 'starting', 'at', '::', 'follow', 'transcript', 'natural', 'language', 'lecture', 'processing']) # remove the common words that are included in transcript

# Now start actually cleaning the text
def clean_text(text):
    text = text.lower() # lowercase
    text = text.replace('\n', ' ') # remove newline indicator
    text = re.sub(r'[^a-zA-Z\s]', '', text) # case
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+|www\S+', '', text) # website
    text = re.sub(r'(\b\w+\b)(?: \1)+', r'\1', text) # remove duplicate next word after space
    text = re.sub(r'\b(?![aI]\b)\w\b', '', text)

    return text

# Remove stopwords and only keep words in dictionary
def remove_terms(text):
    text = clean_text(text)
    words = text.split()
    # filtered_words = [word for word in words if word not in remove_words] # remove stopwords
    filtered_words = [word for word in words if word in global_dictionary] # remove if not in global dictionary
    return " ".join(filtered_words)

# Tokenize reviews + remove stop words + filter only nouns
def tokenize_and_filter(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words] # if word.lower() not in stop_words and len(word) > 2]
    # print(words)
    # pos_comment = nltk.pos_tag(words)
    # filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    return words #filtered

def lower_text(text):
    words = text.lower()
    return words

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
pd.options.display.max_rows = 500

df['Transcript_Cleaned'] = df['Transcript'].apply(remove_terms)
df['Transcript_Cleaned'] = df['Transcript_Cleaned'].apply(lower_text)
# Skipping this in order to tokenize later
# df['Transcript_Cleaned'] = df['Transcript_Cleaned'].map(tokenize_and_filter)
df['Transcript_Cleaned']

0     this lecture is about natural language of cont...
1     in this lecture were going to talk about the t...
2     this lecture is about the text retrieval probl...
3     this lecture is a overview of text retrieval m...
4     this lecture is about the vector space retriev...
5     in this lecture were going to talk about how t...
6     in this lecture we are going to talk about how...
7     in this lecture we continue the discussion of ...
8     this lecture is about document length normaliz...
9     this lecture is about the implementation of te...
10    this lecture is about the inverted index const...
11    this lecture is about how to do faster search ...
12    this lecture is about evaluation of text retri...
13    this lecture is about the basic measures for e...
14    this lecture is about how we can evaluate a ra...
15    so average precision is computer for just one ...
16    this lecture is about how to evaluate the text...
17    this lecture is about some practical issue

In [6]:
import random

def extract_window_labels(df, window_size, num_samples):
  """
  Extracts window/label pairs from a DataFrame with batching, skip logic, and error checks.

  Args:
    df: A DataFrame containing text sections in the "Transcript_Cleaned" column.
    window_size: The size of the window for capturing context.
    num_samples: The number of window/label pairs to generate per batch.

  Returns:
    A list of dictionaries containing window and label pairs.
  """
  new_data = []

  for i in range(len(df)):
    text = df.loc[i, "Transcript_Cleaned"]
    remaining_words = len(text.split())
    processed_words = 0

    # Process batches of 100 words
    while remaining_words >= 100:
      # Skip processing the last 50 words in subsequent batches
      if remaining_words < 100:
        break

      # Check if sufficient words remain for another batch
      if remaining_words < window_size + 1:
        break

      batch_text = text[processed_words:processed_words + 100]
      batch_words = batch_text.split()

      # Generate samples within the batch
      for _ in range(num_samples):
        # Check for valid sampling range within the batch
        max_start_idx = len(batch_words) - window_size - 1
        start_idx = random.randint(window_size, max_start_idx)
        end_idx = start_idx + window_size

        # Create window and label strings
        window_text = " ".join(batch_words[start_idx:end_idx])
        label = batch_words[end_idx]

        # Append pair to new data
        new_data.append({"window": window_text, "label": label})

      processed_words += 100
      remaining_words -= 100

  return new_data

new_df = extract_window_labels(df, 5, 15)

In [7]:
new_df = pd.DataFrame.from_dict(new_df)
new_df = new_df.drop_duplicates()
new_df

Unnamed: 0,window,label
0,see from this picture this,is
1,content analysis as you see,from
2,language of content analysis as,you
3,as you see from this,picture
6,analysis as you see from,this
...,...,...
23656,meaningful for other sequences of,data
23658,might be also meaningful for,other
23659,of data we also talked,a
23663,other sequences of data we,also


In [8]:
# now we have our dataframe, let's use it!
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import json
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score, classification_report


In [9]:
# lets save the label values as integers, and save the indexes for them as well
results = set(new_df['label'])

# save a vocabulary of labels to predict with number indexes
word_to_index = {word: idx for idx, word in enumerate(results)}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# convert labels into indices
def words_to_indices(words):
    return [word_to_index[word] for word in words]
new_df['label_indices'] = words_to_indices(new_df['label'])

# create function to reverse this, for output
def indices_to_words(indices):
    return [index_to_word[idx] for idx in indices]


In [10]:
new_df
df

Unnamed: 0,Week Number,Lesson Number,Lesson Title,Transcript,Transcript_Cleaned
0,1,1,Natural Language Content Analysis,This lecture is about Natural Language of Cont...,this lecture is about natural language of cont...
1,1,2,Text Access,"In this lecture,\nwe're going to talk about th...",in this lecture were going to talk about the t...
2,1,3,Text Retrieval Problem,This lecture is about\nthe text retrieval prob...,this lecture is about the text retrieval probl...
3,1,4,Overview of Text Retrieval Methods,This lecture is a overview of\ntext retrieval ...,this lecture is a overview of text retrieval m...
4,1,5,Vector Space Model - Basic Idea,This lecture is about the\nvector space retrie...,this lecture is about the vector space retriev...
5,1,6,Vector Space Retrieval Model - Simplest Instan...,In this lecture we're going to talk about how ...,in this lecture were going to talk about how t...
6,2,1,Vector Space Model - Improved Instantiation,"In this lecture, we are going to talk about ho...",in this lecture we are going to talk about how...
7,2,2,TF Transformation,"In this lecture, we continue\nthe discussion o...",in this lecture we continue the discussion of ...
8,2,3,Doc Length Normalization,This lecture is about Document Length Normaliz...,this lecture is about document length normaliz...
9,2,4,Implementation of TR Systems,This lecture is about the implementation\nof t...,this lecture is about the implementation of te...


In [11]:
# might just use the normal df for now

import re
import json
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.15)

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))


Train dataset length: 81
Test dataset length: 15


In [12]:
# # Display some samples
# for example in train_dataset:
#     print(len(tokenizer(example["Transcript_Cleaned"]).input_ids))


In [13]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [14]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
import datasets
from datasets import Dataset, DatasetDict

tokenizer = AutoTokenizer.from_pretrained("gpt2-large")
tokenizer.pad_token = tokenizer.eos_token
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [15]:
max_seq_length = 1024

def tokenization(example):
    return tokenizer(example["Transcript_Cleaned"], padding='max_length', truncation=True, max_length=max_seq_length, return_tensors="pt")

train_dataset = train_dataset.map(tokenization)
test_dataset = test_dataset.map(tokenization)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Map:   0%|          | 0/81 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [16]:
# !pip install --upgrade transformers

In [17]:
from transformers import Trainer, TrainingArguments, AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("gpt2-medium").to(device)

training_args = TrainingArguments(
        output_dir='/content/drive/MyDrive/fine-tuned-model',
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2, # batch size for training
        # per_device_eval_batch_size=64,  # batch size for evaluation
        # eval_steps=100, # Number of update steps between two evaluations.
        save_steps=3, # after # steps model is saved
        # warmup_steps=500,# number of warmup steps for learning rate scheduler
        gradient_accumulation_steps=4, # or any larger value
        gradient_checkpointing=True # SUPER IMPORTANT!!! MAKES IT SO DOESN'T TAKE UP TOO MUCH RAM AND ISNTEAD TRANSFERS TO DISK!!!
    )

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

trainer.train()



config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss




TrainOutput(global_step=30, training_loss=3.2870396931966144, metrics={'train_runtime': 653.9449, 'train_samples_per_second': 0.372, 'train_steps_per_second': 0.046, 'total_flos': 442061530595328.0, 'train_loss': 3.2870396931966144, 'epoch': 2.93})

In [18]:
# !nvidia-smi -L
# !kill -9 -pid_number

In [22]:
# save model to output directory from training arguments
trainer.save_model()

In [27]:
# Generate text from a prompt
prompt = "What is Latent Dirichlet Allocation"
input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)

# Generate text using the trained model
output = model.generate(input_ids, max_length=250, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [28]:
# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

What is Latent Dirichlet Allocation?

Latent is a term that describes the process of assigning a vector to a function that is not a normal vector. This is the case when we have a non-normal function and we assign a new vector that has a different value to the function.
 and this is called a latent function because it is used to assign the vector of a particular function to another vector and then we can use this vector in the next function as a reference to this new function so that we know that the reference vector is now the same as the original vector so this means that this function is also a special case of the normal function in that it has the property that if we use the new value of this variable then the result is always the value that was assigned to that function by the previous function but we do not have to use a value for the old vector because the two vectors are the only reference for this particular vector now so the difference between the latent and normal functions i

In [None]:
# Try the model again, but this time use the labels as well, so that you can have much more from your dataset