In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install transformers
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0


In [3]:
# # PyTorch clear gpu cache
# device = torch.cuda
# del device
# torch.cuda.empty_cache() # unsure if it really works

In [4]:
# import the dataset
import pandas as pd
import numpy as np
import os

# Directory path
directory_path = '/content/drive/MyDrive/all_lectures.csv'

# Initialize an empty DataFrame
df = pd.DataFrame(columns=['Week Number', 'Lesson Number', 'Lesson Title', 'Transcript'])

# Read in csv to dataframe
df = pd.read_csv(directory_path)

# Display the resulting DataFrame
df.head()

Unnamed: 0,Week Number,Lesson Number,Lesson Title,Transcript
0,1,1,Natural Language Content Analysis,This lecture is about Natural Language of Cont...
1,1,2,Text Access,"In this lecture,\r\nwe're going to talk about ..."
2,1,3,Text Retrieval Problem,This lecture is about\r\nthe text retrieval pr...
3,1,4,Overview of Text Retrieval Methods,This lecture is a overview of\r\ntext retrieva...
4,1,5,Vector Space Model - Basic Idea,This lecture is about the\r\nvector space retr...


In [5]:
# # for testing a sample dataframe
# import pandas as pd
# import numpy as np
# import os

# # Testing directory path
# directory_path = 'C:\\Users\\azaan\\OneDrive\\Documents\\GitHub\\cs410_LLM_project\\sample_data\\module_7_sample.csv'

# # Initialize an empty DataFrame
# df = pd.DataFrame(columns=['Week Number', 'Lesson Number', 'Lesson Title', 'Transcript'])

# # Read in csv to dataframe
# df = pd.read_csv(directory_path)

# # Display the resulting DataFrame
# df.head()

In [5]:
# clean up words in dataset -- this includes removing stopwords
import regex as re
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words, brown

nltk.download("stopwords")
nltk.download("words")
nltk.download("brown")
nltk.download("punkt")

lemmer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# initialize dictionary
global_dictionary  = set(words.words()) | set(brown.words())
global_dictionary = {word.lower() for word in global_dictionary}
remove_words = list(stop_words) # might need to use word_tokenize
remove_words.extend(['Play', 'video', 'starting', 'at', '::', 'follow', 'transcript', 'natural', 'language', 'lecture', 'processing']) # remove the common words that are included in transcript

# Now start actually cleaning the text
def clean_text(text):
    text = text.lower() # lowercase
    text = text.replace('\n', ' ') # remove newline indicator
    text = re.sub(r'[^a-zA-Z\s]', '', text) # case
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'http\S+|www\S+', '', text) # website
    text = re.sub(r'(\b\w+\b)(?: \1)+', r'\1', text) # remove duplicate next word after space
    text = re.sub(r'\b(?![aI]\b)\w\b', '', text)

    return text

# Remove stopwords and only keep words in dictionary
def remove_terms(text):
    text = clean_text(text)
    words = text.split()
    # filtered_words = [word for word in words if word not in remove_words] # remove stopwords
    filtered_words = [word for word in words if word in global_dictionary] # remove if not in global dictionary
    return " ".join(filtered_words)

df['Transcript_Cleaned'] = df['Transcript'].apply(remove_terms)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
df['Transcript_Cleaned'][0]

'this lecture is about natural language of content analysis as you see from this picture this is really the first step to process any text data text data are in natural languages so computers have to understand natural languages to some extent in order to make use of the data so thats the topic of this lecture were going to cover three things first what is natural language processing which is the main technique for processing natural language to obtain understanding the second is the state of the art of which stands for natural language processing finally were going to cover the relation between natural language processing and text retrieval first what is well the best way to explain it is to think about if you see a text in a foreign language that you can understand now what do you have to do in order to understand that text this is basically what computers are facing so looking at the simple sentence like a dog is chasing a boy on the playground we dont have any problems understandin

In [12]:
# # Create bigrams and trigrams from data

# # Function to filter bigrams or trigrams
# def ngram_filter(ngram):
#     tags = nltk.pos_tag(ngram)
#     if not all(tag[1] in ['JJ', 'NN'] for tag in tags):
#         return False
#     if any(word in stop_words for word in ngram):
#         return False
#     if 'n' in ngram or 't' in ngram:
#         return False
#     if 'PRON' in ngram:
#         return False
#     return True

# # Function to find top ngrams
# def find_top_ngrams(texts, ngram_measures, min_freq=50, min_pmi=5, top_k=100):
#     finder = nltk.collocations.BigramCollocationFinder.from_documents(texts)
#     finder.apply_freq_filter(min_freq)
#     ngram_scores = finder.score_ngrams(ngram_measures.pmi)
#     filtered_ngrams = [ngram for ngram, pmi in ngram_scores if ngram_filter(ngram) and pmi > min_pmi]
#     return [' '.join(ngram) for ngram in filtered_ngrams][:top_k]

# bigram_measures = nltk.collocations.BigramAssocMeasures()
# bigrams = find_top_ngrams([text.split() for text in df['Transcript_Cleaned']], bigram_measures)
# trigram_measures = nltk.collocations.TrigramAssocMeasures()
# trigrams = find_top_ngrams([text.split() for text in df['Transcript_Cleaned']], trigram_measures)

# # Function to replace ngrams in text
# def replace_ngrams(text):
#     for gram in trigrams:
#         text = text.replace(gram, '_'.join(gram.split()))
#     for gram in bigrams:
#         text = text.replace(gram, '_'.join(gram.split()))
#     return text

# # Apply ngram replacements to the text
# df['Grams'] = df['Transcript_Cleaned'].map(replace_ngrams)


In [7]:
# Tokenize reviews + remove stop words + filter only nouns
def tokenize_and_filter(text):
    words = nltk.word_tokenize(text)
    words = [word.lower() for word in words] # if word.lower() not in stop_words and len(word) > 2]
    # print(words)
    # pos_comment = nltk.pos_tag(words)
    # filtered = [word[0] for word in pos_comment if word[1] in ['NN']]
    return words #filtered

# If using transcript instead of grams
df['Transcript_Cleaned'] = df['Transcript_Cleaned'].map(tokenize_and_filter)

# If using Grams instead of transcript
# df['Grams'] = df['Grams'].map(tokenize_and_filter)

In [8]:
# df['Transcript_Cleaned'][0] #['the']
df['Transcript_Cleaned'][0]

['this',
 'lecture',
 'is',
 'about',
 'natural',
 'language',
 'of',
 'content',
 'analysis',
 'as',
 'you',
 'see',
 'from',
 'this',
 'picture',
 'this',
 'is',
 'really',
 'the',
 'first',
 'step',
 'to',
 'process',
 'any',
 'text',
 'data',
 'text',
 'data',
 'are',
 'in',
 'natural',
 'languages',
 'so',
 'computers',
 'have',
 'to',
 'understand',
 'natural',
 'languages',
 'to',
 'some',
 'extent',
 'in',
 'order',
 'to',
 'make',
 'use',
 'of',
 'the',
 'data',
 'so',
 'thats',
 'the',
 'topic',
 'of',
 'this',
 'lecture',
 'were',
 'going',
 'to',
 'cover',
 'three',
 'things',
 'first',
 'what',
 'is',
 'natural',
 'language',
 'processing',
 'which',
 'is',
 'the',
 'main',
 'technique',
 'for',
 'processing',
 'natural',
 'language',
 'to',
 'obtain',
 'understanding',
 'the',
 'second',
 'is',
 'the',
 'state',
 'of',
 'the',
 'art',
 'of',
 'which',
 'stands',
 'for',
 'natural',
 'language',
 'processing',
 'finally',
 'were',
 'going',
 'to',
 'cover',
 'the',
 'relat

In [9]:
# now I will make embeddings for my words, let's see if it works
# Replace Transcript_Cleaned with grams if using that method
import torch
import torch.nn as nn
import  torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

results = set()
df['Transcript_Cleaned'].apply(results.update)
vocab_size = len(results)

# Create a vocabulary dictionary
word_to_index = {word: idx for idx, word in enumerate(results)}

# Convert words to indices in your DataFrame
# AKA Encode these
# df['Words_indices'] = df['Transcript_Cleaned'].apply(lambda x: [word_to_index[word] for word in x])
def words_to_indices(words):
    return [word_to_index[word] for word in words]
df['Words_indices'] = df['Transcript_Cleaned'].apply(words_to_indices)

# Create a reverse dictionary
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Function to convert indices back to words
def indices_to_words(indices):
    return [index_to_word[idx] for idx in indices]

# Aka Decode this column
# df['Decoded_Words'] = df['Words_indices'].apply(indices_to_words)

# Pad sequences to a specified length (e.g., maxlen)
maxlen = 200  # You can adjust this based on your data
padded_indices = pad_sequence([torch.LongTensor(seq) for seq in df['Words_indices']], batch_first=True, padding_value=0)

In [35]:
# make a batch and set up parameters
block_size = 256
batch_size = 128
max_iters = 10000
learning_rate = 1e-4
eval_iters = 250
# new
n_embd = 128
n_layer = 6
dropout = 0.2
n_head = 6
# reduce gpu usage
accumulation_steps = 6  # Accumulate gradients over 4 batches before performing optimization step

# change to gpu
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Flatten the padded indices used to identify each word
data = flattened_indices = padded_indices.view(-1)
n = int(0.8*len(data))
train_data = data[:n]
val_data = data[n:]
# print(len(data))

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

x, y = get_batch('train')
print(x)
print(y)

# The result has many zeros, which is normal for a padded dataset

cpu
tensor([[ 351, 3038, 2255,  ...,  538, 2255, 1643],
        [   0,    0,    0,  ...,    0,    0,    0],
        [1643, 1779, 2964,  ...,  551,  351,  173],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [3741, 3349,  685,  ...,  685, 2846,  587],
        [   0,    0,    0,  ...,    0,    0,    0]])
tensor([[3038, 2255, 2365,  ..., 2255, 1643, 3587],
        [   0,    0,    0,  ...,    0,    0,    0],
        [1779, 2964,  685,  ...,  351,  173, 2255],
        ...,
        [   0,    0,    0,  ...,    0,    0,    0],
        [3349,  685, 1412,  ..., 2846,  587, 3759],
        [   0,    0,    0,  ...,    0,    0,    0]])


In [36]:
sum(data == 0) / sum(data)

tensor(0.0009)

# LLM Start Modelling

In [37]:
import torch
import torch.nn as nn
import  torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence

In [38]:
# Estimating losses function
@torch.no_grad()

def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

# This function is able to estimate the losses for the training iterations of the data
# uing model.eval in order to prevent some aspects of the model to run during that time
# It monitors the performance of loss and if it decreases per every iteration
# These losses print out when model is running

In [39]:
torch.tril(torch.ones(block_size, block_size))

tensor([[1., 0., 0.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 0., 0., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 1., 0., 0.],
        [1., 1., 1.,  ..., 1., 1., 0.],
        [1., 1., 1.,  ..., 1., 1., 1.]])

In [40]:
# Scaled dot product attention
class Head(nn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # These 3 variables, key, query, and value, are all very important in calculating attention
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        # This "buffer" is used as a mask for future tokens in attention
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        # B = batch size, T = sequence length, C = original embedding dimension
        k = self.key(x)
        q = self.query(x)
        # Even though k and q are initialized in the same way, nn.Linear randomizes the initial weights to create these tensors

        # create attention scores
        weights = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5
        # this is used to caclulate attention scores, a standard formula of getting dot product of q and k
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        # this sets attention scores for future tokens to -inf (negative infinity), so that the model only pays attention to the old scores and not the new ones
        #  it overwrites the attention scores calculated in the previous step for future tokens. In each iteration, the model is exposed to a partially
        # revealed sequence, allowing it to attend only to past tokens. This is a form of autoregressive training.
        weights = F.softmax(weights, dim=-1)
        # normalized step
        drop = self.dropout(weights)
        # step to randomly ignore random nodes in order to prevent overfitting and codependence

        # weighted aggregation of values
        v = self.value(x)
        out = drop @ v
        # the attention scores in drop are dot product with the value tesnor to get the final outputs
        return out

# Note:
# In transformers, the key and query vectors are typically designed to be similar to capture relevant information in both directions. They are used to
# calculate the attention scores, indicating how much each element in the sequence should attend to every other element. The similarity between key and
# query helps the model learn dependencies in both directions.

In [41]:
# Multi-head attention
class MultiHeadAttention(nn.Module):

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        # this is a linear layer that concats all the heads created and mushes them togetherinto shape n_embd
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        # for every head, x is passed into self.heads aka the Head class
        out = self.dropout(self.proj(out))
        return out

In [42]:
# Creating a feedforward class
class FeedForward(nn.Module):

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4*n_embd),
            nn.ReLU(),
            nn.Linear(n_embd*4, n_embd),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)

# Not super complicated, just a simple part of the transformer structure

In [43]:
# Creating a transformer block
class Block(nn.Module):

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.attention = MultiHeadAttention(n_head, head_size)
        self.feedforward = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.attention(x)
        x = self.ln1(x + y)
        y = self.feedforward(x)
        x = self.ln2(x + y)
        return x

# As we can see, the initialization of this class basically includes the previous structures we made
# So for the forward fucntion, everytime we pass our input through either multiheadattention, or feedforward
# we need to linearize it, so that iw what we do

In [44]:
# Now to make a GPT model
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        # Create a self vocab_size variable to save it into the class
        self.vocab_size = vocab_size
        # Make an embedding table
        self.token_embedding_table = nn.Embedding(self.vocab_size, n_embd).to(device)
        # Adding a positional embedding table as well
        self.positional_embedding_table = nn.Embedding(block_size, n_embd).to(device)  # added new parameter, n_embd
        # Adding 4 decoder layers
        self.blocks = nn.Sequential(*(Block(n_embd, n_head=n_head).to(device) for _ in range(n_layer)))
        # final layer normalization
        self.lm_f = nn.LayerNorm(n_embd).to(device)
        # unsure what this is below
        self.lm_head = nn.Linear(n_embd, self.vocab_size).to(device)

        # std variables to help training converge better
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
    # Linear layers are initialized with normal distribution, and embedding layers are initialized with normal distribution as well.

    def forward(self, index, targets=None):
        B, T = index.shape
        # index represents the sequence of tokens

        # Add in token and positional embeddings
        token_embd = self.token_embedding_table(index)  # (B, T, C)
        # This layer is an embedding table for token embeddings. Given an input index (representing a token), it retrieves the corresponding embedding vector from the table.
        pos_embd = self.positional_embedding_table(torch.arange(T, device=device))  # (T, C)
        # Embedding table for positional embeddings. It helps the model take into account the order or position of tokens in the sequence
        x = token_embd + pos_embd  # (B, T, C)
        x = self.blocks(x)  # (B, T, C)
        x = self.lm_f(x)  # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, index, max_new_tokens):
        # model is iteratively called to predict the next token, and the predicted token is concatenated to the input sequence
        for _ in range(max_new_tokens):
            logits, loss = self.forward(index)
            logits = logits[:, -1, :]
            # print(f"Logits shape: {logits.shape}")
            # These are the raw scores produced by the model before applying the softmax function. Each entry in the logits tensor represents
            # the model's prediction for the likelihood of a particular token in the vocabulary. The dimensions of logits are (B, T, vocab_size),
            #  where B is the batch size, T is the sequence length, and vocab_size is the size of the vocabulary.
            probabilities = F.softmax(logits, dim=-1)

            # Ensure generated index is within the vocabulary size
            valid_indices = torch.arange(self.vocab_size).to(device)
            # print('vocab size', vocab_size)
            index_next = torch.multinomial(probabilities[:, valid_indices], num_samples=1)
            index_next = valid_indices[index_next]  # Map back to the original indices

            index = torch.cat((index, index_next), dim=1)

        return index

# model = GPTLanguageModel(vocab_size)

# To explain the positional and token embeddings:
# In a Transformer, each position in the input sequence has a unique positional embedding associated with it. This positional embedding is added
# to the token embedding of the corresponding word. If you didn't have positional embeddings and used a single embedding table for both tokens and positions,
# the model might struggle to distinguish between words based on their positions in the sequence.

# Having separate tables allows the model to learn distinct embeddings for tokens and positions. The positional embeddings can then be added to the
# token embeddings during processing, ensuring that the model can effectively capture both the semantic content of words and their positions in the sequence.

# So, even if you're working with a single sequence (no batches), having separate token and positional embeddings is still beneficial for the
# Transformer model's ability to understand and leverage both semantic and positional information.

# in order to deal with a prompt, make the GPT model encounter a prompt size of around 50.


In [26]:
# # Do not run this and next 2 cells, if loading in model

# # create the model
# max_prompt_size = 50
# model = GPTLanguageModel(vocab_size + max_prompt_size)
# model = model.to(device)

In [27]:
# # Creating an Optimizer
# optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# # save the best params
# best_val_loss = float('inf')  # Initialize with a large value
# best_params = None

# for iter in range(max_iters):
#     if iter % eval_iters == 0:
#         losses = estimate_loss()
#         train_loss = losses['train']
#         val_loss = losses['val']
#         print(f"step {iter}, train loss: {train_loss}, val loss: {val_loss}")
#     xb, yb = get_batch('train')

#     if val_loss < best_val_loss:
#         best_val_loss = val_loss
#         best_params = model.state_dict()  # Save the current model parameters
#         print("Updated parameters ", best_params.keys())


#     logits, loss = model.forward(xb, yb)
#     optimizer.zero_grad(set_to_none=True)
#     # clears the gradients of all optimized parameters
#     loss.backward()
#     # implement accumulation stepse
#     if (iter + 1) % accumulation_steps == 0:
#         optimizer.step()
#         optimizer.zero_grad(set_to_none=True)

# print(loss.item())

# # What we are doing here is the same thing as training the dataset

In [28]:
# import pickle

# # Save the model into a pickle file
# with open('/content/drive/MyDrive/model-06.pkl', 'wb') as f:
#     pickle.dump(model, f)

In [29]:
# import json
# import numpy as np

# # Convert tensors to NumPy arrays in the model's state dictionary
# model_state_dict_np = {key: value.cpu().numpy().tolist() if isinstance(value, torch.Tensor) else value for key, value in model_state_dict.items()}

# # Save the model's state dictionary into a JSON file
# json_path = '/content/drive/MyDrive/model-06.json'
# with open(json_path, 'w') as json_file:
#     json.dump(model_state_dict_np, json_file)


# Run Saved Model

In [86]:
import pickle
import io

#Change the path if needed
path = '/content/drive/MyDrive/model-06.pkl'

# Load the model, if necessary
class CPU_Unpickler(pickle.Unpickler):
    def find_class(self, module, name):
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        else: return super().find_class(module, name)

with open(path, 'rb') as f:
    model = pickle.load(f)
    #Use the line below instead of the line above if an error occurs 
    #model = CPU_Unpickler(f).load()

In [48]:
# # adjust the dictionaries (ONLY RUN ONCE)
# index_to_word.update({0: ''})
# word_to_index[''] = word_to_index.pop('block')

In [43]:
# Model 3

# So what I have done is I have removed the stopwords from being in df['Transcript_Cleaned']
# Not entirely sure if this is a good idea but it is worth a shot
# I am doing this in order to generate a good response from GPT form the prompt, and to do
# that it must include words to form an actual sentence

# Model 4

# like Model 3, but I trained it on a GPU so should work much better

# Model 5

# Batch size 64, number of heads = 4

# Model 6

# different parameters
# batch_size = 128 # learning_rate = 1e-4 # n_layer = 6 # n_head = 6 # accumulation_steps = 6
# Rand this one all the way, 1000 iters

In [65]:
# Add the prompt into the dictionary used for the training dataset
# prompt = 'Can you give me an overview on Probabilistic Latent Semantic Analysis'.split()
prompt = 'Word association mining'

# Find the maximum key in the existing dictionaries
max_key = max(word_to_index.values()) if word_to_index else -1

# Enumerate through the new words and add them to the dictionaries
for word in prompt:
    if word not in word_to_index:
        max_key += 1
        word_to_index[word] = max_key
        index_to_word[max_key] = word

In [66]:
# Create result from prompt, as a chatbot would
context = torch.tensor(words_to_indices(prompt), dtype=torch.long, device=device)
# context = torch.zeros((1, 1), dtype=torch.long)
generated_terms = indices_to_words(model.generate(context.unsqueeze(0), max_new_tokens=50)[0].tolist())
# make sure max_new_tokens is less than block_size

In [67]:
print(' '.join(generated_terms[len(prompt):]))

byproduct resort regime surprising himself problems disk nouns problems task grow nouns observable initiating mixture losing associated uncertain whereas management customers calculating advertisement highlight plug presentation observable agencies properly lose seek information cement resolve numerator neutral equation nouns losing typed december among problems alternate study campaigns passing continuing plug comfortable


In [52]:
# debugging issues with sizes and lengths
# Check vocabulary size
print("Actual Vocabulary Size:", len(word_to_index))
print("Model Vocabulary Size:", vocab_size)

# Check embedding dimension
print("Embedding Dimension:", n_embd)

# Check index values
print("Max Index in padded_indices:", torch.max(padded_indices).item())
print("Min Index in padded_indices:", torch.min(padded_indices).item())

print("Context vector", context)
print("Model", model.generate(context.unsqueeze(0), max_new_tokens=50)[0].tolist())

# Check indices added from prompt
prompt_indices = words_to_indices(prompt)
print("Prompt Indices:", prompt_indices)

Actual Vocabulary Size: 4023
Model Vocabulary Size: 4011
Embedding Dimension: 128
Max Index in padded_indices: 4010
Min Index in padded_indices: 0
Context vector tensor([4011, 4012, 4013, 4014, 4015, 1643, 4016, 4016, 4012, 4017, 4018, 1643,
        4019, 4018, 4012, 4020, 4015, 4021, 4018, 4020, 4018, 4020, 4022])
Model [4011, 4012, 4013, 4014, 4015, 1643, 4016, 4016, 4012, 4017, 4018, 1643, 4019, 4018, 4012, 4020, 4015, 4021, 4018, 4020, 4018, 4020, 4022, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Prompt Indices: [4011, 4012, 4013, 4014, 4015, 1643, 4016, 4016, 4012, 4017, 4018, 1643, 4019, 4018, 4012, 4020, 4015, 4021, 4018, 4020, 4018, 4020, 4022]


In [52]:
### Notes on model
# model 1 is on just the grams
# model 2 is on the full transcript training
# model 3, I stopped removing stopwords, in order to generate a better response from the context vector

# model 1 prompt: Can you give me an overview on Probabilistic Latent Semantic Analysis
# result: word_distribution promise profile allocation justification minimize edge briefly mix environment sky
# interest root research present tilde york engine domain light popularity likelihood bye bridge summary understood
# separating pick quantitate sub polarity encounter observation right setting future order Can distinguish accurate
# stick aggregate doesnt apply party photo message scientist transpose categorize

# model 2 prompt: Can you give me an overview on Probabilistic Latent Semantic Analysis
# result: assign goal clutch discovering simplification meal bomb discovery generality recalibration confidence front
# request implement anticipate percent suppose perspective choice attribute state development sit uncertainty choose
# viewer play regime baring doubt hash table tolerance exploitation bit guess causal moreover letter area web profile
# rating algorithm incomplete incomplete well imbalance event motion

# Fine Tuning

In [32]:
# import torch
# from transformers import GPTNeoForCausalLM, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

# # Check if CUDA (GPU) is available
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.cuda.set_per_process_memory_fraction(0.85)  # or any smaller fraction

# model_name = 'EleutherAI/gpt-neo-1.3B'
# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# model = GPTNeoForCausalLM.from_pretrained(model_name).to(device)  # Move the model to CUDA if available

# train_dataset = TextDataset(
#     tokenizer=tokenizer,
#     file_path='/content/drive/MyDrive/all_lectures.csv',  # Replace with your fine-tuning dataset
#     block_size=64,
# )

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer,
#     mlm=False
# )

# trainer = Trainer(
#     model=model,
#     args=TrainingArguments(
#         output_dir='/content/drive/MyDrive/fine-tuned-model',
#         overwrite_output_dir=True,
#         num_train_epochs=1,
#         per_device_train_batch_size=2,  # Adjust batch size based on GPU memory
#         save_steps=10,  # Adjust the frequency of saving checkpoints
#         gradient_accumulation_steps=8  # or any larger value
#     ),
#     data_collator=data_collator,
#     train_dataset=train_dataset
# )

# trainer.train()


# Flask Frontend

In [72]:
def answer(question):
    prompt = question

    max_key = max(word_to_index.values()) if word_to_index else -1

    for word in prompt:
        if word not in word_to_index:
            max_key += 1
            word_to_index[word] = max_key
            index_to_word[max_key] = word

    context = torch.tensor(words_to_indices(prompt), dtype=torch.long, device=device)
    generated_terms = indices_to_words(model.generate(context.unsqueeze(0), max_new_tokens=50)[0].tolist())

    return ' '.join(generated_terms[len(prompt):])

In [69]:
pip install Flask

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [81]:
from flask import Flask, render_template, request

app = Flask(__name__)

#This code creates the initial website
@app.route("/")
def html():
    return render_template("index.html", question = "Hello, welcome to our LLM! Ask a question about course material above.", answer="")

#This code will update the website when the user submits a question
@app.route("/update", methods=["GET","POST"])
def update():
    question = request.form['input']
    response = answer(question)
    return render_template("index.html", question='Question: '+question, answer='Answer: '+response)

app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [14/Dec/2023 13:41:37] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [14/Dec/2023 13:41:48] "POST /update HTTP/1.1" 200 -
127.0.0.1 - - [14/Dec/2023 13:42:02] "POST /update HTTP/1.1" 200 -
