In [None]:
from gensim import corpora
from gensim.models import LdaModel
import json
from data_analysis.OpeningLDA import OpeningLDA
from classes.QueryHandler import QueryHandler

## LDA

In [None]:
#opening_lda = OpeningLDA(num_passes=5, print_debug=True) 

#print(len(opening_lda.texts.keys()))
#opening_lda.lda_model.print_topics(num_words=15)

#opening_lda.save('lda')


In [None]:
opening_lda = OpeningLDA.load("lda")

In [None]:
opening_lda.__construct_topic_doc_matrix__()

In [None]:
query_handler = QueryHandler()
query_handler.opening_lda = opening_lda

sample_query1 = { 
                 "message": "capture the center, develop light square bishop",
                 "color": "white" 
}
query_handler.handle_user_query(sample_query1, debug=True)

## GPT

In [8]:
from data_analysis.OpeningGPT import OpeningGPT
from scraper.Paths import Paths

opening_gpt = OpeningGPT()


In [9]:
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from math import floor, ceil
from scraper.Paths import Paths 

model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)


In [10]:

# Hyperparams 

# CHUNK_OFFSET := the offset from the end of the chunk to prevent indexing errors. can be set lower 
#                 (i.e greater abs value, closer to neg infinity) but cannot be set higher than -10
#                 or will cause indexing errors in the GPT model 
CHUNK_OFFSET:int = -10 

# CHUNK_SIZE := the maximum number of tokens the model can take at a time. defined by the model, 
#               not arbitrarily
CHUNK_SIZE:int = 1024

# MAX_SUM_SIZE, MIN_SUM_SIZE := the max/min length of a summary generated by the model (when given 
#                               a chunk of approx CHUNK_SIZE words)
MAX_SUM_SIZE:int = 100
MIN_SUM_SIZE:int = 50

# LENGTH_PEN := to balance between generating longer or shorter sequences; used to prevent the 
#               model from favoring excessively short or long outputs, and is incorporated into 
#               the generation process as a factor that scales the log probabilities of the 
#               generated tokens based on the length of the sequence/input.
LENGTH_PEN:float = 2.0 

# NUM_BEAMS := determines the number of beams (partial hypotheses) to consider during the 
#              generation process; larger values increase the diversity of hypotheses explored
#              during decoding, potentially leading to better results at the  cost of increased 
#              computational complexity.
NUM_BEAMS:int = 4

print_debug:bool = True
input_str = open(f"{Paths.RAW_DESC_BASE}/E92/wikipedia.txt").read().replace("\n", "")
input_toks = tokenizer.tokenize(input_str)
tok_chars = sum([len(t) for t in input_toks])

print(f"len(test_input) = {len(input_str)}")
print(f"tok_chars = {tok_chars}")

summaries:list[str] = []    # List of all sub summaries generated from the chunks of text
toks_processed:int = 0      # Counter of the number of tokens from the input text that have been processed
chars_processed:int = 0     # Counter of the number of characters from the input text that have been processed
c = 0                       # Counter for debugging prints

while chars_processed < len(input_str):
    if print_debug: print(f"\n[+] Iteration #{c}\n\tchars_processed = {chars_processed}\n\ttoks_processed = {toks_processed}")
    
    # Calculate how many characters we can take from the text (starting at input_str[chars_processed]) to get at most
    # [CHUNK_SIZE] tokens
    num_toks:int = len(tokenizer.tokenize(input_str[chars_processed : chars_processed + CHUNK_SIZE]))
    n:int = sum([len(t) for t in input_toks[toks_processed : toks_processed + CHUNK_SIZE]])
    num_chars = n - ceil(n/num_toks)*2
    
    # If num_chars is 0, then we've reached the end of the input, so we can break the loop
    if num_chars == 0: break
    
    # Ensure the chunk ends at a complete token; take less characters if necessary, never more
    while chars_processed + num_chars < len(input_str) and input_str[chars_processed + num_chars] != '.':
        num_chars -= 1
    
    # Get the chunk from the input string, from the range (chars_processed, chars_processed + num_chars] 
    chunk = input_str[chars_processed : chars_processed + num_chars]
    
    # Generate the summary and append it to the list of summaries
    summary = summarizer(chunk[:CHUNK_OFFSET], max_length=MAX_SUM_SIZE, min_length=MIN_SUM_SIZE, length_penalty=LENGTH_PEN, num_beams=NUM_BEAMS, early_stopping=True)[0]['summary_text']
    summaries.append(summary)
    
    if print_debug:
        print(f"\tn = {n}")
        print(f"\tnum_chars = {num_chars}")
        print(f"\tlen(tokenized chunk) = {len(tokenizer.tokenize(chunk))} | {sum([len(t) for t in tokenizer.tokenize(chunk)])}")
        #print(f"\tCHUNK:\n\t{chunk}")
        print(summary)
        print(f"\tsummary len = {len(tokenizer.tokenize(summary))}")
    
    # Increment the variables before continuing to the next iteration 
    chars_processed += num_chars
    toks_processed += len(tokenizer.tokenize(chunk))
    c += 1


Token indices sequence length is longer than the specified maximum sequence length for this model (4414 > 1024). Running this sequence through the model will result in indexing errors


len(test_input) = 13266
input_toks (4414): 
['Che', 'ss', 'Ġopening', 'Che', 'ss', 'Ġopening', 'N', 'im', 'z', 'ow', 'itsch', '-', 'L', 'ars', 'en', 'ĠAttack', 'abc', 'def', 'gh', '88', '77', '66', '55', '44', '33', '22', '11', 'abc', 'def', 'gh', 'M', 'oves', '1', '.', 'b', '3', 'EC', 'OA', '01', 'âĢĵ', 'A', '06', 'N', 'amed', 'Ġafter', 'A', 'ron', 'ĠNim', 'z', 'ow', 'itsch', ',', 'ĠBent', 'ĠLars', 'en', 'Syn', 'onym', '(', 's', ')', 'N', 'im', 'zo', '-', 'L', 'ars', 'en', 'ĠAttack', 'L', 'ars', 'en', "'s", 'ĠOpening', 'Queen', "'s", 'ĠF', 'ian', 'che', 'tto', 'ĠOpening', 'Baby', 'ĠOr', 'ang', 'utan', '[', '1', ']', 'Ġ(', 'c', 'ollo', 'q', '.)', 'The', 'ĠNim', 'z', 'ow', 'itsch', '-', 'L', 'ars', 'en', 'ĠAttack', 'Ġ(', 'also', 'Ġknown', 'Ġas', 'Ġthe', 'ĠNim', 'zo', '-', 'L', 'ars', 'en', 'ĠAttack', ',', 'ĠLars', 'en', "'s", 'ĠOpening', 'Ġand', 'ĠQueen', "'s", 'ĠF', 'ian', 'che', 'tto', 'ĠOpening', ')', 'Ġis', 'Ġa', 'Ġchess', 'Ġopening', 'Ġtypically', 'Ġstarting', 'Ġwith', 'Ġthe', 'Ġmo

IndexError: index out of range in self