In [18]:
from pycozo import Client

# Open the SQLite-backed Cozo database stored under the app's assets directory.
# Forward slashes are used for the whole relative path so it resolves on every
# OS; the previous mix of "/" and "\" separators only worked on Windows.
cozo_client = Client("sqlite", "../src/lecture_search/app/assets/database.db")

In [19]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

# Bi-encoder whose `encode` output is used as the query vector for the passage search.
bi_encoder = SentenceTransformer("multi-qa-MiniLM-L6-cos-v1")

In [57]:
# Embed a sample query; used below to try out `passages_hnsw`.
query_embedding = bi_encoder.encode("definition of ASF")

def passages_hnsw(query_embedding, course):
    """Retrieve the passages nearest to a query vector, limited to one course.

    Runs a vector search on the ``passages:semantic`` index (``k: 10``,
    ``ef: 2000``, both fixed in the script), joins the hits with
    ``reference_assets`` and ``courses_files`` and keeps only the rows whose
    course equals ``course``. Rows come back ordered by ``dist``.

    For each hit, the sentences of ``slide_sentences`` with
    ``start_sentence_id <= sentence_id <= end_sentence_id`` on the same
    path/slide are fetched with a separate query and joined with single spaces.

    Args:
        query_embedding: Query vector; must provide ``.tolist()``.
        course: Course name bound to ``$course`` in the script.

    Returns:
        The search result frame with an extra ``sentence`` column holding the
        reconstructed passage text of every row.
    """
    search_script = """
        ?[dist, path, referred_to, referred_type, course,slide_id, start_sentence_id, end_sentence_id] := ~passages:semantic{ path, slide_id, start_sentence_id, end_sentence_id, embedding |
            query: q,
            k: 10,
            ef: 2000,
            bind_distance: dist,
        }, q = vec($query),  *reference_assets[reference, referred_to, asset_type], reference=path, *courses_files[referred_to, referred_type, lecture, course], course=$course
        :order dist
    """
    sentences_script = """
            ?[path, slide_id, sentence_id, sentence] := *slide_sentences[path, slide_id, sentence_id, sentence], path=$path, slide_id=$slide_id, sentence_id >= $start_sentence_id, sentence_id <= $end_sentence_id
        """

    def _passage_text(hit):
        # One follow-up query per hit: pull the sentences that make up the passage.
        sentence_rows = cozo_client.run(
            sentences_script,
            {
                "path": hit.path,
                "slide_id": hit.slide_id,
                "start_sentence_id": hit.start_sentence_id,
                "end_sentence_id": hit.end_sentence_id,
            },
        )
        return " ".join(sentence_rows.sentence.tolist())

    hits = cozo_client.run(
        search_script,
        {"query": query_embedding.tolist(), "course": course},
    )

    # Create the column first so it exists even when the search returns nothing.
    hits["sentence"] = [None] * len(hits)
    for hit in hits.itertuples():
        hits.at[hit.Index, "sentence"] = _passage_text(hit)

    return hits

# Try the retrieval on the sample query, restricted to the "Decision Analysis" course.
results = passages_hnsw(query_embedding, "Decision Analysis")

In [42]:
# Cross-encoder whose `predict` scores [query, passage] pairs in `rerank_passages`.
cross_encoder = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

In [58]:
def rerank_passages(query, passages, k=3):
    """Score every passage against ``query`` with the cross-encoder.

    A float ``score`` column is added to (or overwritten on) ``passages`` in
    place. Scores are assigned positionally, so the frame may carry any index
    (for example after filtering) without rows being added or mis-aligned.

    Args:
        query: The search query text.
        passages: DataFrame with a ``sentence`` column holding passage text.
        k: Unused. Kept only so existing calls that pass it keep working.

    Returns:
        None. The result is the new ``score`` column on ``passages``.
    """
    # Pair the query with each passage: the cross-encoder scores [query, passage] pairs.
    model_inputs = [[query, passage] for passage in passages["sentence"].tolist()]

    # Skip the model call for an empty frame, but still create the column so
    # that callers can sort or select on "score" unconditionally.
    scores = cross_encoder.predict(model_inputs) if model_inputs else []

    # Whole-column assignment is positional, unlike `.at[i, ...]`, which treats
    # `i` as an index label and would append rows on a non-default index.
    passages["score"] = [float(score) for score in scores]
    passages["score"] = passages["score"].astype(float)

# Adds a "score" column to `results` in place; the call itself returns nothing.
rerank_passages("definition of ASF", results)

In [60]:
# Inspect the retrieved passages together with their cross-encoder scores.
results

Unnamed: 0,dist,path,referred_to,referred_type,course,slide_id,start_sentence_id,end_sentence_id,sentence,score
0,0.532097,processed/da-lec11-notes.json,courses/Decision Analysis/Classical Method for...,kadzinski_notes_pdf,Decision Analysis,33,6,7,If the ASF optimizes the distances between obj...,2.91829
1,0.553551,processed/da-lec12.json,courses/Decision Analysis/Introduction to Evol...,pdf,Decision Analysis,33,0,0,Achievement Scalarizing Function (ASF) - A Pri...,4.877914
2,0.572968,processed/da-lec11.json,courses/Decision Analysis/Classical Method for...,pdf,Decision Analysis,32,0,0,Achievement Scalarizing Function (ASF) - Examp...,1.555527
3,0.578074,processed/da-lec11.json,courses/Decision Analysis/Classical Method for...,pdf,Decision Analysis,31,0,0,Achievement Scalarizing Function (ASF) f1(x) ...,2.954319
4,0.667088,processed/da-lec12.json,courses/Decision Analysis/Introduction to Evol...,pdf,Decision Analysis,34,0,0,Achievement Scalarizing Function (ASF) - A Pos...,4.144726
5,0.676813,processed/da-lec8.json,courses/Decision Analysis/Game Theory - Conges...,pdf,Decision Analysis,22,3,6,And as Player 2? Exercise: Describe this game ...,-10.385733
6,0.693501,processed/da-lec9-notes.json,courses/Decision Analysis/Introduction to Soci...,kadzinski_notes_pdf,Decision Analysis,0,1,3,It is a theoretical framework for analyzing in...,-6.244051
7,0.721298,processed/da-lec8.json,courses/Decision Analysis/Game Theory - Conges...,pdf,Decision Analysis,35,3,5,"And as Player 2? 2 (0,15) 1 (35,10) 2 (25,...",-10.251704


In [67]:
def find_most_relevant_passages(query, course, k=3):
    """Return the ``k`` best-scoring passages of ``course`` for ``query``.

    The query is embedded with the bi-encoder, candidate passages are fetched
    with ``passages_hnsw``, and ``rerank_passages`` adds a ``score`` column to
    them in place. The ``k`` rows with the largest ``score`` are returned.

    Args:
        query: Question or search text.
        course: Course name used to restrict the candidates.
        k: Number of passages to keep.

    Returns:
        The ``k`` highest-scoring candidate rows, best first.
    """
    candidates = passages_hnsw(bi_encoder.encode(query), course)
    # Mutates `candidates`: the "score" column is written in place.
    rerank_passages(query, candidates)
    top_passages = candidates.nlargest(n=k, columns="score")
    return top_passages

# Example: the three best passages (default k) for a natural-language question.
find_most_relevant_passages("What is the definition of ASF?", "Decision Analysis")

Unnamed: 0,dist,path,referred_to,referred_type,course,slide_id,start_sentence_id,end_sentence_id,sentence,score
1,0.560181,processed/da-lec12.json,courses/Decision Analysis/Introduction to Evol...,pdf,Decision Analysis,33,0,0,Achievement Scalarizing Function (ASF) - A Pri...,4.90421
5,0.673689,processed/da-lec12.json,courses/Decision Analysis/Introduction to Evol...,pdf,Decision Analysis,34,0,0,Achievement Scalarizing Function (ASF) - A Pos...,3.644409
0,0.531377,processed/da-lec11-notes.json,courses/Decision Analysis/Classical Method for...,kadzinski_notes_pdf,Decision Analysis,33,6,7,If the ASF optimizes the distances between obj...,3.119122


In [69]:
from pathlib import Path

In [83]:
import pandas as pd

# Table of phrases and their definitions; despite the name this is a DataFrame,
# not a dict. The "phrase" and "definition" columns are read later in the notebook.
definition_dict = pd.read_csv('definitions3.csv')

# Every phrase in the table, used to find cross-references between notes.
all_phrases = definition_dict['phrase'].tolist()

In [89]:
# Quick look at the loaded phrases (prints the whole list).
print(all_phrases)

['ASF', 'Alabama paradox', "Arrow 's theorem", 'Baldwin method', 'Banzhaf power index', 'Borda count', 'Borda rule', 'Borda score', 'Braess paradox', 'Bucklin voting', 'Cartesian product', 'Chebyshev function', 'Condorcet cycle', 'Condorcet extension', 'Condorcet extensions', 'Condorcet method', 'Condorcet paradox', 'Condorcet principle', 'Condorcet winner', 'Coombs method', 'Copeland rule', "D'Hondt method", "D'Hondt system.", 'ECM', 'Euclidean distance', 'FPTP', 'FPTP system', "Fishburn 's classification", 'Gibbard-Satterthwaite theorem', 'Hamilton method', 'Hit-And-Run', 'IRV', 'Kemeny ranking', 'Kemeny rule', "Klee 's measure", 'MOEA/D', 'Manhattan distance', 'Markov Chain', 'Modern portfolio theory', 'Monte Carlo', 'Monte Carlo simulations', 'N-point crossover', 'NSGA-II', 'Nash equilibrium', 'Parent selection', 'Paretian', 'Pareto front', 'Pareto frontier', 'Pareto optimal', 'Pareto optimality', 'Pareto principle', 'Pareto-front', 'Plurality with run-off', 'Proportional represent

In [101]:
def find_other_phrases(text, all_phrases, phrase):
    """Collect the known phrases, other than ``phrase``, that occur in ``text``.

    Matching is a plain, case-sensitive substring test.

    Args:
        text: Text to scan. Anything that is not a string (for example the NaN
            pandas yields for an empty CSV cell) is treated as having no matches.
        all_phrases: Iterable of candidate phrases. Non-string and empty
            entries are ignored; an empty string would otherwise match every text.
        phrase: The phrase the text belongs to; never included in the result.

    Returns:
        A set of the matching phrases (empty when nothing matches).
    """
    if not isinstance(text, str):
        return set()
    return {
        candidate
        for candidate in all_phrases
        if isinstance(candidate, str)
        and candidate
        and candidate != phrase
        and candidate in text
    }

In [80]:
def get_references_for_questions(questions, course):
    """Map each question to formatted references for its most relevant passages.

    For every question the passages returned by ``find_most_relevant_passages``
    are turned into strings of the form::

        ![[<file name>#page=<slide_id + 1>]]
        <passage text>

    where the file name is the last component of the row's ``referred_to`` path.

    Args:
        questions: Iterable of question strings.
        course: Course name passed through to the passage search.

    Returns:
        dict of question -> list of reference strings, in retrieval order.
    """

    def _format_reference(hit):
        pdf_name = Path(hit.referred_to).name
        # slide_id is shifted by one to obtain the page number used in the embed.
        page_number = hit.slide_id + 1
        return f"![[{pdf_name}#page={page_number}]]\n{hit.sentence}"

    return {
        question: [
            _format_reference(hit)
            for _, hit in find_most_relevant_passages(question, course).iterrows()
        ]
        for question in questions
    }


    
# Example: references for two sample questions from the "Decision Analysis" course.
get_references_for_questions(["What is the definition of ASF?", "What is the definition of a decision tree?"], "Decision Analysis")

{'What is the definition of ASF?': ['![[da-lec12.pdf#page=34]]\nAchievement Scalarizing Function (ASF) - A Priori §\u202f zref indicates the aspiration levels  (desired values that the DM would like to  have) for all objectives  §\u202f Solution which has the least weighted  Chebyshev distance is selected  §\u202f Objective weights determine a direction   of the isoquant  §\u202f Solutions with equal distances from zref  are situated on the same isoquant  Minimize the weighted Chebyshev distance (i.e., maximal        weighted distance on any objective) from the reference point zref:  Minimize  max i=1,…,M wi·|fi(x) - zi ref|  subject to x∈S  f2(x)  f1(x)  a b  f  c  d  g  e  h  zref  isoquant of ASF  (w1=2/3 and w2=1/3)  Artiﬁcial Intellience Decision Analysis',
  '![[da-lec11-notes.pdf#page=34]]\nIf the ASF optimizes the distances between objective values of solutions and the  reference point, then such distances should also be considered in the augmentation term that sums them up ove

In [77]:
# Folder the generated notes are written to.
# NOTE(review): machine-specific absolute path; anyone else running the
# notebook has to change it — consider a path relative to the project.
vault_path = r"C:\Users\kryst\Documents\Artificial Intelligence\Artificial Intelligence - sem6\nlp\lecture_search\automatic_notes"

In [103]:
def note_generator(phrase, definition, question_references):
    """Build the Markdown body of the note for one phrase.

    The note has three sections: ``## Definition``, ``## Questions`` (one
    ``###`` heading per question, followed by its references) and
    ``## See also`` (one ``[[wikilink]]`` per other known phrase that appears
    in the definition).

    Args:
        phrase: The phrase the note is about; excluded from "See also".
        definition: Definition text placed under "## Definition".
        question_references: dict of question -> list of reference strings.

    Returns:
        The note as a single string.
    """
    lines = ["## Definition", f"{definition}", "", "## Questions"]

    for question, references in question_references.items():
        lines.append(f"### {question}")
        # Each reference keeps the single leading space of the original format.
        lines.extend(f" {reference}" for reference in references)

    # `find_other_phrases` returns a set; sort it so the links come out in the
    # same order on every run instead of depending on set iteration order.
    adjacent_phrases = sorted(find_other_phrases(definition, all_phrases, phrase))
    lines.append("## See also")
    lines.extend(f"[[{adj_phrase}]]" for adj_phrase in adjacent_phrases)

    return "\n".join(lines)


In [None]:
# Generate one Markdown note per row of `definition_dict`
# (columns are phrase, prompt, definition) and write it into `vault_path`.
import os
import tqdm
import time
# NOTE(review): the purpose of this fixed 10 s delay is not evident from the
# notebook — confirm it is still needed.
time.sleep(10)
# `total` lets tqdm show a real progress bar; iterrows() has no length of its own.
for index, row in tqdm.tqdm(definition_dict.iterrows(), total=len(definition_dict)):

    phrase = row['phrase']
    # Replace every non-alphanumeric character with a space so the phrase can
    # be used as a file name (e.g. "MOEA/D" -> "MOEA D").
    clean_phrase = ''.join(e if e.isalnum() else " " for e in phrase)

    questions = [
        f"What is the definition of {phrase}?",
        f"How to calculate {phrase}?"
    ]
    question_references = get_references_for_questions(questions, "Decision Analysis")

    definition = row['definition']
    # name the file with the phrase
    file_name = clean_phrase + '.md'

    # Write explicitly as UTF-8: the excerpts contain characters such as
    # "\u202f" and "∈" that the Windows default encoding cannot represent.
    # The `with` block closes the file, so no explicit close() is needed.
    with open(os.path.join(vault_path, file_name), 'w', encoding='utf-8') as f:
        f.write(note_generator(phrase,definition,question_references))

print('done')

In [76]:
# Copy every PDF found under the "Decision Analysis" course assets into the
# automatic_notes folder (presumably so the ![[...pdf#page=N]] embeds in the
# generated notes can resolve — confirm).
# NOTE(review): both paths are machine-specific absolute paths.
import shutil, os
source = Path(r"C:\Users\kryst\Documents\Artificial Intelligence\Artificial Intelligence - sem6\nlp\lecture_search\src\lecture_search\app\assets\courses\Decision Analysis")
destination = Path(r"C:\Users\kryst\Documents\Artificial Intelligence\Artificial Intelligence - sem6\nlp\lecture_search\automatic_notes")

# Make sure the target folder exists. If it did not, shutil.copy would treat
# `destination` as a file name and write every PDF over the same single file.
destination.mkdir(parents=True, exist_ok=True)

# search for files recursively
for root, dirs, files in os.walk(source):
    for file in files:
        if file.endswith(".pdf"):
            shutil.copy(os.path.join(root, file), destination)