In [391]:
# parse through contents of assets/courses
from pathlib import Path
path = Path("./assets/courses")

In [1]:
from pycozo import Client
import pycozo

client = Client("sqlite",path="./assets/database.db")

In [393]:
client.run("""
    ?[] <- [[1,3]]
""")

Unnamed: 0,_0,_1
0,1,3


In [394]:
from enum import Enum

class AssetType(str, Enum):
    VIDEO = "video"
    TEXT = "text"
    PDF = "pdf"
    KADZINSKI_NOTES_PDF = "kadzinski_notes_pdf"

In [395]:
AssetType.VIDEO

<AssetType.VIDEO: 'video'>

In [None]:
files_relation = {
    "courses_files" : {
        "headers": ["path", "type", "lecture", "course"],
        "rows": []
    }
}

for course in path.iterdir():
    if not course.is_dir():
        continue
    for lecture in course.iterdir():
        if not lecture.is_dir():
            continue
        for asset in lecture.iterdir():
            if not asset.is_file():
                continue
            asset_type = None
            if asset.suffix == ".mp4":
                asset_type = AssetType.VIDEO
            elif asset.suffix == ".txt":
                asset_type = AssetType.TEXT
            elif asset.suffix == ".pdf":
                if asset.stem.endswith("-notes"):
                    asset_type = AssetType.KADZINSKI_NOTES_PDF
                else:
                    asset_type = AssetType.PDF
            if asset_type is not None:
                files_relation["courses_files"]["rows"].append([str(asset.relative_to("assets").as_posix()), asset_type.value, lecture.name, course.name])
            else:
                print("Unknown asset type: ", asset)

files_relation

In [398]:
client.run("::remove courses_files")
def create_course_files_relation(client):
    return client.run("""
    :create courses_files {path => type, lecture, course}
    """)
create_course_files_relation(client)

Unnamed: 0,status
0,OK


In [399]:
client.import_relations(files_relation)

In [400]:
client.run("::remove reference_assets")
def create_reference_assets_relation(client):
    return client.run("""
    :create reference_assets {reference, referred_to, type}
""")
create_reference_assets_relation(client)

Unnamed: 0,status
0,OK


In [401]:
kadzinski_notes = client.run("""
    ?[path, type, lecture, course] := *courses_files[path, type, lecture, course], type == "kadzinski_notes_pdf"
    
""")
kadzinski_notes

Unnamed: 0,path,type,lecture,course
0,courses/Decision Analysis/Classical Method for...,kadzinski_notes_pdf,Classical Method for Multiple Objective Optimi...,Decision Analysis
1,courses/Decision Analysis/Game Theory - Conges...,kadzinski_notes_pdf,Game Theory - Congestion and Extensive Games,Decision Analysis
2,courses/Decision Analysis/Game Theory - Soluti...,kadzinski_notes_pdf,Game Theory - Solution Concepts in Strategic G...,Decision Analysis
3,courses/Decision Analysis/Introduction to Evol...,kadzinski_notes_pdf,Introduction to Evolutionary Multiple Objectiv...,Decision Analysis
4,courses/Decision Analysis/Introduction to Soci...,kadzinski_notes_pdf,Introduction to Social Choice Theory - Group D...,Decision Analysis
5,courses/Decision Analysis/Voting Rules - Chara...,kadzinski_notes_pdf,"Voting Rules - Characterization Results, Strat...",Decision Analysis


In [402]:
def assign_slides_to_kadzinski_notes(client, slides_path:Path, notes_path:Path):
    return client.run("""
            ?[reference, referred_to, type] <- [[$reference, $referred_to, $type]]
            :put reference_assets {reference, referred_to, type}
        """, {"reference": slides_path.as_posix(), "referred_to": notes_path.as_posix(), "type": "slides"})

In [403]:
import fitz
import re



In [404]:
import json
def write_array_to_file(path, array):
    with open(path, "w") as f:
        json.dump(array, f)


In [405]:
def read_pdf_kadzinski_notes(kadzinski_notes_path):
    kadzinski_notes_pattern = re.compile(r"\[[0-9]+\]")
    doc = fitz.open("./assets/"+kadzinski_notes_path)
    notes = []
    for page in doc:
        page_text = page.get_text()
        for match in kadzinski_notes_pattern.finditer(page_text):
                notes.append(page_text[match.end() :])
    return notes
notes = read_pdf_kadzinski_notes(kadzinski_notes.path[0])

In [406]:
def assign_text_to_pdf(client, slides_path:Path, text_path:Path):
    return client.run("""
            ?[reference, referred_to, type] <- [[$reference, $referred_to, $type]]
            :put reference_assets {reference, referred_to, type}
        """, {"reference": text_path.as_posix(), "referred_to": slides_path.as_posix(), "type": "text"})

In [407]:
assets_path = Path("./assets")
for row in kadzinski_notes.itertuples():

    
    slides_path = Path(row.path.replace("-notes",""))

    if (assets_path / slides_path).exists():
        try:
            assign_slides_to_kadzinski_notes(client, slides_path, Path(row.path))
        except pycozo.client.QueryException as e:
            print("Error while assigning slides for slide_notes:", e)
    
    notes_path = Path("processed") / Path(row.path).with_suffix(".json").name

    if not (assets_path / notes_path).exists():
        notes = read_pdf_kadzinski_notes(Path(row.path))
        # write notes as csv
        write_array_to_file(notes_path, notes)

    try:
        assign_text_to_pdf(client, Path(row.path), notes_path)
    except pycozo.client.QueryException as e:
        print("Error while assigning text for slide_notes:", e)


In [408]:
client.run("""
    ?[reference, referred_to, type] := *reference_assets[reference, referred_to, type] """)

Unnamed: 0,reference,referred_to,type
0,courses/Decision Analysis/Classical Method for...,courses/Decision Analysis/Classical Method for...,slides
1,courses/Decision Analysis/Game Theory - Conges...,courses/Decision Analysis/Game Theory - Conges...,slides
2,courses/Decision Analysis/Game Theory - Soluti...,courses/Decision Analysis/Game Theory - Soluti...,slides
3,courses/Decision Analysis/Introduction to Evol...,courses/Decision Analysis/Introduction to Evol...,slides
4,courses/Decision Analysis/Introduction to Soci...,courses/Decision Analysis/Introduction to Soci...,slides
5,courses/Decision Analysis/Voting Rules - Chara...,courses/Decision Analysis/Voting Rules - Chara...,slides
6,processed/da-lec10-notes.json,courses/Decision Analysis/Voting Rules - Chara...,text
7,processed/da-lec11-notes.json,courses/Decision Analysis/Classical Method for...,text
8,processed/da-lec12-notes.json,courses/Decision Analysis/Introduction to Evol...,text
9,processed/da-lec7-notes.json,courses/Decision Analysis/Game Theory - Soluti...,text


In [409]:
pdfs = client.run("""
    ?[path, type, lecture, course] := *courses_files[path, type, lecture, course], type == "pdf"
""")
pdfs

Unnamed: 0,path,type,lecture,course
0,courses/Big Data and Distributed Processing/Bi...,pdf,Big Data and noSQL models,Big Data and Distributed Processing
1,courses/Big Data and Distributed Processing/CA...,pdf,CAP theorem and BigTable,Big Data and Distributed Processing
2,courses/Big Data and Distributed Processing/Ca...,pdf,Cassandra,Big Data and Distributed Processing
3,courses/Big Data and Distributed Processing/Co...,pdf,Consensus and Paxos algorithm,Big Data and Distributed Processing
4,courses/Big Data and Distributed Processing/Dy...,pdf,Dynamo,Big Data and Distributed Processing
5,courses/Big Data and Distributed Processing/In...,pdf,Introduction,Big Data and Distributed Processing
6,courses/Big Data and Distributed Processing/Lo...,pdf,Logical time and broadcast protocols,Big Data and Distributed Processing
7,courses/Big Data and Distributed Processing/Mo...,pdf,Models of distributed systems,Big Data and Distributed Processing
8,courses/Big Data and Distributed Processing/Or...,pdf,Ordering of distributed events and logical time,Big Data and Distributed Processing
9,courses/Big Data and Distributed Processing/Sp...,pdf,Spark,Big Data and Distributed Processing


In [410]:
def read_pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text("text"))
    return text

In [411]:
for row in pdfs.itertuples():

    pdf_text_filename =  Path(row.path).with_suffix(".json").name
    pdf_text_path = Path("./assets/processed") / pdf_text_filename
    if (Path("./assets") / pdf_text_path).exists():
        pdf_path = Path("./assets") / Path(row.path)
        print("Processing pdf:", row.path)
        pdf_text = read_pdf_text(pdf_path)
        write_array_to_file(pdf_text_path, pdf_text)
    
    try:
        assign_text_to_pdf(client, Path(row.path), Path("processed") / pdf_text_filename)
    except pycozo.client.QueryException as e:
        print("Error while assigning text for pdf:", e)
    

In [None]:
# for each file in assets/processed

processed_files = client.run("""
    ?[reference, referred_to] := *reference_assets[reference, referred_to, "text"]
""")
processed_files

In [413]:
import nltk
from nltk import sent_tokenize
from pprint import pprint

In [None]:
slide_sentences_relation = {
    "slide_sentences" : {
        "headers": ["path", "slide_id", "sentence_id", "sentence"],
        "rows": []
    }
}

for row in processed_files.itertuples():
    path = row.reference
    
    if path.endswith(".json"):
        paragraphs = json.load(open("./assets/"+path, "r"))
        for slide_id,paragraph in enumerate(paragraphs):
            sentences = sent_tokenize(paragraph.replace("\n"," "))
            for sentence_id,sentence in enumerate(sentences):
                slide_sentences_relation["slide_sentences"]["rows"].append([path, slide_id, sentence_id, sentence])

slide_sentences_relation

In [418]:
#client.run("::remove slide_sentences")
def create_slide_sentences_relation(client):
    return client.run("""
    :create slide_sentences {path, slide_id, sentence_id => sentence}
    """)
create_slide_sentences_relation(client)

Unnamed: 0,status
0,OK


In [419]:
client.import_relations(slide_sentences_relation)

In [420]:
client.run("""
    ?[path, slide_id, sentence_id, sentence] := *slide_sentences[path, slide_id, sentence_id, sentence]
""")

Unnamed: 0,path,slide_id,sentence_id,sentence
0,processed/2023_AI_BDDS_L1.json,0,0,Big Data & Distributed Processing Lecture 1: ...
1,processed/2023_AI_BDDS_L1.json,0,1,inż.
2,processed/2023_AI_BDDS_L1.json,0,2,"Anna Kobusińska, prof. PP Anna.Kobusinska@cs.p..."
3,processed/2023_AI_BDDS_L1.json,1,0,Course structure ▪ Course staff: ▪ Dr hab.
4,processed/2023_AI_BDDS_L1.json,1,1,inż.
...,...,...,...,...
4487,processed/da-lec9.json,43,1,Dodgson rule: elect alternative X that minimiz...
4488,processed/da-lec9.json,44,0,Classiﬁcation of Voting Rules and Electoral Sy...
4489,processed/da-lec9.json,45,0,Voting Rules - Summary § There exist many dif...
4490,processed/da-lec9.json,45,1,Summary Comparison of voting rules via s What...


In [421]:
client.run("""
    ::fts create slide_sentences:full_text_search {
    extractor: sentence,
    tokenizer: Simple,
    filters: [Lowercase,AlphaNumOnly, Stemmer('English')],
}
""")

Unnamed: 0,status
0,OK


In [422]:
client.run("""
        ?[slide_id, path, sentence] := ~slide_sentences:full_text_search {slide_id,path, sentence |
            query: $query,
            k: 1000
        }
        """, {"query": f"NEAR/1(transferable vote)"})

Unnamed: 0,slide_id,path,sentence
0,15,processed/da-lec9-notes.json,"Whichever the regulations, when such multiple ..."
1,15,processed/da-lec9.json,Multiple Non-transferable Vote § Winners usua...
2,16,processed/da-lec9-notes.json,"Then, it is called the single non-transferable..."
3,16,processed/da-lec9.json,Single Non-transferable Vote § Single non-tra...
4,20,processed/da-lec9-notes.json,The one in the spirit of IRV is called Single ...
5,20,processed/da-lec9.json,Single Transferable Vote - Hare-Clark System (...
6,21,processed/da-lec9.json,Single Transferable Vote - Hare-Clark System (...
7,22,processed/da-lec9.json,Single Transferable Vote - Characteristics § ...
8,44,processed/da-lec9-notes.json,"Some of the rules we discussed, including lim..."
9,44,processed/da-lec9.json,Classiﬁcation of Voting Rules and Electoral Sy...


In [None]:
results = client.run(
        """
        results[slide_id, path, sentence] := ~slide_sentences:full_text_search {slide_id,path, sentence |
            query: $query,
            k: 1000
        }
        ?[slide_id, referred_to, sentence, referred_type] := results[slide_id, path, sentence], *reference_assets[reference, referred_to, asset_type], reference=path, *courses_files[referred_to, referred_type, lecture, course]

        """,
        {"query": "social choices"},
    )

results


In [2]:
from sentence_transformers import SentenceTransformer, CrossEncoder, util

bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
bi_encoder.max_seq_length = 256
top_k = 32

In [3]:
sentences = client.run("""
    ?[slide_id,sentence_id, path, sentence] := *slide_sentences[path, slide_id, sentence_id, sentence]
    :order path, slide_id, sentence_id
    """).groupby(["path","slide_id"])

In [25]:
len("Hello. my name is Philipp and I'm from Germany hi fsadfsadafadf df sadfafsdfasffs fsf dsafsdfasfdfdsa sdfasfffa fafd safs")

121

In [24]:
bi_encoder.tokenize("Hello. my name is Philipp and I'm from Germany hi fsadfsadafadf df sadfafsdfasffs fsf dsafsdfasfdfdsa sdfasfffa fafd safs")["input_ids"].shape

torch.Size([121, 3])

In [80]:
# for each slide

#passages dict with passage, path, slide_id, start_sentence_id, end_sentence_id

passages = []
window_size = 3
max_len = 0
count = 0
for name, group in sentences:

    candidate_passage = ""
    end_idx = 0
    for start_idx in range(0, len(group)):
        candidate_passage = ""
        end_idx = start_idx
        next_passage = group.sentence.iloc[end_idx]
        
        while len(next_passage) < 256 and end_idx < len(group):
            candidate_passage = next_passage
            next_passage += group.sentence.iloc[end_idx]
            end_idx = min(end_idx + 1, len(group))
        if len(candidate_passage) == 0:
            end_idx = start_idx
            candidate_passage = group.sentence.iloc[start_idx]

        #print(candidate_passage)
        passages.append({
            "passage": candidate_passage,
            "path": group.path.iloc[0],
            "slide_id": group.slide_id.iloc[0],
            "start_sentence_id": start_idx,
            "end_sentence_id": end_idx
        })
        
        if end_idx == len(group):
            break
    # else:
    #     for sentence in group.sentence:
    #         candidate_passage += sentence + " "
    #         # if len(candidate_passage) > 256:
    #         #     count+=1
    #     passages.append({
    #         "passage": candidate_passage,
    #         "path": group.path.iloc[0],
    #         "slide_id": group.slide_id.iloc[0],
    #         "start_sentence_id": 0,
    #         "end_sentence_id": len(group)
    #     })

print(count,max_len)
print("Number of passages:", len(passages))

    


0 0
Number of passages: 3695


In [83]:
from tqdm import tqdm
for passage in tqdm(passages, desc="Embedding passages", total=len(passages)):
    passage["embedding"] = bi_encoder.encode(passage["passage"])

Embedding passages: 100%|██████████| 3695/3695 [01:37<00:00, 37.98it/s]


In [85]:
passages[0]["embedding"].shape

(384,)

In [91]:
# client.run("::remove passages")
client.run("""
    :create passages {path:String, slide_id:Int, start_sentence_id:Int, end_sentence_id:Int => embedding:<F32;384>}
""")

Unnamed: 0,status
0,OK


In [None]:
passage_relation = {
    "passages" : {
        "headers": ["path", "slide_id", "start_sentence_id", "end_sentence_id", "embedding"],
        "rows": []
    }
}
for passage in passages:
    passage_relation["passages"]["rows"].append([passage["path"], passage["slide_id"], passage["start_sentence_id"], passage["end_sentence_id"], passage["embedding"].tolist()])
passage_relation["passages"]["rows"][9]

In [101]:
client.import_relations(passage_relation)

In [102]:
client.run("""
    ?[path, slide_id, start_sentence_id, end_sentence_id, embedding] := *passages[path, slide_id, start_sentence_id, end_sentence_id, embedding]
""")

Unnamed: 0,path,slide_id,start_sentence_id,end_sentence_id,embedding
0,processed/2023_AI_BDDS_L1.json,0,0,3,"[-0.040028028190135956, 0.06684377044439316, -..."
1,processed/2023_AI_BDDS_L1.json,1,0,4,"[-0.05581993982195854, 0.04117434844374657, -0..."
2,processed/2023_AI_BDDS_L1.json,2,0,4,"[-0.05581993982195854, 0.04117434844374657, -0..."
3,processed/2023_AI_BDDS_L1.json,3,0,1,"[0.013655133545398712, 0.004581686574965715, -..."
4,processed/2023_AI_BDDS_L1.json,4,0,0,"[0.002677326090633869, -0.022596953436732292, ..."
...,...,...,...,...,...
3690,processed/da-lec9.json,42,1,1,"[-0.0014051345642656088, 0.03299049288034439, ..."
3691,processed/da-lec9.json,43,0,2,"[0.030137548223137856, -0.10110008716583252, 0..."
3692,processed/da-lec9.json,44,0,0,"[-0.035851120948791504, -0.017622200772166252,..."
3693,processed/da-lec9.json,45,0,0,"[-0.019187353551387787, -0.06824313849210739, ..."


In [103]:
client.run("""
    ::hnsw create passages:semantic {
    dim: 384,
    m: 50,
    dtype: F32,
    fields: [embedding],
    distance: Cosine,
    ef_construction: 20,
    extend_candidates: false,
    keep_pruned_connections: false,
}
""")

Unnamed: 0,status
0,OK


In [None]:
for passage in passages

In [146]:
query_embedding = bi_encoder.encode("what are nadir points")

In [168]:
passage_similarity = []
for passage in passages:
    sim = util.cos_sim(query_embedding, passage["embedding"])
    passage_similarity.append((passage["path"], passage["slide_id"], passage["start_sentence_id"], passage["end_sentence_id"], sim))

passage_similarity.sort(key=lambda x: x[4], reverse=True)
passage_similarity[:10]

[('processed/da-lec11-notes.json', 15, 0, 2, tensor([[0.5903]])),
 ('processed/da-lec11-notes.json', 15, 2, 5, tensor([[0.5238]])),
 ('processed/da-lec12.json', 30, 1, 3, tensor([[0.4785]])),
 ('processed/da-lec11-notes.json', 14, 4, 5, tensor([[0.4710]])),
 ('processed/da-lec11.json', 14, 0, 0, tensor([[0.4549]])),
 ('processed/da-lec9-notes.json', 32, 4, 6, tensor([[0.4341]])),
 ('processed/da-lec9-notes.json', 36, 6, 8, tensor([[0.3743]])),
 ('processed/da-lec9-notes.json', 41, 3, 6, tensor([[0.3667]])),
 ('processed/da-lec9-notes.json', 41, 2, 5, tensor([[0.3333]])),
 ('processed/da-lec12-notes.json', 10, 9, 12, tensor([[0.3328]]))]

In [177]:
results = client.run("""
    ?[dist, path, slide_id, start_sentence_id, end_sentence_id,embedding] := ~passages:semantic{ path, slide_id, start_sentence_id, end_sentence_id, embedding |
        query: q,
        k: 10,
        ef: 2000,
        bind_distance: dist,
    }, q = vec($query)
    :order dist
""", {"query": query_embedding.tolist()})

In [178]:
results

Unnamed: 0,dist,path,slide_id,start_sentence_id,end_sentence_id,embedding
0,0.409745,processed/da-lec11-notes.json,15,0,2,"[-0.03984746336936951, -0.003164630616083741, ..."
1,0.476207,processed/da-lec11-notes.json,15,2,5,"[-0.013826031237840652, 0.03698691725730896, -..."
2,0.521534,processed/da-lec12.json,30,1,3,"[0.021153435111045837, 0.030901985242962837, -..."
3,0.528978,processed/da-lec11-notes.json,14,4,5,"[-0.002994836075231433, -0.053591538220644, -0..."
4,0.545065,processed/da-lec11.json,14,0,0,"[-0.0051970453932881355, -0.021439701318740845..."
5,0.565863,processed/da-lec9-notes.json,32,4,6,"[0.025164050981402397, 0.024886319413781166, -..."
6,0.625654,processed/da-lec9-notes.json,36,6,8,"[0.011151257902383804, -0.005607254337519407, ..."
7,0.63331,processed/da-lec9-notes.json,41,3,6,"[0.06935732811689377, 0.020043212920427322, -0..."
8,0.666722,processed/da-lec9-notes.json,41,2,5,"[0.0754750445485115, 0.08729206025600433, -0.0..."
9,0.667171,processed/da-lec12-notes.json,10,9,12,"[-0.06814120709896088, -0.13321511447429657, -..."


In [180]:
util.cos_sim(query_embedding, embedding)

tensor([[0.5903]])

In [182]:
path, slide_id, start_sentence_id, end_sentence_id = results.path[0], results.slide_id[0], results.start_sentence_id[0], results.end_sentence_id[0]

client.run("""
    ?[path, slide_id, sentence_id, sentence] := *slide_sentences[path, slide_id, sentence_id, sentence], path=$path, slide_id=$slide_id, sentence_id >= $start_sentence_id, sentence_id <= $end_sentence_id
""", {"path": path, "slide_id": slide_id, "start_sentence_id": start_sentence_id, "end_sentence_id": end_sentence_id}).to_dict(orient="records")

[{'path': 'processed/da-lec11-notes.json',
  'slide_id': 15,
  'sentence_id': 0,
  'sentence': ' Since, in practice, the entire Pareto front is unknown, the nadir point can only be approximated.'},
 {'path': 'processed/da-lec11-notes.json',
  'slide_id': 15,
  'sentence_id': 1,
  'sentence': 'It is much easier to determine the max point with the worst objective  values of the entire objective space.'},
 {'path': 'processed/da-lec11-notes.json',
  'slide_id': 15,
  'sentence_id': 2,
  'sentence': 'Therefore, it is often used as an estimate of the nadir point.'}]

In [None]:
client.run(
            """
            ?[path, slide_id, sentence_id, sentence] := *slide_sentences[path, slide_id, sentence_id, sentence], path=$path, slide_id=$slide_id, sentence_id >= $start_sentence_id, sentence_id <= $end_sentence_id
        """,
            {
                "path": path,
                "slide_id": slide_id,
                "start_sentence_id": start_sentence_id,
                "end_sentence_id": end_sentence_id,
            },
        ).sentence.tolist()

In [None]:
bi_encoder.encode(passages, convert_to_tensor=True, show_progress_bar=True)

In [318]:
for result in results.to_dict(orient="records"):
    print(result["slide_id"], result["sentence"])

0 These include game theory, social choice theory, and multiple  objective optimization.
0  In the following two lectures, we will talk about social choice theory.
0 If you remembered the list of the recent hot topics in Artiﬁcial Intelligence, the computational social choice was in there.
0 Decision Analysis Introduction to social choice theory:  group decision making by voting  Miłosz Kadziński  Institute of Computing Science  Poznan University of Technology, Poland  Artiﬁcial Intellience Decision Analysis
0  In the previous lecture devoted to the social choice theory, you learned a few tens of voting rules and electoral systems.
1 Logic:  modeling the reasoning patterns  of agents engaging in strategic  interaction Mathematics:  infinite games (set theory) Economy:  Linguistics Biology Game theory Game theory interaction Philosophy:  analysis of the conflicts arising  between what people ought to do  and what they actually do (ethics) Economy:  modeling competing behaviors  of inter

In [None]:
#todo videos