In [1]:
from pycozo.client import Client

In [58]:
# read docs
paths = [
    r"../assets/text/da-lec1-notes.txt",
    r"../assets/text/da-lec2-notes.txt",
    r"../assets/text/da-lec3-notes.txt",
    r"../assets/text/da-lec4-notes.txt",
    r"../assets/text/da-lec5-notes.txt",
    r"../assets/text/da-lec6-notes.txt",
    r"../assets/text/da-lec7-notes.txt",
    r"../assets/text/da-lec8-notes.txt",
    r"../assets/text/da-lec9-notes.txt",
    r"../assets/text/da-lec10-notes.txt",
    r"../assets/text/da-lec11-notes.txt",
    r"../assets/text/da-lec12-notes.txt",
]
from pathlib import Path

paths = [Path(p) for p in paths]

In [59]:
import pandas as pd

document_relation = pd.DataFrame(
    columns=["document_name","document_path"],
    data=[(path.name, path) for i, path in enumerate(paths)]
)


In [2]:
client = Client('sqlite', 'file.db')

In [10]:
d = client.run("""

?[sentence, paragraph_id, doc_name] := ~sentence:phrases {paragraph_id,doc_name, sentence |

 query: $query,

 k: 1000

 }
""", {"query": f"NEAR/1(dominating strategy)"})
d.to_markdown("example.md")

First create a document path - document id relation

In [383]:
client.run(
    """
    :create document {document_name => document_path}
    """
)

Unnamed: 0,status
0,OK


In [384]:
for document_name, document_path in document_relation.values:
    client.run("""
    ?[document_name, document_path] <- [[$document_name, $document_path]]
    :put document {document_name => document_path}
    """,{"document_name":str(document_name),"document_path":str(document_path)})


In [385]:
client.run("""
    :create sentence {sentence_id, paragraph_id, doc_name => sentence} 
""")

Unnamed: 0,status
0,OK


In [52]:
client.run("""
    :create phrases {phrase:String, doc_name:String, paragraph_id:Int?}
""")

Unnamed: 0,status
0,OK


In [53]:
# read phrases 
import json
with open(r"../data/annotated_decision_analysis.json","r") as f:
    phrases = json.load(f)
phrases = {k+".txt":v for k,v in phrases.items()}
phrases.keys()

dict_keys(['da-lec8-notes.txt.txt', 'da-lec9-notes.txt.txt', 'da-lec10-notes.txt.txt', 'da-lec11-notes.txt.txt', 'da-lec12-notes.txt.txt'])

In [55]:
from tqdm import tqdm
from nltk import sent_tokenize

In [389]:
sentence_dict = {
    "sentence": {
        "headers": ["sentence_id", "paragraph_id", "doc_name", "sentence"],
        "rows": [],
    }
}
for path in paths:
    with open(path, 'r') as f:
        paragraphs = f.read().splitlines()
    doc_name = path.name
    doc_path = path
    for paragraph_id,paragraph in enumerate(paragraphs):
        if len(paragraph) > 0:
            sents = sent_tokenize(paragraph)
            for sentence_id, sent in enumerate(sents):
                sentence_dict["sentence"]["rows"].append(
                    [sentence_id, paragraph_id, doc_name, sent]
                )

# sentence_dict
client.import_relations(sentence_dict)

# tx.commit()


In [390]:
client.run("""
    ::fts create sentence:phrases {
    extractor: sentence,
    tokenizer: Simple,
    filters: [Lowercase,AlphaNumOnly, Stemmer('English')],
}
""")

Unnamed: 0,status
0,OK


In [60]:
phrase_dict = {
    "phrases": {
        "headers": ["phrase", "doc_name", "paragraph_id"],
        "rows": []
    }
}
all_phrases = [phrase for doc_phrases in phrases.values() for phrase in doc_phrases]
all_phrases = list(set(all_phrases))
for original_phrase in tqdm(all_phrases,desc="phrases",total=len(all_phrases)):
    #replace non alphanumeric characters with space
    phrase = "".join([c if c.isalnum() else " " for c in original_phrase])
    # split phrase into 2 word spans
    phrase = phrase.split()
    span = " ".join(phrase[0:2])

    total = client.run("""
        ?[paragraph_id, doc_name] := ~sentence:phrases {paragraph_id,doc_name, sentence |
            query: $query,
            k: 1000
        }
        """, {"query": f"NEAR/1({span})"})
    
    for i in range(1,len(phrase)-1):
        span = " ".join(phrase[i:i+2])
        results = client.run("""
        ?[paragraph_id, doc_name] := ~sentence:phrases {paragraph_id,doc_name, sentence |
            query: $query,
            k: 1000
        }
        """, {"query": f"NEAR/1({span})"})
        total = pd.merge(total,results,how="inner")
    for paragraph_id, doc_name in total.values:
        phrase_dict["phrases"]["rows"].append([original_phrase, doc_name, paragraph_id])
        
client.import_relations(phrase_dict)

phrases: 100%|██████████| 431/431 [00:03<00:00, 131.73it/s]


In [65]:
client.run("""
    ?[phrases] := *phrases[phrases, doc_name, paragraph_id]
    """)

Unnamed: 0,phrases
0,ASF
1,Alabama paradox
2,Arrow 's theorem
3,Baldwin method
4,Banzhaf power index
...,...
425,weighted majority graph
426,weighted sum method
427,weighted voting games
428,winner-turns-loser paradox


In [66]:
notes_path = Path(r"C:\Users\kryst\Documents\Personal\university\ai_sem6\nlp\testing")
notes_path

WindowsPath('C:/Users/kryst/Documents/Personal/university/ai_sem6/nlp/testing')

In [67]:
original_phrase

'dictatorship'

In [70]:
def get_paragraphs(original_phrase):
    dataframe = client.run("""
        paragraphs[doc_name, paragraph_id] := *phrases[$phrase, doc_name, paragraph_id],
        ?[sentence,sentence_id,paragraph_id,doc_name] := *sentence[sentence_id, paragraph_id, doc_name, sentence],paragraphs[doc_name, paragraph_id]
        :order doc_name, paragraph_id, sentence_id
        """,{"phrase":original_phrase})
    # for each combination of paragraph_id and doc_name, get the paragraph
    return dataframe.groupby(["doc_name","paragraph_id"]).agg({"sentence":lambda x: " ".join(x)})
    
    
df = get_paragraphs("dictatorship")

In [71]:
list(df.iterrows())

[(('da-lec10-notes.txt', 12),
  sentence     At this stage, you already know some properti...
  Name: (da-lec10-notes.txt, 12), dtype: object),
 (('da-lec10-notes.txt', 13),
  sentence     Now is the time to present the most famous th...
  Name: (da-lec10-notes.txt, 13), dtype: object),
 (('da-lec10-notes.txt', 21),
  sentence     Before we discuss another central result in v...
  Name: (da-lec10-notes.txt, 21), dtype: object),
 (('da-lec10-notes.txt', 28),
  sentence     To sum up what we have discussed so far, an a...
  Name: (da-lec10-notes.txt, 28), dtype: object)]

In [72]:
df.groupby(["doc_name","paragraph_id"]).all()

Unnamed: 0_level_0,Unnamed: 1_level_0,sentence
doc_name,paragraph_id,Unnamed: 2_level_1
da-lec10-notes.txt,12,True
da-lec10-notes.txt,13,True
da-lec10-notes.txt,21,True
da-lec10-notes.txt,28,True


In [74]:
for original_phrase in tqdm(all_phrases, desc="notes", total=len(all_phrases)):
    md_text = ""
    paragraphs = get_paragraphs(original_phrase)
    for paragraph_description,paragraph in paragraphs.iterrows():
        md_text += f"\n\n `document : {paragraph_description[0]} slide: {paragraph_description[1]}`"
        md_text += f"\n\n {paragraph['sentence']}"
    md_text += "\n\n## See also:"
    adjacent = client.run("""
        ?[phrase] := *phrases[phrase, doc_name, paragraph_id], *phrases[$phrase, doc_name, paragraph_id], phrase!= $phrase
        """,{"phrase":original_phrase}).values
    for phrase in adjacent:
        md_text += f"\n[[{phrase[0]}]]"
    cleaned_phrase = "".join([c if c.isalnum() else " " for c in original_phrase])
    with open(notes_path / f"{cleaned_phrase}.md","w") as f:
        f.write(md_text)


notes: 100%|██████████| 431/431 [00:30<00:00, 14.29it/s]


In [79]:
pagerank = client.run("""
        phrase_graph[phrase1,phrase2] := *phrases[phrase1, "da-lec12-notes.txt", paragraph_id], *phrases[phrase2, "da-lec12-notes.txt", paragraph_id], phrase1!= phrase2
        ?[phrase,page_rank] <~ PageRank(phrase_graph[])
        :order -page_rank
        """)

In [80]:
pagerank.head(20)

Unnamed: 0,phrase,page_rank
0,multi-objective optimization,0.024573
1,objective optimization,0.024567
2,dominated solutions,0.023775
3,non-dominated solutions,0.023771
4,evolutionary algorithm,0.022128
5,recombination,0.021444
6,genetic algorithm,0.020552
7,Pareto frontier,0.018569
8,crowding distance,0.016729
9,reference point,0.016642


In [81]:
community_detection_louvain = client.run("""
        phrase_graph[phrase1,phrase2] := *phrases[phrase1, "da-lec12-notes.txt", paragraph_id], *phrases[phrase2, "da-lec12-notes.txt", paragraph_id], phrase1!= phrase2
        ?[community_index,node_index] <~ CommunityDetectionLouvain(phrase_graph[])
        """)

In [83]:
community_detection_louvain

Unnamed: 0,community_index,node_index
0,"[0, 0]",Chebyshev function
1,"[0, 0]",MOEA/D
2,"[0, 0]",evolutionary algorithm
3,"[0, 0]",multiple objective problem
4,"[0, 0]",objective problem
...,...,...
114,"[6, 9]",evolved population
115,"[6, 9]",preference information
116,"[6, 10]",generation model
117,"[6, 10]",population management models


In [82]:
community_detection_louvain.to_csv("community_detection_louvain.csv")