In [None]:
# Create a pyenv virtualenv named "myenv" based on Python 3.8.2.
# NOTE(review): this only creates the environment; presumably the kernel has to be
# started from it separately for the installs below to land there -- confirm.
!pyenv virtualenv 3.8.2 myenv

In [None]:
!pip install mwxml
!pip install tqdm
!pip install mwxml psycopg2-binary
!pip install ipywidgets
!pip install groq


In [None]:
import mwxml
import psycopg2
from psycopg2.extras import execute_batch
from tqdm import tqdm
from datetime import datetime, timezone
import os

# Connection string for the PostgreSQL instance. It can be overridden through the
# WIKI_DB_URL environment variable so that real credentials never have to be
# written into the notebook; the fallback is the original local development URL.
conn_string = os.environ.get("WIKI_DB_URL", "postgresql://postgres:postgres@localhost:6666")

# Database connection URL
db_url = conn_string
# Connect to the PostgreSQL database
conn = psycopg2.connect(db_url)
def SQL(query, conn_string = conn_string):
    """Run ``query`` on a fresh connection, commit, and return any rows.

    Args:
        query: SQL text passed unchanged to ``cursor.execute``.
        conn_string: PostgreSQL connection string; defaults to the notebook-wide one.

    Returns:
        The list of rows from ``cursor.fetchall()`` when the statement produced a
        result set, otherwise ``None`` (e.g. DDL or other row-less statements).

    Raises:
        psycopg2.Error: whatever the driver raises for a failing connection or query.
    """
    conn = psycopg2.connect(conn_string)
    try:
        # `with conn` commits on success and rolls back on error, but psycopg2
        # does not close the connection when the block ends.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(query)
                # description is None when the last statement returned no rows;
                # checking it avoids matching on an exception message.
                if cursor.description is None:
                    return None
                return cursor.fetchall()
    finally:
        # Release the connection on every path so repeated calls do not leak.
        conn.close()


In [None]:

# DDL for the target table. An existing wikipedia_pages table is dropped first,
# so running this cell always starts from an empty table.
create_table_query = """
DROP TABLE IF EXISTS wikipedia_pages;
CREATE TABLE IF NOT EXISTS wikipedia_pages (
    id SERIAL PRIMARY KEY,
    page_title TEXT,
    revision_id BIGINT,
    timestamp TIMESTAMP,
    contributor TEXT,
    text TEXT
);
"""

# Open the notebook-level connection and cursor that insert_pages reuses.
db_url = conn_string
conn = psycopg2.connect(db_url)
cursor = conn.cursor()

# Run the DDL and commit it before any rows are inserted.
cursor.execute(create_table_query)
conn.commit()

def insert_pages(pages):
    """Insert a batch of rows into wikipedia_pages and commit.

    Args:
        pages: Sequence of 5-item rows ordered as (page_title, revision_id,
            timestamp, contributor, text).

    Relies on the notebook-level ``cursor`` and ``conn``.
    """
    execute_batch(
        cursor,
        """
    INSERT INTO wikipedia_pages (page_title, revision_id, timestamp, contributor, text)
    VALUES (%s, %s, %s, %s, %s);
    """,
        pages,
    )
    conn.commit()

# Load and parse the Wikipedia dump, inserting rows in batches of `batch_size`
batch_size = 1000
pages = []

dump_path = "hywiki-20240701-pages-articles.xml"

try:
    with open(dump_path, 'rb') as f:
        dump = mwxml.Dump.from_file(f)
        for page in tqdm(dump):
            for revision in page:
                # Same intent as the previous hasattr guard, extended so that a
                # contributor that is present but None is stored as NULL instead
                # of raising AttributeError.
                contributor = getattr(revision, 'contributor', None)
                pages.append(
                    (
                        page.title,
                        revision.id,
                        datetime.fromisoformat(str(revision.timestamp)),
                        getattr(contributor, 'user_text', None),
                        revision.text
                    )
                )
                if len(pages) >= batch_size:
                    insert_pages(pages)
                    pages = []

        # Flush the final, partially filled batch
        if pages:
            insert_pages(pages)
finally:
    # Close the database connection even when parsing or inserting fails
    cursor.close()
    conn.close()


In [None]:
# open a connection via a context and run this
# NOTE(review): add_tsvector_column is not defined or imported anywhere in the
# visible notebook, so this call fails on a fresh kernel unless it is supplied
# elsewhere. Later cells query a `text_simple` column that the CREATE TABLE
# statement does not declare -- presumably this helper adds it; confirm.

add_tsvector_column('wikipedia_pages', 'text')

In [None]:
# write a query retrieving all the rows related to armenian mathematicians

# Prints the titles of up to 100 pages whose text_simple contains the given
# Armenian substring (case-insensitive match).
conn = psycopg2.connect(conn_string)
try:
    # `with conn` only scopes the transaction; psycopg2 leaves the connection
    # open afterwards, so it is closed explicitly in the finally clause.
    with conn:
        with conn.cursor() as cursor:
            cursor.execute("""
SELECT page_title FROM wikipedia_pages
WHERE text_simple ilike  '%մաթեմատիկակա%' LIMIT 100;
""")
            for row in cursor:
                print(row)
finally:
    conn.close()

In [None]:

# Preview the tokenisation used by the later cells on 10 rows: every run of
# non-word characters in text_simple is replaced by one space, the result is
# lower-cased and split on spaces into an array (returned as `word`), shown next
# to the original text_simple. The Python literal '\\W+' reaches Postgres as \W+.
SQL("""
    SELECT (string_to_array(lower(regexp_replace(text_simple, '\\W+', ' ', 'g')), ' ')) AS word, text_simple
    FROM wikipedia_pages LIMIT 10
""")

In [None]:
# Number of wikipedia_pages rows used when building the unique_terms table.
subset_size = 100_000

In [None]:
# NOTE(review): the query text is empty (whitespace only), so this cell sends no
# usable statement to the database and is expected to fail in the driver -- confirm.
# Presumably a placeholder for the statements that build term_frequency and
# total_words, which later cells read but no visible cell creates -- confirm.
SQL(f"""

""")

In [None]:
# Spot-check one stored page: title and raw text of the row with id 79.
# id is the table's primary key, so at most one row comes back despite LIMIT 10.
SQL("SELECT page_title, text from wikipedia_pages where id = 79 LIMIT 10")

In [None]:
# Term frequency per (id, word): tf.frequency divided by tw.total (presumably the
# page's total word count -- confirm), shown rounded to 2 decimals and sorted by
# the unrounded ratio, highest first.
# NOTE(review): term_frequency and total_words are not created by any visible
# cell; they must already exist in the database for this query to run.
SQL("""
    
    SELECT
    tf.id,
    tf.word,
    tf.page_title,
    tf.frequency,
    tw.total,
    round(tf.frequency::decimal / tw.total, 2)::float AS term_frequency
FROM
    term_frequency tf
JOIN
    total_words tw ON tf.id = tw.id
ORDER BY
    tf.frequency::decimal / tw.total DESC, tf.id, tf.word;
    
    
    """)

# IDF

In [None]:
# (Re)build unique_terms: one row per distinct (id, word) pair, where the words
# come from tokenising text_simple (non-word runs -> space, lower-case, split on
# spaces) for the first `subset_size` rows returned from wikipedia_pages.
# The inner LIMIT has no ORDER BY, so which rows form the subset is up to Postgres.
# The f-string only interpolates subset_size; '\\W+' reaches Postgres as \W+.
SQL(f"""
DROP TABLE IF EXISTS unique_terms;
CREATE TABLE unique_terms AS
SELECT
    id,
    UNNEST(string_to_array(lower(regexp_replace(text_simple, '\\W+', ' ', 'g')), ' ')) AS word
FROM
    (SELECT * FROM wikipedia_pages LIMIT {subset_size}) a
GROUP BY
    id, word;
--SELECT word, count(word) from unique_terms group by word order by count(word) desc;
""")

In [None]:
# (Re)build doc_frequency from unique_terms: for every word, doc_count is the
# number of distinct ids (pages) in which that word occurs.
SQL("""
-- calculated in how many documents the given unique term appears in
DROP TABLE IF EXISTS doc_frequency;
CREATE TABLE doc_frequency AS
SELECT
    word,
    COUNT(DISTINCT id) AS doc_count
FROM
    unique_terms
GROUP BY
    word;
    """)



    
# -- Calculate the total number of documents
# WITH total_docs AS (
#     SELECT COUNT(*) AS total FROM wikipedia_pages
# )

# -- Calculate the number of documents containing each term
# ,
# -- Calculate the IDF for each term
# idfs AS (
#     SELECT
#         word,
#         LOG((SELECT total FROM total_docs)::decimal / df.doc_count) AS idf
#     FROM
#         doc_frequency df

# )
# SELECT * from idfs order by idf desc;

In [None]:
# requires CREATE EXTENSION fuzzystrmatch;
# For the probe word, list every word in unique_terms whose Levenshtein distance
# to it is 0 or 1 (< 2), with the number of unique_terms rows holding that word.
SQL("""
    SELECT word, count(*) from unique_terms where levenshtein('աստղակերպերի', word) < 2 group by word;
""")

In [None]:
# Rank pages by the summed TF-IDF of the query words, best match first.
# Per page (grouped by tf.id) the row holds: the page title, the matched words,
# their frequencies, their document counts, the page's token count, and the score
#   SUM(frequency / token_count * LOG(total_pages / doc_count)).
# The page count is cast to decimal before dividing: COUNT(*) and doc_count are
# both integers, so without the cast the ratio would be truncated before LOG
# (e.g. 100 / 60 -> 1 -> LOG = 0).
# NOTE(review): term_frequency is not created by any visible cell, and the corpus
# here is limited to 10000 rows while the page count covers the whole table -- confirm.
SQL("""
with res as (

    SELECT
        (array_agg(corpus.page_title))[1],
        array_agg(tf.word),
        array_agg(tf.frequency),
        array_agg(df.doc_count),
        array_agg(cardinality(string_to_array(lower(regexp_replace(text_simple, '\\W+', ' ', 'g')), ' '))),
        SUM(tf.frequency::decimal / cardinality(string_to_array(lower(regexp_replace(text_simple, '\\W+', ' ', 'g')), ' '))
        * LOG((SELECT COUNT(*) AS count FROM wikipedia_pages)::decimal / df.doc_count)) AS tf_idf
        
        
    FROM
        (SELECT * FROM wikipedia_pages LIMIT 10000) corpus
    JOIN term_frequency tf ON corpus.id = tf.id
    JOIN doc_frequency df ON tf.word = df.word
    WHERE
        tf.word = ANY(string_to_array('պետություն մայրաքաղաք բութան տոգո', ' '))
    GROUP BY
        tf.id
    ) SELECT * from res order by tf_idf desc;
    """)

In [None]:
# Create (or replace) the PL/pgSQL function search_documents(query TEXT).
# It splits the query on single spaces, sums the tf_idf values of the matching
# words per id from the tf_idf table, and returns (id, score) rows ordered by
# score, highest first.
# NOTE(review): no visible cell creates the tf_idf table (columns id, word,
# tf_idf are read here); it must exist before the function is called -- confirm.
SQL("""
-- Create a function to search based on a query string using TF-IDF
CREATE OR REPLACE FUNCTION search_documents(query TEXT)
RETURNS TABLE(id INT, score DECIMAL) AS $$
DECLARE
    query_words TEXT[];
BEGIN
    -- Tokenize the query string into an array of words
    query_words := string_to_array(query, ' ');

    RETURN QUERY
    SELECT
        tf.id,
        SUM(tf.tf_idf) AS score
    FROM
        tf_idf tf
    WHERE
        tf.word = ANY(query_words)
    GROUP BY
        tf.id
    ORDER BY
        score DESC;
END;
$$ LANGUAGE plpgsql;

""")


## Almost Unique terms

There are too many unique terms as calculated above:
~13M for the 100_000 subset of the wiki data. The problem is that lemmatization does not work for Armenian in Postgres. Luckily, we can use Levenshtein distance to treat words with a small edit distance as the same!

The problem is that finding almost-unique words in the unique_terms table this way is an O(N^2) algorithm.

In [None]:
# Count the (id, word) rows, among 10000 rows of unique_terms, whose word is at
# Levenshtein distance >= 2 from the word of every row with a different id.
# The statement is passed through SQL() like the other cells; bare SQL is not
# valid Python in a code cell.
# requires CREATE EXTENSION fuzzystrmatch; (provides levenshtein)
# The self cross join evaluates levenshtein for up to 10000 x 10000 row pairs.
# NOTE(review): id in unique_terms is the page id, so the same word occurring in
# another page has distance 0 and removes the row from the count -- confirm this
# is intended rather than comparing distinct words.
SQL("""
WITH word_distances AS (
    SELECT
        w1.id AS id1,
        w1.word AS word1,
        w2.id AS id2,
        w2.word AS word2,
        levenshtein(w1.word, w2.word) AS distance
    FROM
        (SELECT * FROM unique_terms LIMIT 10000) w1,
        (SELECT * FROM unique_terms LIMIT 10000) w2
    WHERE
        w1.id <> w2.id
),
unique_words AS (
    SELECT
        id1 AS id,
        word1 AS word
    FROM
        word_distances
    GROUP BY
        id1, word1
    HAVING
        MIN(distance) >= 2
)
SELECT COUNT(*)
FROM unique_words;
""")

We can improve the situation by defining HNSW vector index operator classes for the text data type, with Levenshtein distance as the metric!