# Import / Config

In [1]:
# Enable IPython's autoreload extension in mode 2, so that edited modules
# (e.g. edurel.utils.duckdb) are re-imported before each cell is executed.
%load_ext autoreload
%autoreload 2

In [17]:
import os
from pathlib import Path
from dotenv import load_dotenv
import edurel.utils.duckdb as ddbu

# Pull settings from a .env file (if any) into the process environment.
load_dotenv()

# Root directory of the project data; every database path below hangs off it.
BASE_DIR = os.getenv("BASE_DIR")
if not BASE_DIR:
    # Without this check DB_DIR would silently become "None/databases" and
    # the failure would only surface later as a confusing file-not-found.
    raise RuntimeError(
        "BASE_DIR is not set; define it in the environment or in a .env file"
    )

# Directory that holds one sub-directory per database (e.g. db-sozmed).
DB_DIR = f"{BASE_DIR}/databases"


In [57]:
# Release a connection left over from an earlier run of this notebook.
# On a fresh kernel `con` does not exist yet (it is created further down),
# so a missing name is expected here and must not abort a top-to-bottom run.
try:
    con.close()
except NameError:
    pass

In [None]:
# Convert the pipe-delimited CSV exports of db-sozmed into Parquet files.
# fn="*" presumably selects every file in in_path -- confirm against
# edurel.utils.duckdb.csv_to_parquet.
ddbu.csv_to_parquet(
    fn="*",
    spec="delim='|', header=True, AUTO_DETECT=TRUE",
    in_path=f"{DB_DIR}/db-sozmed/csv",
    out_path=f"{DB_DIR}/db-sozmed/parquet",
    verbose=True,
)

In [50]:
# Open the db-sozmed database in read-write mode; all cells below share
# this single connection object.
con = ddbu.file_con(
    f"{DB_DIR}/db-sozmed",
    read_only=False,
)

In [51]:
# Schema definition for the db-sozmed database (17 tables).
#
# - Tables are declared so that every table named in a REFERENCES clause
#   already exists when it is referenced (TagClass before Tag, Country
#   before City/Company, City before University/Person, ...).
# - The two self-referencing columns (TagClass.SubclassOfTagClassId and
#   Message.ParentMessageId) are plain bigint columns; their REFERENCES
#   clauses are commented out inside the SQL.
# - Country.PartOfContinentId has no REFERENCES clause; no continent table
#   is defined in this schema.
# - Person_knows_Person is the only relationship table with a primary key;
#   the other *_has*/_likes_/_studyAt_/_workAt_ tables declare none.
# - The statements are plain CREATE TABLE, so re-running this cell against
#   a database that already contains these tables is expected to fail.
sql = """
CREATE TABLE TagClass (
    id bigint PRIMARY KEY,
    name text NOT NULL,
    url text NOT NULL,
    SubclassOfTagClassId bigint
    -- SubclassOfTagClassId bigint REFERENCES TagClass(id) -- null for the root TagClass (Thing)
);

CREATE TABLE Tag (
    id bigint PRIMARY KEY,
    name text NOT NULL,
    url text NOT NULL,
    TypeTagClassId bigint NOT NULL REFERENCES TagClass(id)
);

CREATE TABLE Country (
    id bigint PRIMARY KEY,
    name varchar(256) NOT NULL,
    url varchar(256) NOT NULL,
    PartOfContinentId bigint
);

CREATE TABLE City (
    id bigint PRIMARY KEY,
    name varchar(256) NOT NULL,
    url varchar(256) NOT NULL,
    PartOfCountryId bigint REFERENCES Country(id)
);

CREATE TABLE Company (
    id bigint PRIMARY KEY,
    name varchar(256) NOT NULL,
    url varchar(256) NOT NULL,
    LocationPlaceId bigint NOT NULL REFERENCES Country(id)
);

CREATE TABLE University (
    id bigint PRIMARY KEY,
    name varchar(256) NOT NULL,
    url varchar(256) NOT NULL,
    LocationPlaceId bigint NOT NULL REFERENCES City(id)
);

CREATE TABLE Person (
    creationDate timestamp with time zone NOT NULL,
    id bigint PRIMARY KEY,
    firstName text NOT NULL,
    lastName text NOT NULL,
    gender text NOT NULL,
    birthday date NOT NULL,
    locationIP text NOT NULL,
    browserUsed text NOT NULL,
    LocationCityId bigint NOT NULL REFERENCES City(id),
    speaks text NOT NULL,
    email text NOT NULL
);

CREATE TABLE Forum (
    creationDate timestamp with time zone NOT NULL,
    id bigint PRIMARY KEY,
    title text NOT NULL,
    ModeratorPersonId bigint REFERENCES Person(id) -- can be null as its cardinality is 0..1
);

CREATE TABLE Forum_hasMember_Person (
    creationDate timestamp with time zone NOT NULL,
    ForumId bigint NOT NULL REFERENCES Forum(id),
    PersonId bigint NOT NULL REFERENCES Person(id)
);

CREATE TABLE Forum_hasTag_Tag (
    creationDate timestamp with time zone NOT NULL,
    ForumId bigint NOT NULL REFERENCES Forum(id),
    TagId bigint NOT NULL REFERENCES Tag(id)
);

CREATE TABLE Person_hasInterest_Tag (
    creationDate timestamp with time zone NOT NULL,
    PersonId bigint NOT NULL REFERENCES Person(id),
    TagId bigint NOT NULL REFERENCES Tag(id)
);

CREATE TABLE Person_studyAt_University (
    creationDate timestamp with time zone NOT NULL,
    PersonId bigint NOT NULL REFERENCES Person(id),
    UniversityId bigint NOT NULL REFERENCES University(id),
    classYear int NOT NULL
);

CREATE TABLE Person_workAt_Company (
    creationDate timestamp with time zone NOT NULL,
    PersonId bigint NOT NULL REFERENCES Person(id),
    CompanyId bigint NOT NULL REFERENCES Company(id),
    workFrom int NOT NULL
);

CREATE TABLE Person_knows_Person (
    creationDate timestamp with time zone NOT NULL,
    Person1id bigint NOT NULL REFERENCES Person(id),
    Person2id bigint NOT NULL REFERENCES Person(id),
    PRIMARY KEY (Person1id, Person2id)
); 

CREATE TABLE Message (
    creationDate timestamp with time zone NOT NULL,
    id bigint PRIMARY KEY,
    language varchar(80),
    content varchar(2000),
    imageFile varchar(80),
    locationIP varchar(80) NOT NULL,
    browserUsed varchar(80) NOT NULL,
    length int NOT NULL,
    CreatorPersonId bigint NOT NULL REFERENCES Person(id),
    ContainerForumId bigint REFERENCES Forum(id),
    LocationCountryId bigint NOT NULL REFERENCES Country(id),
    ParentMessageId bigint
    -- ParentMessageId bigint REFERENCES Message(id)
);

CREATE TABLE Person_likes_Message (
    creationDate timestamp with time zone NOT NULL,
    PersonId bigint NOT NULL REFERENCES Person(id),
    MessageId bigint NOT NULL REFERENCES Message(id)
);

CREATE TABLE Message_hasTag_Tag (
    creationDate timestamp with time zone NOT NULL,
    MessageId bigint NOT NULL REFERENCES Message(id),
    TagId bigint NOT NULL REFERENCES Tag(id)
);

"""

# Run all CREATE TABLE statements in one call on the shared connection.
con.execute(sql)

<_duckdb.DuckDBPyConnection at 0x7fa5bcbb70f0>

In [29]:
# Print the schema visible through the connection; the output below shows
# one "Table: <name> (<columns>)" line per table.
ddbu.schema_print(con)

Table: City (id BIGINT NOT NULL, name VARCHAR NOT NULL, url VARCHAR NOT NULL, PartOfCountryId BIGINT NULL)
Table: Company (id BIGINT NOT NULL, name VARCHAR NOT NULL, url VARCHAR NOT NULL, LocationPlaceId BIGINT NOT NULL)
Table: Country (id BIGINT NOT NULL, name VARCHAR NOT NULL, url VARCHAR NOT NULL)
Table: Forum (creationDate TIMESTAMP WITH TIME ZONE NOT NULL, id BIGINT NOT NULL, title VARCHAR NOT NULL, ModeratorPersonId BIGINT NULL)
Table: Forum_hasMember_Person (creationDate TIMESTAMP WITH TIME ZONE NOT NULL, ForumId BIGINT NOT NULL, PersonId BIGINT NOT NULL)
Table: Forum_hasTag_Tag (creationDate TIMESTAMP WITH TIME ZONE NOT NULL, ForumId BIGINT NOT NULL, TagId BIGINT NOT NULL)
Table: Message (creationDate TIMESTAMP WITH TIME ZONE NOT NULL, id BIGINT NOT NULL, language VARCHAR NULL, content VARCHAR NULL, imageFile VARCHAR NULL, locationIP VARCHAR NOT NULL, browserUsed VARCHAR NOT NULL, length INTEGER NOT NULL, CreatorPersonId BIGINT NOT NULL, ContainerForumId BIGINT NULL, LocationCo

In [None]:
# Bulk-load every table from its Parquet file under db-sozmed/parquet.
#
# The order is significant: a table must be loaded after every table it
# references through a foreign key (e.g. Tag after TagClass, Message after
# Person/Forum/Country), otherwise the COPY violates the constraint.
#
# Person_knows_Person is created and indexed elsewhere in this notebook but
# was missing from the original load sequence, which left it empty.
# NOTE(review): assumes Person_knows_Person.parquet was produced by the CSV
# conversion step and contains each (Person1id, Person2id) pair only once
# (the table has a composite primary key) -- confirm against the data.
LOAD_ORDER = (
    "TagClass",
    "Tag",
    "Country",
    "City",
    "Company",
    "University",
    "Person",
    "Forum",
    "Forum_hasMember_Person",
    "Forum_hasTag_Tag",
    "Person_hasInterest_Tag",
    "Person_studyAt_University",
    "Person_workAt_Company",
    "Person_knows_Person",
    "Message",
    "Person_likes_Message",
    "Message_hasTag_Tag",
)

for tn in LOAD_ORDER:
    # The file name is derived from the table name: <table>.parquet.
    sql = f"""
COPY {tn} 
FROM '{DB_DIR}/db-sozmed/parquet/{tn}.parquet' (FORMAT PARQUET);
"""
    con.execute(sql)

<_duckdb.DuckDBPyConnection at 0x7fa5bcbb70f0>

In [None]:
# Spot-check the load: print all rows of TagClass.
sql = "\nSELECT * from TagClass\n"

ddbu.sql_print(con, sql)

In [55]:
# Stand-alone (re)load of Message_hasTag_Tag.
#
# The bulk-load cell earlier in this notebook already loads this table, and
# the table has no primary key, so running both cells would silently insert
# every row twice. Only load when the table is still empty.
tn = "Message_hasTag_Tag"
existing_rows = con.execute(f"SELECT count(*) FROM {tn}").fetchone()[0]
if existing_rows == 0:
    sql = f"""
COPY {tn} 
FROM '{DB_DIR}/db-sozmed/parquet/{tn}.parquet' (FORMAT PARQUET);
"""
    con.execute(sql)

<_duckdb.DuckDBPyConnection at 0x7fa5bcbb70f0>

In [56]:
# Create one single-column index (i01..i26) per foreign-key-style column,
# including the two self-referencing columns that carry no REFERENCES
# clause (Message.ParentMessageId, TagClass.SubclassOfTagClassId).
#
# Some identifiers differ in case from the CREATE TABLE statements
# (`person`, `Person1Id`/`Person2Id` vs. `Person`, `Person1id`/`Person2id`);
# the recorded output shows the cell ran, so identifier matching is
# evidently case-insensitive here.
# NOTE(review): DuckDB reportedly builds indexes implicitly for PRIMARY KEY /
# FOREIGN KEY / UNIQUE constraints, so several of these may be redundant --
# confirm before relying on them for performance.
# The statements are plain CREATE INDEX, so re-running the cell on a
# database that already has these indexes is expected to fail.
sql = """
-- create indexes on foreign keys
CREATE INDEX i01 ON Forum (ModeratorPersonId);
CREATE INDEX i02 ON Forum_hasMember_Person (ForumId);
CREATE INDEX i03 ON Forum_hasMember_Person (PersonId);
CREATE INDEX i04 ON Forum_hasTag_Tag (ForumId);
CREATE INDEX i05 ON Forum_hasTag_Tag (TagId);
CREATE INDEX i06 ON Person_knows_Person (Person1Id);
CREATE INDEX i07 ON Person_knows_Person (Person2Id);
CREATE INDEX i08 ON Person_likes_Message (PersonId);
CREATE INDEX i09 ON Person_likes_Message (MessageId);
CREATE INDEX i10 ON University (LocationPlaceId);
CREATE INDEX i11 ON Company (LocationPlaceId);
CREATE INDEX i12 ON person (LocationCityId);
CREATE INDEX i13 ON Person_workAt_Company (PersonId);
CREATE INDEX i14 ON Person_workAt_Company (CompanyId);
CREATE INDEX i15 ON Person_hasInterest_Tag (PersonId);
CREATE INDEX i16 ON Person_hasInterest_Tag (TagId);
CREATE INDEX i17 ON Person_studyAt_University (PersonId);
CREATE INDEX i18 ON Person_studyAt_University (UniversityId);
CREATE INDEX i19 ON Message (CreatorPersonId);
CREATE INDEX i20 ON Message (LocationCountryId);
CREATE INDEX i21 ON Message (ContainerForumId);
CREATE INDEX i22 ON Message (ParentMessageId);
CREATE INDEX i23 ON Message_hasTag_Tag (MessageId);
CREATE INDEX i24 ON Message_hasTag_Tag (TagId);
CREATE INDEX i25 ON Tag (TypeTagClassId);
CREATE INDEX i26 ON TagClass (SubclassOfTagClassId);
"""
# Run all CREATE INDEX statements in one call on the shared connection.
con.execute(sql)


<_duckdb.DuckDBPyConnection at 0x7fa5bcbb70f0>