In [None]:
import duckdb
# Connect to the on-disk DuckDB database in read-write mode (read_only=False).
# The cells below that call `conn.execute(...)` all go through this connection.
conn = duckdb.connect(r"E:\app_data\sec_apps_data\test.duckdb", read_only=False)

# Alternative database file, kept for reference:
# conn = duckdb.connect(r"E:\app_data\sec_apps_data\13f_filings_duckdb.duckdb", read_only=False)


In [None]:
# NOTE(review): this closes the connection opened in the first cell, yet the
# cells below still call `conn.execute(...)`. Run this cell last (or re-run the
# connect cell afterwards); a top-to-bottom run will presumably fail at the
# next query otherwise.
conn.close()

In [None]:
# Run a constant SELECT and keep the fetched rows in `results`.
results = conn.execute("SELECT 41, 23, 23").fetchall()

In [None]:
# Same query, but materialised via .df(); this overwrites `results` from the
# previous cell.
results = conn.execute("SELECT 41, 23, 23").df()

In [None]:
# Create test_table with an INTEGER column `i` and a STRING column `J`.
# IF NOT EXISTS lets the cell be re-run without an "already exists" error.
conn.execute("CREATE TABLE IF NOT EXISTS test_table (i INTEGER, J STRING)")

In [None]:
# Add two rows to test_table with a plain SQL INSERT.
# Note: the INSERT is unconditional, so re-running this cell adds the same rows again.

conn.execute("INSERT INTO test_table VALUES (1, 'one'), (9, 'nine')")

In [None]:
# Select columns i and j from test_table and fetch the result as a pandas DataFrame.
conn.execute("""SELECT i, j
             FROM test_table
                          
             """).fetchdf()

In [None]:
# Same query, fetched via fetchnumpy() to get NumPy data instead of a DataFrame.
conn.execute("""SELECT i, j
             FROM test_table
             """).fetchnumpy()

In [None]:
# Same query, fetched via fetchall() as plain Python rows.
conn.execute("""SELECT i, j
             FROM test_table
             """).fetchall()

In [None]:
# Fetch only a single row of test_table via fetchone().
conn.execute("""SELECT *
             FROM test_table
             """).fetchone()

In [None]:
# Parameterised inserts with `?` placeholders.
# executemany() inserts several rows in one call, but (per the author's reading
# of the official documentation) it is discouraged for bulk loads; the pandas /
# NumPy based routes are faster.

# Insert one row with placeholders.
conn.execute("""
    INSERT INTO test_table VALUES (?,?) """, [2, 'two'])

# Insert multiple rows with placeholders: one inner list per row.
conn.executemany("""
    INSERT INTO test_table VALUES (?,?)""", [[3, 'three'],[4, 'four']])

# Show the table after the inserts.
conn.execute("SELECT * from test_table ").df()


In [None]:
# Create test_table from a CSV file via read_csv_auto.
# Note: test_table is already created by an earlier cell, so with IF NOT EXISTS
# this statement does nothing unless the table is dropped first (see the
# commented DROP below).
# conn.execute("DROP TABLE test_table")
# conn.execute("CREATE TABLE IF NOT EXISTS test_table (accessionNumber VARCHAR PRIMARY KEY, cikManager VARCHAR, managerName VARCHAR,submissionType VARCHAR,filedAsOfDate DATE,periodOfReport DATE,report_Quarter VARCHAR,report_Year VARCHAR,xml_flag VARCHAR,created_at DATETIME)")
# The Windows path sits in a raw string so its backslashes are not read as
# (invalid) Python escape sequences; the SQL text sent to DuckDB is unchanged.
conn.execute(
    "CREATE TABLE IF NOT EXISTS test_table AS SELECT * FROM "
    r"read_csv_auto('E:\app_data\sec_apps_data\data.csv', delim=',', header=True)"
)

In [None]:
# Show the (rows, columns) shape of test_table. The stray trailing comma that
# wrapped the shape in a one-element tuple has been removed.
conn.execute("SELECT * from test_table ").df().shape
# conn.execute("SELECT * from test_table ").df().tail(6)

## Let's create a pandas df from scratch and play with SQL

In [None]:
import pandas as pd

# Build a small demo frame by hand.
test_df = pd.DataFrame(
    {
        "i": [1, 2, 3, 4],
        "j": ["one", "two", "three", "four"],
    }
)
# Expose the frame to DuckDB as a view named "test_df", then query it with SQL.
conn.register("test_df", test_df)
conn.execute("""
SELECT j FROM test_df WHERE i > 1
""").fetchdf()

In [None]:
import pandas as pd
# Load the bank data CSV into a pandas DataFrame.
df = pd.read_csv(r"E:\app_data\sec_apps_data\bank_data.csv")
# A later cell registers this frame with DuckDB as the view "bank_df".


In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Register `df` with DuckDB under the name "bank_df" and filter it with SQL.
conn.register("bank_df", df)
conn.execute("""
SELECT * FROM bank_df WHERE age > 27
""").fetchdf()

In [None]:
# Summary statistics for every column, not only the numeric ones (include='all').
df.describe(include='all')

In [None]:
# Build a DuckDB relation on top of the pandas frame, then chain relational
# operators on it (project plays the role of SELECT).
rel = conn.from_df(df)
(
    rel.filter('age > 27')
    .project('age + 1, sex')
    .order('sex')
    .limit(2)
)

In [None]:
# Another way to create a relation from a pandas DataFrame: call duckdb.df()
# on the module itself instead of going through `conn`. This rebinds `rel`.
rel = duckdb.df(df)

In [None]:
# Display the relation.
rel

In [None]:
# Create a relation from the existing table test_table and display it.
rel = conn.table('test_table')
rel

In [None]:
# Load a CSV file directly into a relation and display it.
# Note: this goes through the duckdb module, not through `conn`.
rel = duckdb.from_csv_auto(r"E:\app_data\sec_apps_data\bank_data.csv")
rel

In [None]:
# Show the relation's current alias.
rel.alias

In [None]:
# set_alias() returns a relation carrying the new alias; keep it as `rel2`
# and confirm the alias.
rel2 = rel.set_alias('bank_data')
rel2.alias

In [None]:
# Show the relation's `type` attribute.
rel2.type

In [None]:
# Show the relation's column names.
rel2.columns

In [None]:
# Show the relation's `types` (presumably one entry per column; confirm).
rel2.types

### Applying Python-like functions

In [None]:
# Keep the rows with age > 18 and sort them by sex.
rel2.filter('age > 18').order('sex')

In [None]:
# project() is the relational counterpart of SELECT in SQL: keep only id and age.
rel2.project('id, age')

In [None]:
# project() also accepts expressions, e.g. adding 2 to the age column.
rel2.project('age + 2')

In [None]:
# Sort the relation by the sex column.
rel2.order('sex')

In [None]:
# Limit the output to the first 5 rows, similar to .head() in pandas.
rel2.limit(5)

In [None]:
# Relational operators can be chained: filter, then project, sort and limit.
(
    rel2.filter('age > 19')
    .project('age + 1, sex')
    .order('sex')
    .limit(2)
)

In [None]:
# pandas equivalent: materialise the relation and compare the age column to 19,
# which yields a boolean Series.
rel2.df().age > 19

In [None]:
# The same filter expressed with DataFrame.query().
rel2.df().query('age > 19')

In [None]:
# Keep the boolean condition as `mask` for the next cell.
mask = rel2.df().age > 19

In [None]:
# Index the underlying NumPy array of ages with `mask` from the previous cell.
rel2.df().age.values[mask]

### Aggregate functions

In [None]:
# Sum actual_recovery_amount over the whole relation.
rel2.aggregate("sum(actual_recovery_amount)")

In [None]:
# If a column without aggregation is listed before an aggregated one, the plain
# column acts as the GROUP BY key (here: the sum per age).
rel2.aggregate("age, sum(actual_recovery_amount)")

In [None]:
# To get only the aggregated column in the output, pass the grouping column as
# the second argument instead of listing it in the aggregate expression.
rel2.aggregate("sum(actual_recovery_amount)", "age")

In [None]:
# distinct: drop duplicate rows from the relation.
rel2.distinct()

In [None]:
# union: here the relation is combined with itself.
rel2.union(rel2)

In [None]:
# joins: build a second relation from the pandas frame and join it to rel2 on id.
rel3 = duckdb.df(df)
# Note: this bare expression is not the last one in the cell, so Jupyter does
# not display it.
rel3
rel2.join(rel3, 'id')

In [None]:
# Join with an explicit condition: alias the two sides as a and b so the
# condition can name each side's id column.
print(
    rel2.set_alias('a').join(
        rel3.set_alias('b'),
        'a.id=b.id',
    )
)

In [None]:
# The duckdb module exposes the same operators as functions that take the
# pandas DataFrame directly, so no relation has to be created first.
print(duckdb.filter(df, 'age > 1'))
print(duckdb.project(df, 'age +1'))
print(duckdb.order(df, 'sex'))
print(duckdb.limit(df, 2))

In [None]:
# Execute the relation to obtain a query result object, and print it.
res = rel2.execute()
print(res)

In [None]:
# `res` is a query result: fetchdf(), fetchnumpy() or fetchone() can be called on it.
# NOTE(review): fetchone() is called before df(), so df() presumably returns
# only the rows that have not been fetched yet; confirm.
print(res.fetchone())
# print(res.fetchall())
print(res.df())

### Create tables

In [None]:
# Create the table test_table2 from the relation `rel`.
# NOTE(review): `rel` was last assigned via duckdb.from_csv_auto (module-level,
# not `conn`), so the table presumably is not created in the database behind
# `conn`; confirm.
rel.create('test_table2')

In [None]:
# Insert rows into test_table3 through the relational API.
# IF NOT EXISTS keeps the cell re-runnable against the persistent database
# file (a plain CREATE TABLE fails on the second run), matching how test_table
# is created earlier. Re-running still appends the two demo rows again.
conn.execute("CREATE TABLE IF NOT EXISTS test_table3 (i INTEGER, j STRING)")
# Insert one row from a values relation...
print(conn.values([5, 'five']).insert_into("test_table3"))
# ...and another through a relation bound to the table itself.
rel_3 = conn.table("test_table3")
rel_3.insert([6,'six'])
rel_3

In [None]:
# Create test_table4 from the pandas DataFrame `df` (referenced by variable
# name in the FROM clause), then read it back. The CREATE had been commented
# out, which made the SELECT fail on a fresh database; IF NOT EXISTS lets it
# stay in place and be re-run.
conn.execute("CREATE TABLE IF NOT EXISTS test_table4 AS SELECT * FROM df")
conn.execute(" SELECT * FROM test_table4").df()

In [None]:
# Clean up test_table5. No cell in this notebook creates it, so use IF EXISTS
# to avoid an error when the table is absent.
conn.execute("DROP TABLE IF EXISTS test_table5")

In [None]:
# Insert rows into test_table7 through the relational API.
# IF NOT EXISTS keeps the cell re-runnable against the persistent database
# file (a plain CREATE TABLE fails on the second run). Re-running still appends
# the two demo rows again.
conn.execute("CREATE TABLE IF NOT EXISTS test_table7 (i INTEGER, j STRING)")
# Insert one row from a values relation...
print(conn.values([5, 'five']).insert_into("test_table7"))
# ...and another through a relation bound to the table itself.
rel_3 = conn.table("test_table7")
rel_3.insert([6,'six'])
rel_3

In [None]:
# We can write queries first and fetch their results later.

In [None]:
# Run SQL against the relation: the first argument is the name under which
# `rel` is referenced inside the query text.
res = rel.query('my_name_for_rel', 'SELECT * FROM my_name_for_rel LIMIT 5')

In [None]:
# Fetch the rows of the query prepared in the previous cell.
res.fetchall()

In [None]:
# Query the pandas DataFrame `df` by its variable name through the duckdb
# module, then convert the result back to a DataFrame.
res = duckdb.query('SELECT * FROM df')
res.df()

# The code below is not part of the tutorial; it is an attempt at a way to write data to a DuckDB table without duplicates. There was no UPSERT or IGNORE functionality in DuckDB when this was written, so this workaround becomes more important.

In [None]:
# De-duplicate on the pandas side: read the table into a DataFrame, drop fully
# duplicated rows (keeping the last occurrence) and show the final six rows.
# This could be an option to use in the app.
(
    conn.execute("SELECT * from test_table ")
    .df()
    .drop_duplicates(keep="last")
    .tail(6)
)


In [None]:
# Insert data into test_table straight from a CSV file via read_csv_auto.
# A raw string keeps the Windows path's backslashes from being read as
# (invalid) Python escape sequences; the SQL text sent to DuckDB is unchanged.
conn.execute(
    r"INSERT INTO test_table SELECT * FROM read_csv_auto('E:\app_data\sec_apps_data\data2.csv', delim=',', header=True)"
)

In [None]:
# Load data2.csv into a pandas DataFrame; the cells below feed it to DuckDB.
import pandas as pd
# Tiny hand-made alternative, kept for reference:
# my_df = pd.DataFrame.from_dict({'a': [42],'b': [43],'c': [44]})
my_df = pd.read_csv(r"E:\app_data\sec_apps_data\data2.csv")


In [None]:
# Preview the first rows of my_df.
my_df.head()
# type(my_df)

In [None]:
# Duplicate-free load via a staging table.
# conn.execute("DROP TABLE my_table")
# 1. Create my_table from the DataFrame (only if it does not exist yet).
conn.execute("CREATE TABLE IF NOT EXISTS my_table AS SELECT * FROM my_df")
# 2. Create the staging table as a copy of my_table (only if it does not exist yet).
conn.execute("CREATE TABLE IF NOT EXISTS staging_my_table AS SELECT * FROM my_table")
# 3. Append the incoming rows to staging.
# NOTE(review): staging is never emptied here, so its rows accumulate across runs.
conn.execute("INSERT INTO staging_my_table SELECT * FROM my_df")
# 4. Copy into my_table only those staging rows whose accessionNumber is not
#    already present there.
# NOTE(review): with NOT IN, a NULL accessionNumber in my_table would presumably
# make the filter match nothing; and a new accessionNumber that appears several
# times in staging would be inserted several times. Confirm whether either can occur.
conn.execute("""
INSERT INTO my_table 
SELECT * FROM staging_my_table
WHERE accessionNumber NOT IN (SELECT accessionNumber FROM my_table)
""")

In [None]:
# Show my_table together with DuckDB's rowid pseudo-column.
conn.execute("SELECT rowid, * FROM my_table").df()

In [None]:
# Show the contents of the staging table.
# conn.execute("SELECT rowid, * FROM my_table").df()
conn.execute("SELECT * FROM staging_my_table").df()


In [None]:
# Insert the same CSV data into the staging table again; this deliberately
# creates duplicate rows there.
# A raw string keeps the Windows path's backslashes from being read as
# (invalid) Python escape sequences; the SQL text sent to DuckDB is unchanged.
conn.execute(
    r"INSERT INTO staging_my_table SELECT * FROM read_csv_auto('E:\app_data\sec_apps_data\data2.csv', delim=',', header=True)"
)

In [None]:
# Try to add a composite primary key (accessionNumber, cikManager) to my_table
# after the fact.
# NOTE(review): support for ALTER TABLE ... ADD CONSTRAINT depends on the DuckDB
# version; confirm that the installed version accepts this statement.
conn.execute("""ALTER TABLE my_table
  ADD CONSTRAINT my_table_pk
    PRIMARY KEY (accessionNumber, cikManager)""")