In [156]:
import sqlite3

In [157]:
# open a sqlite database
conn = sqlite3.connect('darkwebreports.db')

In [158]:
tablename = 'pings'

In [159]:
# define columns as a list of pairs with title and type
columns = [
    ('HiddenService', 'TEXT'),
    ('Language', 'short_string CHAR(2)'),
    ('HTTPStatus', 'INTEGER'),
    ('FirstSeen', 'TEXT'),
    ('LastContacted', 'TEXT'),
    ('LastUp', 'TEXT'),
    ('Title', 'TEXT'),
    ('Sheetname', 'TEXT'),
]
# TODO: store dateime as int (and/or split date/time) to put an index on the date
# TODO: consider more indexes          

In [160]:
def generateCreateTableQuery(tablename, columns):
    query = 'CREATE TABLE IF NOT EXISTS "' + tablename + '" ('
    for column in columns:
        query += '\n"' + column[0] + '" ' + column[1] + ','
    query = query[:-1] + ');'
    print(query)
    return query

In [161]:
conn.execute(
    generateCreateTableQuery(tablename, columns)
)

CREATE TABLE IF NOT EXISTS "pings" (
"HiddenService" TEXT,
"Language" short_string CHAR(2),
"HTTPStatus" INTEGER,
"FirstSeen" TEXT,
"LastContacted" TEXT,
"LastUp" TEXT,
"Title" TEXT,
"Sheetname" TEXT);


<sqlite3.Cursor at 0x7f90881c5d50>

In [164]:
# create an index on the column "Hidden Service"
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_HiddenService" ON ''' + tablename + '''("HiddenService");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_Sheetname" ON ''' + tablename + '''("Sheetname");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_Title" ON ''' + tablename + '''("Title");''')        

<sqlite3.Cursor at 0x7f90881c5180>

In [165]:
import pandas as pd
import os

# iterate through the .xlsx files in the rawreports folder and add it to a list of files
files = []
for file in os.listdir('rawreports'):
    if file.endswith('.xlsx'):
        files.append(file)


In [166]:
#only take the first files for testing purposes
files = files[:10]
len(files)

10

In [167]:
sheets = ["New Today", "Down", "Up"]

In [168]:
def import_sheet(sheetname, file):
    # read the excel file into a pandas dataframe
    df = pd.read_excel('rawreports/' + file,sheetname)
    
    df.columns = df.columns.map(lambda x: x.replace(' ', ''))
    # if the column LastSeen does not exist create it with the value of the column FirstSeen
    if 'LastSeen' not in df.columns:
        df['LastSeen'] = df['FirstSeen']
        df = df.drop(columns=['FirstSeen'])
    # add a column for the sheetname
    df['Sheetname'] = sheetname
    # convert the dataframe to a sqlite table
    df.to_sql(tablename, conn, if_exists='append', index=False)
    return df

In [169]:
# define df as empty df to store the result for debugging purposes in the notebook
df = pd.DataFrame()

for file in files:
    for sheetname in sheets:
        print(file, sheetname)
        df = import_sheet(sheetname, file)
        print(len(df))
#print(df)

HiddenServices-2018-2-14.xlsx New Today
3
HiddenServices-2018-2-14.xlsx Down
11735
HiddenServices-2018-2-14.xlsx Up
4838
HiddenServices-2020-8-12.xlsx New Today
0
HiddenServices-2020-8-12.xlsx Down
1486
HiddenServices-2020-8-12.xlsx Up
3250
HiddenServices-2019-11-3.xlsx New Today
1
HiddenServices-2019-11-3.xlsx Down
1165
HiddenServices-2019-11-3.xlsx Up
3723
HiddenServices-2022-1-29.xlsx New Today
0
HiddenServices-2022-1-29.xlsx Down
2038
HiddenServices-2022-1-29.xlsx Up
2628
HiddenServices-2020-2-2.xlsx New Today
0
HiddenServices-2020-2-2.xlsx Down
1239
HiddenServices-2020-2-2.xlsx Up
3658
HiddenServices-2019-3-14.xlsx New Today
2
HiddenServices-2019-3-14.xlsx Down
774
HiddenServices-2019-3-14.xlsx Up
5474
HiddenServices-2020-1-22.xlsx New Today
5
HiddenServices-2020-1-22.xlsx Down
1168
HiddenServices-2020-1-22.xlsx Up
3766
HiddenServices-2023-3-17.xlsx New Today
0
HiddenServices-2023-3-17.xlsx Down
2171
HiddenServices-2023-3-17.xlsx Up
604
HiddenServices-2021-2-11.xlsx New Today
1
Hi

In [137]:
#df

In [151]:
conn.execute('VACUUM')
conn.commit()
conn.close()