In [17]:
%run database.py

In [18]:
conn = create_connection()

Connection to SQLite DB successful


In [19]:
tablename = 'pings'

In [20]:
# define columns as a list of pairs with title and type
columns = [
    ('HiddenService', 'TEXT'), # Indexed
    ('Language', 'short_string CHAR(2)'), # Indexed 
    ('HTTPStatus', 'INTEGER'), # Indexed
    ('LastContactedDate', 'TEXT'), # Indexed
    ('LastContactedTime', 'TEXT'),
    ('LastUpDate', 'TEXT'),  # Indexed
    ('Title', 'TEXT'), # Indexed
    ('Sheetname', 'TEXT'), # Indexed
]

In [21]:
conn.execute(
    generateCreateTableQuery(tablename, columns)
)

CREATE TABLE IF NOT EXISTS "pings" (
"HiddenService" TEXT,
"Language" short_string CHAR(2),
"HTTPStatus" INTEGER,
"LastContactedDate" TEXT,
"LastContactedTime" TEXT,
"LastUpDate" TEXT,
"Title" TEXT,
"Sheetname" TEXT);


<sqlite3.Cursor at 0x7fa4b451d500>

In [22]:
# create an index on the column "Hidden Service"
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_HiddenService" ON ''' + tablename + '''("HiddenService");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_Language" ON ''' + tablename + '''("Language");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_HTTPStatus" ON ''' + tablename + '''("HTTPStatus");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_Sheetname" ON ''' + tablename + '''("Sheetname");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_Title" ON ''' + tablename + '''("Title");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_LastContactedDate" ON ''' + tablename + '''("LastContactedDate");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + tablename + '''_LastUpDate" ON ''' + tablename + '''("LastUpDate");''')        

<sqlite3.Cursor at 0x7fa4643a6500>

In [23]:
import pandas as pd
import os

# iterate through the .xlsx files in the rawreports folder and add it to a list of files
files = []
for file in os.listdir('rawreports'):
    if file.endswith('.xlsx'):
        files.append(file)


In [29]:
#only take the first files for testing purposes
#files = files[:3]
len(files)

3

In [30]:
sheets = ["New Today", "Down", "Up"]

In [31]:
import hashlib

def hash_url(url: str) -> str:
    return hashlib.sha256(url.encode()).hexdigest()

# url = "http://your_hidden_service_url_with_up_to_300_characters..."
# hashed_url = hash_url(url)
# print(hashed_url)

In [32]:
def import_sheet(sheetname, file):
    # read the excel file into a pandas dataframe
    df = pd.read_excel('rawreports/' + file,sheetname)
    
    df.columns = df.columns.map(lambda x: x.replace(' ', ''))
    
    # replace the column HIddenService with a hashed version of it
    df['HiddenService'] = df['HiddenService'].map(lambda x: hash_url(x))
    
    # if the column LastSeen does not exist create it with the value of the column FirstSeen
    if len(df) > 0: 
        if 'LastContacted' not in df.columns:
            df['LastContacted'] = df['FirstSeen']
            df = df.drop(columns=['FirstSeen'])
        # split LastContacted into date and time
        df['LastContactedDate'] = df['LastContacted'].str.split(' ', n=1, expand=True)[0] 
        df['LastContactedTime'] = df['LastContacted'].str.split(' ', n=1, expand=True)[1]
        df = df.drop(columns=['LastContacted'])

        if 'LastUp' in df.columns:
            # ignoring time from LastUp, as we probably have it in LastContacted
            df['LastUpDate'] = df['LastUp'].str.split(' ', n=1, expand=True)[0]
            df = df.drop(columns=['LastUp'])
            # replace 'Never' with Null in LastUpDate
            df['LastUpDate'] = df['LastUpDate'].replace('Never', None)

        # add a column for the sheetname
        df['Sheetname'] = sheetname
        # convert the dataframe to a sqlite table
        df.to_sql(tablename, conn, if_exists='append', index=False)
    return df

In [36]:
# define df as empty df to store the result for debugging purposes in the notebook
df = pd.DataFrame()


for file in files:
    # output the file index and how many files are remaining after each file
    print('File {} of {}'.format(files.index(file)+1, len(files)))
    for sheetname in sheets:
        print(file, sheetname)
        df = import_sheet(sheetname, file)
        #format length with thousand seperator
        print('{:,} records imported'.format(len(df)))
#print(df)

File 1 of 3
HiddenServices-2018-2-14.xlsx New Today
3 records imported
HiddenServices-2018-2-14.xlsx Down
11,735 records imported
HiddenServices-2018-2-14.xlsx Up
4,838 records imported
File 2 of 3
HiddenServices-2020-8-12.xlsx New Today
0 records imported
HiddenServices-2020-8-12.xlsx Down
1,486 records imported
HiddenServices-2020-8-12.xlsx Up
3,250 records imported
File 3 of 3
HiddenServices-2019-11-3.xlsx New Today
1 records imported
HiddenServices-2019-11-3.xlsx Down
1,165 records imported
HiddenServices-2019-11-3.xlsx Up
3,723 records imported


In [25]:
#df

In [26]:
close_connection(conn)

The SQLite connection is closed
