In [4]:
%run database.py

In [5]:
conn = create_connection()

Connection to SQLite DB successful


In [6]:
pings_tablename = 'pings'


In [7]:
# define columns as a list of pairs with title and type
pings_columns = [
    ('HiddenService', 'TEXT'), # Indexed
    ('Language', 'short_string CHAR(2)'), # Indexed 
    ('HTTPStatus', 'INTEGER'), # Indexed
    ('LastContactedDate', 'TEXT'), # Indexed
    ('LastContactedTime', 'TEXT'),
    ('LastUpDate', 'TEXT'),  # Indexed
    ('Title', 'TEXT'), # Indexed
    ('Sheetname', 'TEXT'), # Indexed
]

In [8]:
files_tablename = 'files'
files_columns = [
    ('Filename', 'TEXT PRIMARY KEY'), 
]    

In [9]:
conn.execute(generateCreateTableQuery(pings_tablename, pings_columns))
conn.execute(generateCreateTableQuery(files_tablename, files_columns))

CREATE TABLE IF NOT EXISTS "pings" (
"HiddenService" TEXT,
"Language" short_string CHAR(2),
"HTTPStatus" INTEGER,
"LastContactedDate" TEXT,
"LastContactedTime" TEXT,
"LastUpDate" TEXT,
"Title" TEXT,
"Sheetname" TEXT);
CREATE TABLE IF NOT EXISTS "files" (
"Filename" TEXT PRIMARY KEY);


<sqlite3.Cursor at 0x7f516c1ed8f0>

In [10]:
# create an index on the column "Hidden Service"
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_HiddenService" ON ''' + pings_tablename + '''("HiddenService");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_Language" ON ''' + pings_tablename + '''("Language");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_HTTPStatus" ON ''' + pings_tablename + '''("HTTPStatus");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_Sheetname" ON ''' + pings_tablename + '''("Sheetname");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_Title" ON ''' + pings_tablename + '''("Title");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_LastContactedDate" ON ''' + pings_tablename + '''("LastContactedDate");''')        
conn.execute('''CREATE INDEX IF NOT EXISTS "''' + pings_tablename + '''_LastUpDate" ON ''' + pings_tablename + '''("LastUpDate");''')        

<sqlite3.Cursor at 0x7f516c1eddc0>

In [11]:
import pandas as pd
import os

# iterate through the .xlsx files in the rawreports folder and add it to a list of files
files = []
for file in os.listdir('rawreports'):
    if file.endswith('.xlsx'):
        files.append(file)


In [12]:
#only take the first files for testing purposes
#files = files[:3]
len(files)

3

In [16]:
sheets = ["New Today", "Down", "Up"]

In [17]:
import hashlib

def hash_url(url: str) -> str:
    return hashlib.sha256(url.encode()).hexdigest()

# url = "http://your_hidden_service_url_with_up_to_300_characters..."
# hashed_url = hash_url(url)
# print(hashed_url)

In [25]:
def import_sheet(sheetname, file):
    # read the excel file into a pandas dataframe
    df = pd.read_excel('rawreports/' + file,sheetname)
    
    df.columns = df.columns.map(lambda x: x.replace(' ', ''))
    
    # replace the column HIddenService with a hashed version of it
    df['HiddenService'] = df['HiddenService'].map(lambda x: hash_url(x))
    
    # if the column LastSeen does not exist create it with the value of the column FirstSeen
    if len(df) > 0: 
        if 'LastContacted' not in df.columns:
            df['LastContacted'] = df['FirstSeen']
            df = df.drop(columns=['FirstSeen'])
        # split LastContacted into date and time
        df['LastContactedDate'] = df['LastContacted'].str.split(' ', n=1, expand=True)[0] 
        df['LastContactedTime'] = df['LastContacted'].str.split(' ', n=1, expand=True)[1]
        df = df.drop(columns=['LastContacted'])

        if 'LastUp' in df.columns:
            # ignoring time from LastUp, as we probably have it in LastContacted
            df['LastUpDate'] = df['LastUp'].str.split(' ', n=1, expand=True)[0]
            df = df.drop(columns=['LastUp'])
            # replace 'Never' with Null in LastUpDate
            df['LastUpDate'] = df['LastUpDate'].replace('Never', None)

        # add a column for the sheetname
        df['Sheetname'] = sheetname
        # convert the dataframe to a sqlite table
        df.to_sql(pings_tablename, conn, if_exists='append', index=False)
    return df

In [26]:
# define df as empty df to store the result for debugging purposes in the notebook
df = pd.DataFrame()


for file in files:
    # output the file index and how many files are remaining after each file
    print('File {} of {}'.format(files.index(file)+1, len(files)))
    # check if the file is already in the database
    if conn.execute('SELECT COUNT(*) FROM ' + files_tablename + ' WHERE Filename = ?', (file,)).fetchone()[0] > 0:
        print('File ' + file + ' already in database')
    else:
        # add the filename to the files table
        conn.execute('INSERT INTO ' + files_tablename + ' (Filename) VALUES (?)', (file,))
        for sheetname in sheets:
            print(file, sheetname)
            df = import_sheet(sheetname, file)
            if df is not None:
                #format length with thousand seperator
                print('{:,} records imported'.format(len(df)))
#print(df)

File 1 of 3
File HiddenServices-2018-2-14.xlsx already in database
File 2 of 3
File HiddenServices-2020-8-12.xlsx already in database
File 3 of 3
File HiddenServices-2019-11-3.xlsx already in database


In [25]:
#df

In [26]:
close_connection(conn)

The SQLite connection is closed
