# Create a Test Dataset to Develop a Query for HackerRank's 15 Days of SQL Problem

In [17]:
import datetime
import random
import sqlite3

import pandas as pd

## Generate Random Integers for Hacker ID values

In [13]:
def gen_unique_random_ints(n, low, high):
    """Return ``n`` distinct random integers drawn from ``[low, high]``.

    Args:
        n: How many distinct integers to generate.
        low: Smallest value that may be returned (inclusive).
        high: Largest value that may be returned (inclusive).

    Returns:
        A list containing ``n`` distinct integers. The order is whatever
        order the underlying set yields, so callers should not rely on it.

    Raises:
        ValueError: If ``[low, high]`` contains fewer than ``n`` integers.
    """
    # Without this guard the loop below would spin forever, because the set
    # can never grow beyond the number of integers available in the range.
    available = max(high - low + 1, 0)
    if n > available:
        raise ValueError(
            f"cannot draw {n} unique integers from a range of {available} values"
        )

    ints = set()
    # Rejection sampling: duplicates are silently absorbed by the set.
    while len(ints) < n:
        ints.add(random.randint(low, high))
    return list(ints)

In [31]:
# Twelve distinct three-digit IDs, one per hacker in the test dataset.
hacker_ids = gen_unique_random_ints(n=12, low=100, high=999)
hacker_ids

[580, 517, 614, 231, 620, 333, 910, 911, 369, 631, 475, 830]

## Generate Random Hacker Names
The *us-names.txt* file was downloaded from https://github.com/smashew/NameDatabases/blob/master/NamesDatabases/surnames/us.txt. Thanks to Matthew Hagar for making his names database available at the GitHub repository https://github.com/smashew/NameDatabases.

In [32]:
# Read the candidate surnames, one per line. The encoding is stated explicitly
# so the result does not depend on the platform's default locale, and blank
# lines are dropped so random.choice() can never pick an empty name.
with open("us-names.txt", encoding="utf-8") as nfile:
    names = [line.strip() for line in nfile if line.strip()]

In [33]:
# Build one record per hacker ID, pairing it with a randomly chosen surname.
hackers = []
for hacker_id in hacker_ids:
    surname = random.choice(names).strip()
    hackers.append({"hacker_id": hacker_id, "name": surname})
hackers

[{'hacker_id': 580, 'name': 'Faux'},
 {'hacker_id': 517, 'name': 'Pontoriero'},
 {'hacker_id': 614, 'name': 'Ritch'},
 {'hacker_id': 231, 'name': 'Sandelin'},
 {'hacker_id': 620, 'name': 'Leadman'},
 {'hacker_id': 333, 'name': 'Withrow'},
 {'hacker_id': 910, 'name': 'Supple'},
 {'hacker_id': 911, 'name': 'Shoeman'},
 {'hacker_id': 369, 'name': 'Berwick'},
 {'hacker_id': 631, 'name': 'Dawood'},
 {'hacker_id': 475, 'name': 'Zwicker'},
 {'hacker_id': 830, 'name': 'Lounder'}]

## Build the Submissions Data

In [34]:
# Weighted pool of "submissions per hacker per day": drawing uniformly from it
# makes 0 or 1 submissions common and 3 or 4 rare.
num_submission_choices = [4] + [3] * 2 + [2] * 5 + [1] * 10 + [0] * 6

# Days of November 2021 covered by the test data.
days = list(range(1, 5))

submissions = []
for day in days:
    submission_date = datetime.date(2021, 11, day)
    for hacker in hackers:
        num_submissions = random.choice(num_submission_choices)
        # One record (with its own random score) per submission made that day.
        submissions.extend(
            {"hacker_id": hacker["hacker_id"],
             "submission_date": submission_date,
             "score": random.randint(0, 100)}
            for _ in range(num_submissions))

# Attach a distinct four-digit ID to every submission record.
submission_ids = gen_unique_random_ints(len(submissions), 1000, 9999)
for submission, submission_id in zip(submissions, submission_ids):
    submission["submission_id"] = submission_id

submissions_df = pd.DataFrame(submissions)
# Display only: the sorted view is shown but not assigned back.
submissions_df.sort_values(["hacker_id", "submission_date"])

Unnamed: 0,hacker_id,submission_date,score,submission_id
34,231,2021-11-03,99,3527
35,231,2021-11-03,88,5062
36,231,2021-11-03,17,1230
37,231,2021-11-03,39,7378
38,333,2021-11-03,68,9044
48,333,2021-11-04,53,1262
23,369,2021-11-02,1,5566
40,369,2021-11-03,84,7261
49,369,2021-11-04,92,2289
50,369,2021-11-04,61,8308


In [40]:
# Tabulate the hacker records, then promote hacker_id from a column to the index.
hackers_df = pd.DataFrame(hackers)
hackers_df = hackers_df.set_index("hacker_id")
hackers_df

Unnamed: 0_level_0,name
hacker_id,Unnamed: 1_level_1
580,Faux
517,Pontoriero
614,Ritch
231,Sandelin
620,Leadman
333,Withrow
910,Supple
911,Shoeman
369,Berwick
631,Dawood


In [38]:
# Rebuild the frame from the raw `submissions` records rather than re-indexing
# `submissions_df` in place: the self-referential form raised KeyError when the
# cell was run a second time, because "submission_id" had already been moved
# out of the columns and into the index.
submissions_df = pd.DataFrame(submissions).set_index("submission_id")
submissions_df

Unnamed: 0_level_0,hacker_id,submission_date,score
submission_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4102,580,2021-11-01,43
8198,580,2021-11-01,64
6664,517,2021-11-01,41
7178,517,2021-11-01,45
6539,517,2021-11-01,44
4876,614,2021-11-01,42
4236,614,2021-11-01,33
3218,614,2021-11-01,77
4501,614,2021-11-01,89
5910,620,2021-11-01,69


## Write Hacker and Submissions Data to SQLite

In [45]:
import sqlite3

# Open a connection to the SQLite database file that will hold the test tables.
db_path = "sql15days.sqlite3"
con = sqlite3.connect(db_path)

In [49]:
# Write both tables, replacing any copies left over from a previous run.
# The default if_exists="fail" makes this cell raise ValueError as soon as the
# database file already contains the tables, e.g. on a second Restart & Run All.
try:
    hackers_df.to_sql("Hackers", con, if_exists="replace")
    submissions_df.to_sql("Submissions", con, if_exists="replace")
    con.commit()
finally:
    # Always release the connection, even if one of the writes fails.
    con.close()