# Data Generator
Generates random data of the requested size for the database schema shown below and saves it as `.csv` files (tab-delimited, with a header row).

![](./schema.png)

## Libraries

In [39]:
import pandas as pd
import random
from datetime import datetime, timedelta
import os

## Dictionaries

### Cleaning `words.txt`

In [None]:
# with open('./data/words.txt') as words:
#     for word in words.readlines():
#         word = word.strip()
#         print(word)
# words.close()

### Load Data Dictionaries

In [41]:
def load(path):
    """Read a dictionary file and return its entries as a list of strings.

    The file is split on newline characters, one entry per line. Empty
    lines are skipped, so the trailing newline that text files usually end
    with does not produce a blank entry that ``random.choice`` could pick.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: The non-empty lines of the file, in file order.
    """
    # The ``with`` block closes the file on exit; no explicit close() needed.
    with open(path) as file:
        return [entry for entry in file.read().split('\n') if entry]

In [42]:
# Word lists sampled by the generators below, loaded in this order:
# first names, surnames, then generic words.
names, surnames, words = map(
    load,
    ('./data/names.txt', './data/surnames.txt', './data/words.txt'),
)

## Generate

### Helpers

In [44]:
def generate_password():
    """Return a random 16-character string of ASCII letters and digits.

    Characters are drawn independently with ``random.choice``, so the result
    is reproducible under ``random.seed``.
    """
    alphabet = list('abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789')
    picked = []
    for _ in range(16):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

In [45]:
def generate_users(N):
    """Build a DataFrame of ``N`` randomly generated users.

    Args:
        N: Number of user rows to create.

    Returns:
        pandas.DataFrame: Columns ``user_id`` (1..N in order), ``name`` and
        ``surname`` (random picks from the module-level ``names`` and
        ``surnames`` lists) and ``identity_card`` (a value from
        ``generate_password``).
    """
    # Each tuple is evaluated left to right, so the random draws happen in
    # name -> surname -> identity_card order for every row.
    rows = [
        (uid, random.choice(names), random.choice(surnames), generate_password())
        for uid in range(1, N + 1)
    ]

    columns = ('user_id', 'name', 'surname', 'identity_card')
    data = {
        column: [row[position] for row in rows]
        for position, column in enumerate(columns)
    }
    return pd.DataFrame(data)

In [46]:
# Number of seconds in a 365-day year (31,536,000).
SECONDS_IN_YEAR = int(timedelta(days=365).total_seconds())

In [47]:
def generate_date():
    """Return a random timestamp within 2022 as a string.

    A whole number of seconds, chosen uniformly from
    ``[0, SECONDS_IN_YEAR)``, is added to 2022-01-01 00:00:00.

    Returns:
        str: The timestamp formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    start = datetime(2022, 1, 1, 0, 0, 0)
    # randrange excludes its upper bound, so the latest possible value is
    # 2022-12-31 23:59:59. randint(0, SECONDS_IN_YEAR) includes the bound and
    # could return 2023-01-01 00:00:00.
    offset = timedelta(seconds=random.randrange(SECONDS_IN_YEAR))
    return str(start + offset)

In [53]:
# Target SQL shape for reference:
# DATETIME - YYYY-MM-DD HH:MM:SS
# INSERT INTO comment VALUES
# (1, 'today hello', '2021-12-01 12:00:00', NULL);

def generate_comments(N, num_users=None):
    """Build a DataFrame of ``N`` randomly generated comments.

    Args:
        N: Number of comment rows to create.
        num_users: Upper bound (inclusive) for the random ``user_id``
            assigned to each comment. Defaults to ``N``, i.e. it assumes
            there are as many users as comments. Pass the real user count
            when the two differ, so every ``user_id`` refers to a generated
            user.

    Returns:
        pandas.DataFrame: Columns ``comment_id`` (1..N in order), ``text``
        (eight random entries from the module-level ``words`` list joined by
        spaces), ``created_at`` (a value from ``generate_date``) and
        ``user_id`` (a random integer in ``1..num_users``).

    Raises:
        ValueError: Propagated from ``random.randint`` when ``N > 0`` and
            ``num_users`` is less than 1.
    """
    if num_users is None:
        num_users = N

    comment_id = []
    text = []
    created_at = []
    user_id = []

    for i in range(1, N + 1):
        comment_id.append(i)
        text.append(' '.join(random.choice(words) for _ in range(8)))
        created_at.append(generate_date())
        user_id.append(random.randint(1, num_users))

    data = {
        'comment_id': comment_id,
        'text': text,
        'created_at': created_at,
        'user_id': user_id,
    }
    return pd.DataFrame(data)

In [49]:
def save(df, path):
    """Write ``df`` to ``path`` as tab-separated text with a header row.

    Args:
        df: The pandas DataFrame to write.
        path: Destination file path.
    """
    # The row index is left out; only the named columns are written.
    export_options = {'sep': '\t', 'index': False, 'header': True}
    df.to_csv(path, **export_options)

In [62]:
# Two values are read from standard input, in this order:
# the number of rows to generate, then the output sub-folder name.
N, OUTPUT_FOLDER = int(input()), input()

# Both tables get N rows; users are generated before comments.
users, comments = generate_users(N), generate_comments(N)

# Saving
BASE_PATH = f'./output/{OUTPUT_FOLDER}'
# Create the target folder if it does not exist yet.
os.makedirs(BASE_PATH, exist_ok=True)
save(users, f'{BASE_PATH}/users.csv')
save(comments, f'{BASE_PATH}/comments.csv')