#### Creates Customer Data

In [82]:
from faker import Faker
import pandas as pd
import random

fake = Faker()
num_customers = 500


def _fake_customer(cust_id):
    """Build one synthetic customer record with deliberately injected nulls.

    Each fake value is drawn exactly once and the null rule is applied to that
    same value, so a stored name never starts with 'P' and a stored email is
    never longer than 28 characters.

    Args:
        cust_id: Integer identifier to assign to the customer.

    Returns:
        dict with keys 'cust_id', 'name', 'email', 'address', 'phone'.
    """
    name = fake.name()
    email = fake.email()
    return {
        'cust_id': cust_id,
        # Dirty-data rule: names beginning with 'P' are stored as missing.
        'name': None if name.startswith('P') else name,
        # Dirty-data rule: emails longer than 28 characters are stored as missing.
        'email': None if len(email) > 28 else email,
        'address': fake.address(),
        'phone': fake.phone_number()
    }


customers = [_fake_customer(cust_id) for cust_id in range(1, num_customers + 1)]
customers_df = pd.DataFrame(customers)
customers_df

Unnamed: 0,cust_id,name,email,address,phone
0,1,Cheryl Jimenez,christinerogers@example.org,"2117 Gary Squares\nWest Joseph, PR 67562",001-379-324-2894
1,2,Emily Lewis,gshelton@example.net,"1950 Max Park\nPort Danielstad, AS 82791",543.851.0522
2,3,Jennifer Poole,kristincastro@example.net,"3202 Eric Bypass\nBethhaven, WA 97952",+1-410-717-7774
3,4,Brian Pierce,yatesrebecca@example.net,"02523 Macias Loop\nPort Josephshire, VA 85527",6238128007
4,5,Raymond Camacho,toddharris@example.net,"96412 Louis Dam Apt. 722\nDavidmouth, KY 27548",+1-686-703-6263x0142
...,...,...,...,...,...
495,496,Carla Lewis,ypatterson@example.com,"12484 Daniels Expressway\nSierraburgh, CO 19331",001-436-824-7644
496,497,Austin Reed,elizabeth63@example.net,"22414 Michael Greens\nGloverview, DC 71232",(666)979-1597x5347
497,498,Andrew Hawkins,alvarezlisa@example.org,"586 Glenn Station\nOlsonmouth, FM 78362",321.338.1053
498,499,Michelle Davidson,fgrimes@example.org,"9229 John Mount\nNorth Davidbury, VT 87332",8547599137


#### Sums of Nulls for 'email' and 'name'

In [None]:
# Number of customer rows whose email is missing.
missing_email_flags = customers_df['email'].isna()
customer_check = int(missing_email_flags.sum())
print(customer_check)

In [None]:
# Number of customer rows whose name is missing.
missing_name_flags = customers_df['name'].isna()
name_check = int(missing_name_flags.sum())
name_check

#### Creates Product Data

In [83]:
product = ['grass', 'hay', 'corn', 'soybean', 'wheat', 'cabbage', 'alfalfa', 'peonies', 'hibiscus']
product_category = ['implements', 'seed', 'decorative', 'feed', 'produce']

# One record per entry of `product`, with ids assigned 1..len(product) in list
# order. Price and category are random, so they carry no real-world meaning.
products = []
for item_id, item_name in enumerate(product, start=1):
    products.append({
        'product_id': item_id,
        'product_name': item_name,
        'price': round(random.uniform(1, 100), 2),
        'product_category': random.choice(product_category)
    })

products_df = pd.DataFrame(products)
products_df

Unnamed: 0,product_id,product_name,price,product_category
0,1,grass,56.22,seed
1,2,hay,58.57,produce
2,3,corn,55.64,implements
3,4,soybean,76.39,seed
4,5,wheat,15.69,produce
5,6,cabbage,7.07,implements
6,7,alfalfa,49.98,implements
7,8,peonies,82.4,implements
8,9,hibiscus,20.48,feed


#### Creates Order Data

In [84]:
import random

num_orders = 1000
store_id = [1, 2, 3, 4]

# The id pools do not change while orders are generated, so build them once
# instead of calling .tolist() for every order.
customer_id_pool = customers_df['cust_id'].tolist()
product_id_pool = products_df['product_id'].tolist()


def _fake_order_date():
    """Return a fake order date, or a deliberately invalid value.

    The date is drawn once; if that same date falls in 2002 it is replaced by
    one of two bad values ('not_a_date' or the integer 20002) so that later
    cleaning steps have dirty data to find.
    """
    order_date = fake.date()
    if pd.to_datetime(order_date).year == 2002:
        return random.choice(['not_a_date', 20002])
    return order_date


orders = [
    {
        'order_id': order_id,
        'cust_id': random.choice(customer_id_pool),
        'product_id': random.choice(product_id_pool),
        'order_date': _fake_order_date(),
        'store_id': random.choice(store_id),
        'quantity': random.randint(1, 3)
    }
    for order_id in range(1, num_orders + 1)
]
orders_df = pd.DataFrame(orders)
orders_df

Unnamed: 0,order_id,cust_id,product_id,order_date,store_id,quantity
0,1,298,3,1972-11-26,2,2
1,2,207,6,1970-10-25,1,3
2,3,430,2,1970-10-15,2,2
3,4,203,3,2019-12-10,4,2
4,5,353,2,2004-11-02,4,2
...,...,...,...,...,...,...
995,996,487,7,1979-09-18,4,1
996,997,411,6,2021-09-04,1,3
997,998,267,3,2021-09-06,2,3
998,999,340,2,1993-04-02,3,3


#### Gives Count of Each Bad Value in 'order_date'

In [None]:
# How many orders carry the integer 20002 in place of a date (0 if none).
order_date_tally = orders_df['order_date'].value_counts()
count_bad_i = order_date_tally.get(20002, 0)
print(count_bad_i)

In [None]:
# How many orders carry the string 'not_a_date' in place of a date (0 if none).
order_date_tally = orders_df['order_date'].value_counts()
count_bad_s = order_date_tally.get('not_a_date', 0)
print(count_bad_s)

### Checks that the customer ids and product ids in orders exist in the customers and products tables

In [None]:
# Every order must reference a customer id present in customers_df ...
order_has_known_customer = orders_df['cust_id'].isin(customers_df['cust_id'])
assert order_has_known_customer.all()

# ... and a product id present in products_df.
order_has_known_product = orders_df['product_id'].isin(products_df['product_id'])
assert order_has_known_product.all()

In [None]:
# orders_df
# customers_df
# products_df

### Maps tables to DB

In [85]:
import os

from sqlalchemy import create_engine, text

# Connection URL comes from DATABASE_URL when it is set, so real credentials
# never need to be written into the notebook. The fallback is the original
# hard-coded local URL.
engine = create_engine(
    os.environ.get('DATABASE_URL', 'postgresql://admin:admin@localhost:5433/postgres')
)

# `if not exists` keeps this cell re-runnable: a plain `create table` raises
# once the table is already present in the database.
# NOTE(review): the load cell writes with if_exists='replace', which drops and
# recreates the table, so this schema only holds until that cell runs.
orders_table = """create table if not exists orders (
	order_id int generated always as identity primary key,
	cust_id int,
	order_date date,
	product_id int,
	store_id int,
	quantity int
);"""

# engine.begin() commits when the block succeeds and rolls back on error.
with engine.begin() as conn:
    conn.execute(text(orders_table))

In [86]:
# `if not exists` keeps this cell re-runnable: a plain `create table` raises
# once the table is already present in the database.
# The category column is named product_category to match products_df.
# NOTE(review): the load cell writes with if_exists='replace', which drops and
# recreates the table, so this schema only holds until that cell runs.
products_table = """create table if not exists products (
	product_id int generated always as identity primary key,
	product_name varchar,
	price float,
	product_category varchar
);"""

# engine.begin() commits when the block succeeds and rolls back on error.
with engine.begin() as conn:
    conn.execute(text(products_table))

In [87]:
# `if not exists` keeps this cell re-runnable: a plain `create table` raises
# once the table is already present in the database.
# The key column is named cust_id to match orders.cust_id and customers_df.
# NOTE(review): the load cell writes with if_exists='replace', which drops and
# recreates the table, so this schema only holds until that cell runs.
customers_table = """create table if not exists customers (
	cust_id int generated always as identity primary key,
	name varchar,
	email varchar,
	address varchar,
	phone varchar
);"""

# engine.begin() commits when the block succeeds and rolls back on error.
with engine.begin() as conn:
    conn.execute(text(customers_table))

### Loads data to DB

In [88]:
# Write the customers frame to the `customers` table, replacing any existing
# table of that name; the DataFrame index is not written.
customers_df.to_sql(name='customers', con=engine, if_exists='replace', index=False)

500

In [89]:
# Write the orders frame to the `orders` table, replacing any existing table
# of that name; the DataFrame index is not written.
orders_df.to_sql(name='orders', con=engine, if_exists='replace', index=False)

1000

In [90]:
# Write the products frame to the `products` table, replacing any existing
# table of that name; the DataFrame index is not written.
products_df.to_sql(name='products', con=engine, if_exists='replace', index=False)

9

# SOME CLEANING FOR LATER (not for production)

#### Checks for bad data in orders

In [None]:
# Parse into a separate Series first so the raw values of the rows that fail
# to parse can still be inspected; entries that cannot be parsed become NaT.
parsed_order_dates = pd.to_datetime(orders_df['order_date'], errors='coerce')
bd_mask = parsed_order_dates.isna()

# Raw (pre-coercion) values of the unparseable rows, selected with .loc
# rather than chained indexing.
bad_dates = orders_df.loc[bd_mask, 'order_date']

# value_counts has to be called to get a tally; dropna=False keeps missing
# values in the result instead of silently dropping them.
print(bad_dates.value_counts(dropna=False))

# Only now overwrite the column with the parsed datetimes.
orders_df['order_date'] = parsed_order_dates

In [None]:
# Capture and print the dtype of every column in orders_df.
check_o_types = orders_df.dtypes
print(check_o_types)

#### Checks for bad data in customers

In [None]:
# Rows whose customer name is missing.
customer_mask = customers_df['name'].isna()
bad_names = customers_df.loc[customer_mask, 'name']

# value_counts has to be called to get a tally; dropna=False is required
# because every selected value is missing and would otherwise be dropped.
print(bad_names.value_counts(dropna=False))

In [None]:
# Capture and print the dtype of every column in customers_df.
check_c_types = customers_df.dtypes
print(check_c_types)

In [None]:
# Rows whose customer email is missing.
email_mask = customers_df['email'].isna()
bad_emails = customers_df.loc[email_mask, 'email']

# value_counts has to be called to get a tally; dropna=False is required
# because every selected value is missing and would otherwise be dropped.
print(bad_emails.value_counts(dropna=False))

#### Checks for bad data in products

In [None]:
# Rows whose product name is missing.
product_mask = products_df['product_name'].isna()
bad_p_name = products_df.loc[product_mask, 'product_name']

# value_counts has to be called to get a tally; dropna=False is required
# because every selected value is missing and would otherwise be dropped.
print(bad_p_name.value_counts(dropna=False))

In [None]:
# Capture and print the dtype of every column in products_df.
check_p_types = products_df.dtypes
print(check_p_types)