Let's understand the dataset layout, to help us create a good database design.

In [74]:
#import libraries
import pandas as pd
import chardet
import os  

In [75]:
#Path to CSV save location
path = r"C:\Users\Killian\Projects\SQL\Superstore\tables"
os.makedirs(path, exist_ok=True)   

In [76]:
# look at the first 100k bytes to guess the character encoding
with open("Sample-Superstore.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))

# check what the character encoding might be
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [77]:
#Import the CSV. Encoding as per previous step.
dfs = pd.read_csv('Sample-Superstore.csv', sep=',', encoding='windows-1252')

In [78]:
# Look at columns, size, nulls, datatypes

dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [79]:
#First look at contents
dfs.head(5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,11/8/2016,11/11/2016,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,10/11/2015,10/18/2015,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [80]:
#We need to fix order date and ship date to be YYYY/MM/DD (can be ordered easily even when seen as a string). 
dfs[['Order Date','Ship Date']] = dfs[['Order Date','Ship Date']].apply(pd.to_datetime)

In [81]:
dfs.head(5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,42420,South,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,90036,West,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,33311,South,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164


In [82]:
# Two new columns required for more in depth analysis. 
# Supplier price (what VS Group buy it for) and Catalogue price (what VS Group offer it for)

dfs["Catalogue price"] = (dfs.Sales/dfs.Quantity)/(1-dfs.Discount)

dfs["Supplier price"] = (dfs.Sales-dfs.Profit)/dfs.Quantity

dfs.head(5)

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit,Catalogue price,Supplier price
0,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,FUR-BO-10001798,Furniture,Bookcases,Bush Somerset Collection Bookcase,261.96,2,0.0,41.9136,130.98,110.0232
1,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,Consumer,United States,Henderson,...,FUR-CH-10000454,Furniture,Chairs,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",731.94,3,0.0,219.582,243.98,170.786
2,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,Corporate,United States,Los Angeles,...,OFF-LA-10000240,Office Supplies,Labels,Self-Adhesive Address Labels for Typewriters b...,14.62,2,0.0,6.8714,7.31,3.8743
3,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,FUR-TA-10000577,Furniture,Tables,Bretford CR4500 Series Slim Rectangular Table,957.5775,5,0.45,-383.031,348.21,268.1217
4,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,Consumer,United States,Fort Lauderdale,...,OFF-ST-10000760,Office Supplies,Storage,Eldon Fold 'N Roll Cart System,22.368,2,0.2,2.5164,13.98,9.9258


In [83]:
#Sales, Profit and Supplier Price are funky as heck. They need to be rounded to the nearest cent. 
#All prices will also need to be in CENTS to avoid floating point errors in SQLite.

cols = ["Sales", "Profit", "Supplier price", "Catalogue price"]

dfs.loc[:, cols] = (dfs[cols]*100).astype(int)

Now let's attempt to break up the dataframe into component tables that will later be relational tables in our RDBMS.
This may take some trial and error offline with ERD and sketches.
We will ascribe PK, FK, datatypes and any other constraints later.

In [84]:
#Reminder of contents..
dfs.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit',
       'Catalogue price', 'Supplier price'],
      dtype='object')

In [85]:
#Just trying some breakdowns to get a feel for the dataset.
#We can see that "Profit" is really just based on the catalogue price, we can still make money on "negative" sales.

#dfs.groupby(['State'])[["Sales", "Profit", "Catalogue price", "Supplier price"]].sum().sort_values(by=['Sales'], ascending=False)

In [86]:
# First pass

Customers = dfs[["Customer ID", "Customer Name", "Segment"]]
Address = dfs[["Customer ID", "Region", "State", "City", "Postal Code"]] # Address ID will be required to connect this to orders. Removed country as not required
Products = dfs[["Product ID", "Product Name", "Catalogue price", "Supplier price"]] #Needs Cat ID!
Category = dfs[["Category", "Sub-Category"]] #Category ID will be needed and linked back to Products
Orders = dfs[["Order ID", "Customer ID", "Order Date", "Ship Date", "Ship Mode"]] #Needs Address ID!
OrderDetails = dfs[["Order ID", "Product ID", "Quantity", "Sales", "Discount", "Profit"]] #Sales table did not survive 3NF

In [87]:
Customers.isnull().sum()

Customer ID      0
Customer Name    0
Segment          0
dtype: int64

In [88]:
Customers.head(3)

Unnamed: 0,Customer ID,Customer Name,Segment
0,CG-12520,Claire Gute,Consumer
1,CG-12520,Claire Gute,Consumer
2,DV-13045,Darrin Van Huff,Corporate


In [89]:
Customers["Customer ID"].unique().size

793

In [90]:
Customers["Customer Name"].unique().size

793

In [91]:
Customers = Customers.drop_duplicates(ignore_index=True)

In [92]:
Customers

Unnamed: 0,Customer ID,Customer Name,Segment
0,CG-12520,Claire Gute,Consumer
1,DV-13045,Darrin Van Huff,Corporate
2,SO-20335,Sean O'Donnell,Consumer
3,BH-11710,Brosina Hoffman,Consumer
4,AA-10480,Andrew Allen,Consumer
...,...,...,...
788,CJ-11875,Carl Jackson,Corporate
789,RS-19870,Roy Skaria,Home Office
790,SC-20845,Sung Chung,Consumer
791,RE-19405,Ricardo Emerson,Consumer


In [93]:
Address.isnull().sum()

Customer ID    0
Region         0
State          0
City           0
Postal Code    0
dtype: int64

In [94]:
Address.head(3)

Unnamed: 0,Customer ID,Region,State,City,Postal Code
0,CG-12520,South,Kentucky,Henderson,42420
1,CG-12520,South,Kentucky,Henderson,42420
2,DV-13045,West,California,Los Angeles,90036


In [95]:
#The goal here is to see if customers have multiple addresses.
Address.groupby(['Customer ID', 'Postal Code']).ngroups

4910

In [96]:
Address['Customer ID'].unique().size

793

In [97]:
Address=Address.drop_duplicates()

In [98]:
duplicate = Address[Address.duplicated('Customer ID')]

In [99]:
duplicate.sort_values(by='Customer ID', ascending=False)

Unnamed: 0,Customer ID,Region,State,City,Postal Code
3814,ZD-21925,South,Florida,Jacksonville,32216
3040,ZD-21925,South,Tennessee,Chattanooga,37421
5897,ZD-21925,Central,Indiana,Richmond,47374
8341,ZD-21925,West,California,Los Angeles,90036
8923,ZC-21910,South,North Carolina,Hickory,28601
...,...,...,...,...,...
807,AA-10375,Central,Nebraska,Omaha,68104
1299,AA-10315,West,California,San Francisco,94109
2229,AA-10315,West,California,San Francisco,94122
7468,AA-10315,East,New York,New York City,10011


In [100]:
#We need to make a new column to use as a primary key for Addresses. That, or use a composite primary.. 
# Good discussion here https://stackoverflow.com/questions/14588304/composite-primary-key-vs-additional-id-column
#So we will use these as a reference to increase the number and we will put UNIQUE constraints on the composite, but use the ID as primary key.
Address['AddressID'] = range(1, len(Address) + 1)

In [101]:
Address['AddressID'] = "A" + Address['AddressID'].astype(str).str.zfill(6)

In [102]:
Address

Unnamed: 0,Customer ID,Region,State,City,Postal Code,AddressID
0,CG-12520,South,Kentucky,Henderson,42420,A000001
2,DV-13045,West,California,Los Angeles,90036,A000002
3,SO-20335,South,Florida,Fort Lauderdale,33311,A000003
5,BH-11710,West,California,Los Angeles,90032,A000004
12,AA-10480,South,North Carolina,Concord,28027,A000005
...,...,...,...,...,...,...
9986,ML-17410,West,California,Los Angeles,90008,A004906
9987,RA-19885,South,Georgia,Athens,30605,A004907
9989,TB-21400,South,Florida,Miami,33180,A004908
9990,DB-13060,West,California,Costa Mesa,92627,A004909


In [103]:
Address.reset_index(drop=True)

Unnamed: 0,Customer ID,Region,State,City,Postal Code,AddressID
0,CG-12520,South,Kentucky,Henderson,42420,A000001
1,DV-13045,West,California,Los Angeles,90036,A000002
2,SO-20335,South,Florida,Fort Lauderdale,33311,A000003
3,BH-11710,West,California,Los Angeles,90032,A000004
4,AA-10480,South,North Carolina,Concord,28027,A000005
...,...,...,...,...,...,...
4905,ML-17410,West,California,Los Angeles,90008,A004906
4906,RA-19885,South,Georgia,Athens,30605,A004907
4907,TB-21400,South,Florida,Miami,33180,A004908
4908,DB-13060,West,California,Costa Mesa,92627,A004909


In [104]:
#Here I want to see if the prices remain consistent throughout the dataset. 
#Futureprooding would definitely require us to be able to change catalogue or supplier price going forward.
price_test = dfs[["Product ID","Catalogue price","Supplier price"]]

In [105]:
price_test.sort_values('Product ID')

Unnamed: 0,Product ID,Catalogue price,Supplier price
3512,FUR-BO-10000112,13097,10478
5494,FUR-BO-10000330,12098,10404
6401,FUR-BO-10000330,12098,10404
1760,FUR-BO-10000330,12098,10404
1594,FUR-BO-10000362,17098,13165
...,...,...,...
6635,TEC-PH-10004977,19598,13915
5064,TEC-PH-10004977,19599,13915
9727,TEC-PH-10004977,19599,13915
7477,TEC-PH-10004977,19599,13915


In [106]:
price_test = price_test.drop_duplicates(ignore_index=True)

In [107]:
#Many of these duplicates were not being caught due to rounding errors at the calculation stage despite precautions. 
price_test.sort_values('Product ID', ignore_index=True)

Unnamed: 0,Product ID,Catalogue price,Supplier price
0,FUR-BO-10000112,13097,10478
1,FUR-BO-10000330,12098,10404
2,FUR-BO-10000362,17098,13165
3,FUR-BO-10000468,4857,4080
4,FUR-BO-10000468,4858,4080
...,...,...,...
2734,TEC-PH-10004922,6699,5024
2735,TEC-PH-10004924,739,539
2736,TEC-PH-10004959,10049,7536
2737,TEC-PH-10004977,19599,13915


In [108]:
Category

Unnamed: 0,Category,Sub-Category
0,Furniture,Bookcases
1,Furniture,Chairs
2,Office Supplies,Labels
3,Furniture,Tables
4,Office Supplies,Storage
...,...,...
9989,Furniture,Furnishings
9990,Furniture,Furnishings
9991,Technology,Phones
9992,Office Supplies,Paper


In [109]:
#Ok, same approach as with addresses.
Category.groupby(['Category', 'Sub-Category']).ngroups

17

In [110]:
Category=Category.drop_duplicates(ignore_index=True)

In [111]:
Category = Category.sort_values(by='Category')

In [112]:
Category['CategoryID'] = range(1, len(Category) + 1)

In [113]:
#Allowing up to 9999 categories with current format. Could also just use a different letter later but this seems neater. 
#An E-Commerce website could expand its number of categories pretty easily. 
Category['CategoryID'] = "C" + Category['CategoryID'].astype(str).str.zfill(4)

In [114]:
Category

Unnamed: 0,Category,Sub-Category,CategoryID
0,Furniture,Bookcases,C0001
1,Furniture,Chairs,C0002
3,Furniture,Tables,C0003
5,Furniture,Furnishings,C0004
14,Office Supplies,Supplies,C0005
13,Office Supplies,Fasteners,C0006
12,Office Supplies,Envelopes,C0007
10,Office Supplies,Paper,C0008
9,Office Supplies,Appliances,C0009
8,Office Supplies,Binders,C0010


In [115]:
Category.columns

Index(['Category', 'Sub-Category', 'CategoryID'], dtype='object')

In [116]:
dfs2 = Category.merge(dfs, left_on=['Category', 'Sub-Category'], right_on=['Category', 'Sub-Category'], how='inner').sort_values(by="Row ID", ignore_index=True)

In [117]:
dfs2

Unnamed: 0,Category,Sub-Category,CategoryID,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,...,Postal Code,Region,Product ID,Product Name,Sales,Quantity,Discount,Profit,Catalogue price,Supplier price
0,Furniture,Bookcases,C0001,1,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,...,42420,South,FUR-BO-10001798,Bush Somerset Collection Bookcase,26195,2,0.00,4191,13097,11002
1,Furniture,Chairs,C0002,2,CA-2016-152156,2016-11-08,2016-11-11,Second Class,CG-12520,Claire Gute,...,42420,South,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",73194,3,0.00,21958,24398,17078
2,Office Supplies,Labels,C0013,3,CA-2016-138688,2016-06-12,2016-06-16,Second Class,DV-13045,Darrin Van Huff,...,90036,West,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,1462,2,0.00,687,731,387
3,Furniture,Tables,C0003,4,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,...,33311,South,FUR-TA-10000577,Bretford CR4500 Series Slim Rectangular Table,95757,5,0.45,-38303,34821,26812
4,Office Supplies,Storage,C0012,5,US-2015-108966,2015-10-11,2015-10-18,Standard Class,SO-20335,Sean O'Donnell,...,33311,South,OFF-ST-10000760,Eldon Fold 'N Roll Cart System,2236,2,0.20,251,1397,992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,Furniture,Furnishings,C0004,9990,CA-2014-110422,2014-01-21,2014-01-23,Second Class,TB-21400,Tom Boeckenhauer,...,33180,South,FUR-FU-10001889,Ultra Door Pull Handle,2524,3,0.20,410,1052,704
9990,Furniture,Furnishings,C0004,9991,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,...,92627,West,FUR-FU-10000747,Tenex B1-RE Series Chair Mats for Low Pile Car...,9196,2,0.00,1563,4598,3816
9991,Technology,Phones,C0014,9992,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,...,92627,West,TEC-PH-10003645,Aastra 57i VoIP phone,25857,2,0.20,1939,16161,11959
9992,Office Supplies,Paper,C0008,9993,CA-2017-121258,2017-02-26,2017-03-03,Standard Class,DB-13060,Dave Brooks,...,92627,West,OFF-PA-10004041,"It's Hot Message Books with Stickers, 2 3/4"" x 5""",2960,4,0.00,1332,740,407


In [118]:
Products = dfs2[["Product ID", "Product Name", "Catalogue price", "Supplier price", "CategoryID"]]

In [119]:
Products.head(3)

Unnamed: 0,Product ID,Product Name,Catalogue price,Supplier price,CategoryID
0,FUR-BO-10001798,Bush Somerset Collection Bookcase,13097,11002,C0001
1,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",24398,17078,C0002
2,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,731,387,C0013


In [120]:
Products = Products.drop_duplicates(subset="Product ID")

In [121]:
Products = Products.sort_values('Product ID', ignore_index=True)

In [122]:
Products.tail(3)

Unnamed: 0,Product ID,Product Name,Catalogue price,Supplier price,CategoryID
1859,TEC-PH-10004924,"SKILCRAFT Telephone Shoulder Rest, 2"" x 6.5"" x...",739,539,C0014
1860,TEC-PH-10004959,Classic Ivory Antique Telephone ZL1810,10049,7536,C0014
1861,TEC-PH-10004977,GE 30524EE4,19598,13915,C0014


In [123]:
Address.columns

Index(['Customer ID', 'Region', 'State', 'City', 'Postal Code', 'AddressID'], dtype='object')

In [124]:
dfs3 = Address.merge(dfs2, left_on=['Customer ID', 'Region', 'State', 'City', 'Postal Code'], 
                     right_on=['Customer ID', 'Region', 'State', 'City', 'Postal Code'], 
                     how='inner').sort_values(by="Row ID", ignore_index=True)

In [125]:
dfs3.columns

Index(['Customer ID', 'Region', 'State', 'City', 'Postal Code', 'AddressID',
       'Category', 'Sub-Category', 'CategoryID', 'Row ID', 'Order ID',
       'Order Date', 'Ship Date', 'Ship Mode', 'Customer Name', 'Segment',
       'Country', 'Product ID', 'Product Name', 'Sales', 'Quantity',
       'Discount', 'Profit', 'Catalogue price', 'Supplier price'],
      dtype='object')

In [126]:
Orders = dfs3[["Order ID", "Customer ID", "Order Date", "AddressID"]]

In [127]:
Orders

Unnamed: 0,Order ID,Customer ID,Order Date,AddressID
0,CA-2016-152156,CG-12520,2016-11-08,A000001
1,CA-2016-152156,CG-12520,2016-11-08,A000001
2,CA-2016-138688,DV-13045,2016-06-12,A000002
3,US-2015-108966,SO-20335,2015-10-11,A000003
4,US-2015-108966,SO-20335,2015-10-11,A000003
...,...,...,...,...
9989,CA-2014-110422,TB-21400,2014-01-21,A004908
9990,CA-2017-121258,DB-13060,2017-02-26,A004909
9991,CA-2017-121258,DB-13060,2017-02-26,A004909
9992,CA-2017-121258,DB-13060,2017-02-26,A004909


In [128]:
Orders = Orders.drop_duplicates(subset="Order ID", ignore_index=True)

In [129]:
Orders = Orders.sort_values('Order Date', ascending=True, ignore_index=True)

In [130]:
Orders

Unnamed: 0,Order ID,Customer ID,Order Date,AddressID
0,CA-2014-103800,DP-13000,2014-01-03,A003887
1,CA-2014-112326,PO-19195,2014-01-04,A000343
2,CA-2014-141817,MB-18085,2014-01-05,A000854
3,CA-2014-106054,JO-15145,2014-01-06,A003509
4,CA-2014-130813,LS-17230,2014-01-06,A002614
...,...,...,...,...
5004,CA-2017-130631,BS-11755,2017-12-29,A002686
5005,CA-2017-143259,PO-18865,2017-12-30,A000432
5006,CA-2017-115427,EB-13975,2017-12-30,A000610
5007,CA-2017-126221,CC-12430,2017-12-30,A000300


In [131]:
OrderDetails = OrderDetails.drop_duplicates(subset=["Order ID", "Product ID"], ignore_index=True)

In [132]:
OrderDetails

Unnamed: 0,Order ID,Product ID,Quantity,Sales,Discount,Profit
0,CA-2016-152156,FUR-BO-10001798,2,26195,0.00,4191
1,CA-2016-152156,FUR-CH-10000454,3,73194,0.00,21958
2,CA-2016-138688,OFF-LA-10000240,2,1462,0.00,687
3,US-2015-108966,FUR-TA-10000577,5,95757,0.45,-38303
4,US-2015-108966,OFF-ST-10000760,2,2236,0.20,251
...,...,...,...,...,...,...
9981,CA-2014-110422,FUR-FU-10001889,3,2524,0.20,410
9982,CA-2017-121258,FUR-FU-10000747,2,9196,0.00,1563
9983,CA-2017-121258,TEC-PH-10003645,2,25857,0.20,1939
9984,CA-2017-121258,OFF-PA-10004041,4,2960,0.00,1332


In [133]:
dfs3.head(3)

Unnamed: 0,Customer ID,Region,State,City,Postal Code,AddressID,Category,Sub-Category,CategoryID,Row ID,...,Segment,Country,Product ID,Product Name,Sales,Quantity,Discount,Profit,Catalogue price,Supplier price
0,CG-12520,South,Kentucky,Henderson,42420,A000001,Furniture,Bookcases,C0001,1,...,Consumer,United States,FUR-BO-10001798,Bush Somerset Collection Bookcase,26195,2,0.0,4191,13097,11002
1,CG-12520,South,Kentucky,Henderson,42420,A000001,Furniture,Chairs,C0002,2,...,Consumer,United States,FUR-CH-10000454,"Hon Deluxe Fabric Upholstered Stacking Chairs,...",73194,3,0.0,21958,24398,17078
2,DV-13045,West,California,Los Angeles,90036,A000002,Office Supplies,Labels,C0013,3,...,Corporate,United States,OFF-LA-10000240,Self-Adhesive Address Labels for Typewriters b...,1462,2,0.0,687,731,387


In [134]:
#Now I want to output CSVs just for flexibility and to be safe.

list_of_tables = ['Customers', 'Address', 'Products', 'Category', 'Orders', 'Order_details']
dataframes = [Customers, Address, Products, Category, Orders, OrderDetails]

In [135]:
for i in range (len(list_of_tables)):
    x = dataframes[i]
    y = list_of_tables[i]
    x.to_csv(f"{path}/{y}.csv", index=False)

In [136]:
dfs3.to_csv(f'{path}/db.csv', index=False)

In [137]:
#Lazy way to see longest value in each column. Useful later when creating db table constraints.

for col in dfs3:
    print(f'Longest val in {col} is {dfs3[col].astype(str).map(len).max()}, min is {dfs3[col].astype(str).map(len).min()}')

Longest val in Customer ID is 8, min is 8
Longest val in Region is 7, min is 4
Longest val in State is 20, min is 4
Longest val in City is 17, min is 4
Longest val in Postal Code is 5, min is 4
Longest val in AddressID is 7, min is 7
Longest val in Category is 15, min is 9
Longest val in Sub-Category is 11, min is 3
Longest val in CategoryID is 5, min is 5
Longest val in Row ID is 4, min is 1
Longest val in Order ID is 14, min is 14
Longest val in Order Date is 10, min is 10


Longest val in Ship Date is 10, min is 10
Longest val in Ship Mode is 14, min is 8
Longest val in Customer Name is 22, min is 7
Longest val in Segment is 11, min is 8
Longest val in Country is 13, min is 13
Longest val in Product ID is 15, min is 15
Longest val in Product Name is 127, min is 5
Longest val in Sales is 7, min is 2
Longest val in Quantity is 2, min is 1
Longest val in Discount is 4, min is 3
Longest val in Profit is 7, min is 1
Longest val in Catalogue price is 6, min is 2
Longest val in Supplier price is 6, min is 2


Over to dbcreation.ipynb for the database creation! 