In [1]:
#import libraries
import pandas as pd
import sqlite3
import openpyxl

In [2]:
# Import each sheet, first row as column headers.
# Passing a list to sheet_name makes pandas open the workbook once and return
# a dict {sheet name: DataFrame}, instead of re-reading the file per sheet.
sheets = pd.read_excel(
    'sales original.xlsx',
    sheet_name=['products', 'stock', 'assessment', 'invoices', 'customers'],
    header=0,
)
products = sheets['products']
stock = sheets['stock']
assessment = sheets['assessment']
invoices = sheets['invoices']
customers = sheets['customers']

### Check the tables

In [3]:
# Count the distinct values in each column of the stock frame
stock.nunique()

StockCode    3942
ASIN         4134
dtype: int64

Okay, this is a problem. There are more distinct ASINs than StockCodes, so apparently different ASINs exist with the same StockCode.<br />
Question: are these just typos, or do these products share the same physical stock space? Even if they share the same space, they should not have the same StockCode, because that would make it difficult to rearrange just one of them or to update its number in stock.

In [4]:
# For every StockCode, list its distinct ASINs (all values compared as text).
stock_as_text = stock.astype(str)
asin_by_code = stock_as_text.groupby('StockCode')['ASIN']
out = asin_by_code.agg(lambda x: ', '.join(x.unique())).to_frame()

# Keep only StockCodes with more than one distinct ASIN. The values are counted
# directly rather than testing the joined string for a length > 10, which
# misses pairs of short values (e.g. 'Null, 1234' is exactly 10 characters)
# and wrongly flags a single value that is longer than 10 characters.
out = out.loc[asin_by_code.nunique() > 1]
out.count()

ASIN    657
dtype: int64

657 StockCodes share two or more ASIN. Will this impact my choice of primary key? Don't think so. The combination of ASIN+StockCode would still be unique to a product. Only difficulty I see is if two products with a Null ASIN share the same StockCode but have a different title.<br />
Let's check that.

In [5]:
# Work on a scratch copy so the products frame itself is not modified here
products1 = products.copy()

# Place stock's StockCode column at position 1 of the scratch copy
products1.insert(loc=1, column='StockCode', value=stock['StockCode'])

# Keep only the rows whose ASIN is the literal string 'Null'
has_null_asin = products1['ASIN'].eq('Null')
null_products = products1.loc[has_null_asin]

In [6]:
# Among the Null-ASIN products, list the rows whose title is the string 'Null'
null_products[null_products['title'] == 'Null']

Unnamed: 0,ASIN,StockCode,title,product_type


In [7]:
# Count the distinct values in each column of the Null-ASIN products
null_products.nunique()

ASIN             1
StockCode       70
title           64
product_type     5
dtype: int64

All products in the list with a Null ASIN have a title. That is good. There are also no products with a missing or Null title.<br />
But there are 70 different StockCodes and only 64 different titles. So some StockCodes are associated with the same title. That is not optimal but better than the other way around. This way the combination of ASIN+StockCode as a primary key will still work.

Assessment will be merged into the products table. Because the records were appended sequentially, this is not problematic. If assessment had contained distinct review records instead of aggregated values for each product, the assessment table would have been kept.

The customers table contains a primary key already, but let's check for null values and if there are CustomerIDs that point to different countries.

In [8]:
# List the customer rows whose CustomerID is the string 'Null'
customers[customers['CustomerID'] == 'Null']

Unnamed: 0,CustomerID,Country
11,Null,Germany
12,Null,Germany
13,Null,Germany
14,Null,Germany
15,Null,Germany
...,...,...
554375,Null,Germany
554376,Null,Germany
554377,Null,Germany
554378,Null,Germany


There are 139315 records in which the CustomerID is not known. This is not good. This means that invoices have a Null foreign key in place of the CustomerID and the invoice cannot be assigned to a customer.<br />
I could resolve this by merging the tables invoices and customers, grouping by InvoiceNo and assigning a new value to CustomerID where Null appears in an invoice, at the risk of assigning a new ID to an existing customer.

Are there any invoices that do not have an ID?

In [10]:
# List the invoice rows whose InvoiceNo is the string 'Null'
invoices[invoices['InvoiceNo'] == 'Null']

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,CustomerID


Thankfully, there are no invoices that lack an ID or are empty. Therefore, every Null CustomerID can be assigned a new ID derived from its invoice.<br />
But there are of course null ASINs in the invoices which cannot be assigned to a product.

### Modifications

In [9]:
# products: put StockCode directly after ASIN so that the pair can serve as a
# combined primary key
products.insert(loc=1, column='StockCode', value=stock['StockCode'])

# products: add the price of a single item at column position 4.
# NOTE(review): the values are matched to products by row index, so this
# presumably relies on invoices and products listing the same records in the
# same order - confirm.
products.insert(loc=4, column='price', value=invoices['price'])

In [10]:
# products: fold the assessment columns in. The assessment sheet only holds
# aggregated values (no distinct reviews), so a separate table is not worth
# keeping.
for assessment_col in ('rating', 'review_count'):
    products[assessment_col] = assessment[assessment_col]

In [11]:
# Show the column labels of products after the insertions
products.columns

Index(['ASIN', 'StockCode', 'title', 'product_type', 'price', 'rating',
       'review_count'],
      dtype='object')

In [13]:
# invoice_product is the linking table for the many-to-many relationship
# between products and invoices; it starts out as a full copy of invoices.
invoice_product = invoices.copy()

# Put StockCode at column position 2 (after ASIN) to form the combined foreign key
invoice_product.insert(loc=2, column='StockCode', value=stock['StockCode'])

# Remove the invoice-level columns that are not needed in the linking table
invoice_product = invoice_product.drop(
    columns=['invoice_date', 'invoice_time', 'CustomerID']
)

In [14]:
# invoices: remove the line-item columns that invoice_product now carries
invoices = invoices.drop(columns=['ASIN', 'Quantity', 'price', 'total_sale'])


In [15]:
# Copy both customers columns onto invoices (matched by row index)
for customer_col in ('CustomerID', 'Country'):
    invoices[customer_col] = customers[customer_col]

Begin: Testing the customerID assignment

In [16]:
# Scratch copy of invoices for trying out the CustomerID assignment
invoices1 = invoices.copy()

In [17]:
# Keep the rows of the scratch copy whose CustomerID is the string 'Null',
# for comparison after the temporary IDs have been assigned
missing_customer = invoices1['CustomerID'] == 'Null'
null_invoice = invoices1[missing_customer]

In [18]:
# Assign '9' + the last four characters of the invoice number wherever
# CustomerID == 'Null'. That way the invoices with Null customers are kept and
# each Null customer gets a "temporary" ID.
# Done as one masked, vectorized assignment instead of a row-by-row iterrows()
# loop; the resulting values are the same.
# NOTE(review): only the last four characters of InvoiceNo are used, so two
# different invoices whose numbers end in the same four characters receive the
# same temporary ID - confirm this is acceptable.
null_customer = invoices1['CustomerID'] == 'Null'
invoices1.loc[null_customer, 'CustomerID'] = (
    '9' + invoices1.loc[null_customer, 'InvoiceNo'].astype(str).str[-4:]
)


In [19]:
# Rows whose CustomerID starts with '9' (non-string IDs count as no match),
# to be compared against null_invoice
starts_with_9 = invoices1['CustomerID'].str.startswith('9', na=False)
new_df = invoices1.loc[starts_with_9].copy()

# Number of rows per (InvoiceNo, CustomerID) pair
new_df.groupby(['InvoiceNo', 'CustomerID']).size().reset_index(name='count')

Unnamed: 0,InvoiceNo,CustomerID,count
0,78536544,96544,551
1,78536555,96555,2
2,78536558,96558,1
3,78536565,96565,2
4,78536592,96592,618
...,...,...,...
2045,78581497,91497,60
2046,78581498,91498,233
2047,78A563185,93185,1
2048,78A563186,93186,1


In [20]:
# Number of rows per (InvoiceNo, CustomerID) pair among the Null-customer invoices
null_invoice.groupby(['InvoiceNo', 'CustomerID']).size().reset_index(name='count')

Unnamed: 0,InvoiceNo,CustomerID,count
0,78536544,Null,551
1,78536555,Null,2
2,78536558,Null,1
3,78536565,Null,2
4,78536592,Null,618
...,...,...,...
2045,78581497,Null,60
2046,78581498,Null,233
2047,78A563185,Null,1
2048,78A563186,Null,1


End: Testing the customerID assignment

In [21]:
# Assign temporary IDs with an initial 9 to the Null CustomerIDs in invoices:
# '9' + the last four characters of the invoice number.
# Why the trouble? An invoice with a Null CustomerID can then be tied to its
# own Null customer instead of lumping all Nulls together (n.b. lumping them
# would distort the country analysis). Whether that customer already had an ID,
# or has another invoice with a Null, is irrelevant. This way the Null records
# in invoices and customers do not have to be discarded.
# Done as one masked, vectorized assignment instead of a row-by-row iterrows()
# loop; the resulting values are the same.
# NOTE(review): only the last four characters of InvoiceNo are used, so two
# different invoices whose numbers end in the same four characters receive the
# same temporary ID - confirm this is acceptable.
null_customer = invoices['CustomerID'] == 'Null'
invoices.loc[null_customer, 'CustomerID'] = (
    '9' + invoices.loc[null_customer, 'InvoiceNo'].astype(str).str[-4:]
)

In [22]:
# Copy the customer columns (now holding the temporary IDs) from invoices back
# onto the customers table, matched by row index
for customer_col in ('CustomerID', 'Country'):
    customers[customer_col] = invoices[customer_col]

# invoices no longer needs the customer's country
invoices = invoices.drop(columns=['Country'])

In [23]:
# in stock: drop ASIN, delete duplicates
# in assessment: drop entire dataframe
# in products: delete duplicates
# in customers: delete duplicates
# in invoices: delete duplicates
# in invoice_product: delete duplicates, delete price

In [24]:
# stock: remove the ASIN column, then collapse duplicate rows
stock = stock.drop(columns=['ASIN']).drop_duplicates()

# products, customers, invoices: collapse exact duplicate rows
products = products.drop_duplicates()
customers = customers.drop_duplicates()
invoices = invoices.drop_duplicates()

# invoice_product: de-duplicate first (price still takes part in the
# comparison), then remove the price column
invoice_product = invoice_product.drop_duplicates().drop(columns=['price'])

Write the dataframes stock, products, customers, invoices, invoice_product to a database.<br />
First try SQLite and then try MySQL.