In [1]:
#import libraries
import pandas as pd
import numpy as np
import sqlite3
import openpyxl

In [2]:
# Import each sheet, first row as column headers
products = pd.read_excel('data/sales original.xlsx', sheet_name='products', header=0)
stock = pd.read_excel('data/sales original.xlsx', sheet_name='stock', header=0)
assessment = pd.read_excel('data/sales original.xlsx', sheet_name='assessment', header=0)
invoices = pd.read_excel('data/sales original.xlsx', sheet_name='invoices', header=0)
customers = pd.read_excel('data/sales original.xlsx', sheet_name='customers', header=0)

<hr>

### Check the dataframes
We have already done a lot of checking up on empty and null values in the *read_in.ipynb*.

**Stock dataframe**

Summary:
* StockCode: 5 digits with optional one letter at the end
* StockCode can be associated with one or more ASINs
* There are null values in the ASINs but none in the StockCode

In [3]:
stock.head(12)

Unnamed: 0,StockCode,ASIN
0,21703,B07GWKDLGT
1,40001,B01MTLH408
2,85034A,B0064FS7HI
3,72798C,B013JHU5YG
4,20726,B06XWN9Q99
5,35271S,B07F9QN5Q9
6,20755,B006W1J3OK
7,22694,B00EO4A7L0
8,21733,B0000BZL0G
9,22366,B004JMZPJQ


In [4]:
stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   StockCode  554417 non-null  object
 1   ASIN       554417 non-null  object
dtypes: object(2)
memory usage: 8.5+ MB


In [5]:
stock.nunique()

StockCode    3942
ASIN         4134
dtype: int64

In [6]:
# Any empty StockCodes?
stock.loc[stock['StockCode'] == 'Null']

Unnamed: 0,StockCode,ASIN


In [7]:
# Any empty ASINs? If yes, how many and associated with which StockCodes?
stock.loc[stock['ASIN'] == 'Null'].value_counts()

StockCode  ASIN
23202      Null    889
22084      Null    446
22983      Null    444
21933      Null    421
22908      Null    387
                  ... 
84531B     Null      1
84531A     Null      1
72732      Null      1
23084      Null      1
90187A     Null      1
Length: 70, dtype: int64

In [8]:
out = stock.astype(str).groupby('StockCode').agg(lambda x: ', '.join(x.unique()))
out = out.loc[out['ASIN'].str.len() > 10]
out#.count()

Unnamed: 0_level_0,ASIN
StockCode,Unnamed: 1_level_1
10080,"B07D9D82R7, B000L47AHG"
10133,"B074VB8VT5, B07CM7N7TG"
15058A,"B0189228CS, B07D263CHY"
15058C,"B07NQMD216, B07D263CHY"
16008,"B016NDCXKM, B000L47AHG"
...,...
90195A,"B07JCKZ3RC, B000L47AHG"
90210D,"B07L9B57G5, B000L47AHG"
DCGS0003,"B07DNV7GJL, B07PQ8S46W"
DCGS0069,"B0754DLLMK, B07PQ8S46W"


We have already established that there are ASINs with the same StockCode.

There are 70 distinct ASINs which are Null. Hurrah for sequential adding of records in the Excel-file, by concatenating the dataframes products and stock even the Null ASINs have a StockCode.

We will drop the ASIN feature and concatenate with the products data frame.

In [9]:
# Extract StockCode and insert into products after ASIN
extracted_col = stock['StockCode']
products.insert(1, 'StockCode', extracted_col)

**Assessment dataframe**

Summary:
* The ranking and review_count is aggregated for each row (resp. ASIN).
* There are no individual reviews.
* Rating will be float, review_count will be integer

In [10]:
assessment.head(12)

Unnamed: 0,rating,review_count,ASIN
0,4.6,1399,B07GWKDLGT
1,4.6,289,B01MTLH408
2,4.4,333,B0064FS7HI
3,4.3,86,B013JHU5YG
4,4.7,15717,B06XWN9Q99
5,4.3,83,B07F9QN5Q9
6,4.2,253,B006W1J3OK
7,4.1,2281,B00EO4A7L0
8,4.5,6615,B0000BZL0G
9,4.3,925,B004JMZPJQ


In [11]:
assessment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   rating        554417 non-null  float64
 1   review_count  554417 non-null  int64  
 2   ASIN          554417 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 12.7+ MB


In [12]:
assessment.nunique()

rating            26
review_count    1375
ASIN            4134
dtype: int64

In [13]:
assessment.loc[assessment['ASIN'] == 'Null']

Unnamed: 0,rating,review_count,ASIN
527,4.0,470,Null
528,4.3,191,Null
529,4.5,375,Null
530,4.2,848,Null
531,4.3,193,Null
...,...,...,...
553778,4.4,1683,Null
553779,4.4,1683,Null
553780,4.4,1683,Null
553781,4.4,1244,Null


Same as with the stock data frame.<br />
Assessment will be merged into the products data frame, because records were appended sequentially this is not problematic. If assessment contained distinct review records instead of aggregated values for each product a different solution would be necessary.

In [14]:
# In products: insert rating and reviiew_count
products['rating'] = pd.Series(assessment['rating'])
products['review_count'] = pd.Series(assessment['review_count'])

**Customers data frame**

Summary:
* CustomerID is an integer and may contain the string value null
* Country is a string; there are 37 values, this would be a good candidate for a category

In [15]:
customers.head(12)


Unnamed: 0,CustomerID,Country
0,18011,Germany
1,18011,Germany
2,18011,Germany
3,18011,Germany
4,18011,Germany
5,18011,Germany
6,18011,Germany
7,13576,Germany
8,15235,Germany
9,16835,Germany


In [16]:
customers.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   CustomerID  554417 non-null  object
 1   Country     554417 non-null  object
dtypes: object(2)
memory usage: 8.5+ MB


In [17]:
customers.nunique()


CustomerID    4340
Country         37
dtype: int64

In [18]:
customers.loc[customers['CustomerID'] == 'Null']

Unnamed: 0,CustomerID,Country
11,Null,Germany
12,Null,Germany
13,Null,Germany
14,Null,Germany
15,Null,Germany
...,...,...
554375,Null,Germany
554376,Null,Germany
554377,Null,Germany
554378,Null,Germany


Note that among the countries there is "European Community" which is not a country and "Unknown".

In [19]:
customers.Country.unique()

array(['Germany', 'Ireland', 'Norway', 'Australia', 'France',
       'United Kingdom', 'Netherlands', 'Italy', 'Portugal', 'Poland',
       'Switzerland', 'Belgium', 'Spain', 'Japan', 'Lithuania', 'Iceland',
       'Denmark', 'Cyprus', 'Sweden', 'Finland', 'Austria', 'Bahrain',
       'Israel', 'Hong Kong', 'Greece', 'Singapore', 'Lebanon',
       'United Arab Emirates', 'Saudi Arabia', 'Czech Republic', 'Canada',
       'Unknown', 'Brazil', 'United States', 'European Community',
       'Malta', 'South Africa'], dtype=object)

We have already established that there are 139315 records where the CustomerIDs contain the string value null.

We will merge the customers data frame with the invoices data frame and generate a temporary ID for each of these customers as in the *read_in.ipynb* notebook.

**Invoices data frame**

Summary:
* InvoiceNo is a string(!), there are three InvoiceNo that contain a letter
* ASIN and CustomerID are foreign keys, if you want to call them that
* Quantity is integer, price and total_sale is float, invoice_date a datetime, and invoice_time an integer

In [20]:
invoices.head(12)

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,CustomerID
0,78536597,B07GWKDLGT,4,496.95,1987.8,2018-12-01,17,18011
1,78536597,B01MTLH408,4,39.99,159.96,2018-12-01,17,18011
2,78536597,B0064FS7HI,1,12.99,12.99,2018-12-01,17,18011
3,78536597,B013JHU5YG,1,199.99,199.99,2018-12-01,17,18011
4,78536597,B06XWN9Q99,1,7.49,7.49,2018-12-01,17,18011
5,78536597,B07F9QN5Q9,14,23.99,335.86,2018-12-01,17,18011
6,78536597,B006W1J3OK,6,11.95,71.7,2018-12-01,17,18011
7,78536595,B00EO4A7L0,5,79.0,395.0,2018-12-01,17,13576
8,78536594,B0000BZL0G,6,21.9,131.4,2018-12-01,17,15235
9,78536593,B004JMZPJQ,2,49.99,99.98,2018-12-01,17,16835


In [21]:
invoices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   InvoiceNo     554417 non-null  object        
 1   ASIN          554417 non-null  object        
 2   Quantity      554417 non-null  int64         
 3   price         554417 non-null  float64       
 4   total_sale    554417 non-null  float64       
 5   invoice_date  554417 non-null  datetime64[ns]
 6   invoice_time  554417 non-null  int64         
 7   CustomerID    554417 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(2), object(3)
memory usage: 33.8+ MB


In [22]:
invoices.nunique()

InvoiceNo       20534
ASIN             4134
Quantity          454
price            1363
total_sale      16947
invoice_date      305
invoice_time       15
CustomerID       4340
dtype: int64

In [23]:
invoices.loc[invoices['InvoiceNo'] == 'Null']

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,CustomerID


In [24]:
invoices.loc[invoices['ASIN'] == 'Null']

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,CustomerID
527,78536592,Null,1,24.99,24.99,2018-12-01,17,Null
528,78536592,Null,1,32.53,32.53,2018-12-01,17,Null
529,78536592,Null,1,38.99,38.99,2018-12-01,17,Null
530,78536587,Null,6,33.99,203.94,2018-12-01,16,14142
531,78536557,Null,2,29.99,59.98,2018-12-01,14,17841
...,...,...,...,...,...,...,...,...
553778,78581498,Null,3,12.95,38.85,2019-12-09,10,Null
553779,78581498,Null,1,9.97,9.97,2019-12-09,10,Null
553780,78581492,Null,2,9.97,19.94,2019-12-09,10,Null
553781,78581492,Null,2,169.99,339.98,2019-12-09,10,Null


In [25]:
invoices.loc[invoices['CustomerID'] == 'Null']

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,CustomerID
11,78536592,B007I5B3EW,2,30.70,61.40,2018-12-01,17,Null
12,78536592,B0149KE4FI,5,39.95,199.75,2018-12-01,17,Null
13,78536592,B07817MWDB,1,69.95,69.95,2018-12-01,17,Null
14,78536592,B00NP7G1U8,1,65.00,65.00,2018-12-01,17,Null
15,78536592,B07Q6BBS1L,1,28.99,28.99,2018-12-01,17,Null
...,...,...,...,...,...,...,...,...
554375,78581492,B07QMRRHPY,1,119.98,119.98,2019-12-09,10,Null
554376,78581492,B07XG1HT7G,1,49.99,49.99,2019-12-09,10,Null
554377,78581492,B00NOT2URU,1,10.99,10.99,2019-12-09,10,Null
554378,78581492,B07YJYGDQ2,1,7.99,7.99,2019-12-09,10,Null


Thankfully there are no invoices that do not have an ID or are empty. Therefore CustomerIDs where there is a null can be assigned a new ID based on an invoice.

In [26]:
# Assign temporary IDs to Null CustomerIDs in invoices with an initial 9

for index, row in invoices.iterrows():
    if row['CustomerID'] == 'Null':
        invoices.at[index, 'CustomerID'] = '9' + str(row['InvoiceNo'])[-4:]

In [27]:
# Overwrite the CustomerID in customers dataframe with the CustomerIDs incl. the
# new IDs from invoices.
customers['CustomerID'] = pd.Series(invoices['CustomerID'])

**Customers data frame continued**

Because we gave customers with a null CustomerID a new ID starting with 9 in invoices and wrote that into the customers dataframe, there will be no null CustomerIDs in customers dataframe.

In [28]:
# Create a temporary copy of customers and drop the duplicates, 
cust_loc = customers.copy()
cust_loc = cust_loc.drop_duplicates()

# then check if there are CustomerIDs that are associated with different countries
pd.concat(g for _, g in cust_loc.groupby('CustomerID') if len(g) > 1)

Unnamed: 0,CustomerID,Country
29852,12370,Cyprus
35444,12370,Austria
180899,12394,Belgium
392660,12394,Denmark
35415,12417,Belgium
174672,12417,Spain
64973,12422,Australia
244572,12422,Switzerland
20537,12429,Denmark
171442,12429,Austria


This is where the un-pythonic code from *read_in.ipynb* comes in. Largest country count wins the CustomerID.

In [71]:
#print(customers.Country[customers['CustomerID'] == 12370].value_counts())
# Cyprus     166
# Austria      9

# print(customers.Country[customers['CustomerID'] == 12394].value_counts())
# Belgium    20
# Denmark     6

# print(customers.Country[customers['CustomerID'] == 12417].value_counts())
# Belgium    166
# Spain       23

# print(customers.Country[customers['CustomerID'] == 12422].value_counts())
# Australia      22
# Switzerland    17

# print(customers.Country[customers['CustomerID'] == 12429].value_counts())
# Denmark    76
# Austria    20

# print(customers.Country[customers['CustomerID'] == 12431].value_counts())
# Australia    191
# Belgium       53

# print(customers.Country[customers['CustomerID'] == 12455].value_counts())
# Spain     48
# Cyprus    46

# print(customers.Country[customers['CustomerID'] == 12457].value_counts())
# Switzerland    59
# Cyprus          2

#------
# I have no idea why the temporary IDs generated above (starting with 9...) require 
# quotation marks to work.
#------

# print(customers.Country[customers['CustomerID'] == '91652'].value_counts())
# Hong Kong    29
# Germany       2

# print(customers.Country[customers['CustomerID'] == '92540'].value_counts())
# Germany    181
# Ireland     10

# print(customers.Country[customers['CustomerID'] == '92541'].value_counts())
# Germany    185
# Ireland      2

# print(customers.Country[customers['CustomerID'] == '93547'].value_counts())
# Ireland    23
# Germany     1

# print(customers.Country[customers['CustomerID'] == '95927'].value_counts())
# Ireland    65
# Germany     2

# print(customers.Country[customers['CustomerID'] == '95928'].value_counts())
# Ireland    46
# Germany     1

# print(customers.Country[customers['CustomerID'] == '96558'].value_counts())
# Germany      1
# Hong Kong    1

# print(customers.Country[customers['CustomerID'] == '97675'].value_counts())
# Hong Kong    15
# Germany       1

# print(customers.Country[customers['CustomerID'] == '99337'].value_counts())
# Germany    307
# Ireland     29

# print(customers.Country[customers['CustomerID'] == '99738'].value_counts())
# Germany      96
# Hong Kong    35


In [29]:
customers['Country'] = np.where((customers['CustomerID'] == 12370), 'Cyprus', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12394), 'Belgium', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12417), 'Belgium', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12422), 'Australia', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12429), 'Denmark', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12431), 'Australia', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == 12457), 'Switzerland', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '91652'), 'Hong Kong', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '92540'), 'Germany', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '92541'), 'Germany', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '93547'), 'Ireland', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '95927'), 'Ireland', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '95928'), 'Ireland', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '97675'), 'Hong Kong', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '99337'), 'Germany', customers['Country'])
customers['Country'] = np.where((customers['CustomerID'] == '99738'), 'Germany', customers['Country'])


There are two CustomerID that cannot be securely attributed to one country. We cannot drop them because then we would disturb the sequentiality of the imported rows. We will do that later.

Now merge the customers dataframe with the invoices.

In [30]:
extracted_col = customers['Country']
invoices.insert(7, 'Country', extracted_col)

**Invoices data frame continued**

Okay, there are duplicates in the invoices. We will get to them when we have merged products and invoices. Cannot drop them now as that would mess up the sequence of original data.

In [74]:
invoices[invoices.duplicated() == True]

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,Country,CustomerID
326,78536530,B07QZXJ14V,1,37.99,37.99,2018-12-01,13,Germany,17905
338,78536528,B004AA2Y9Q,1,47.66,47.66,2018-12-01,13,Germany,15525
339,78536528,B005KP473Q,1,25.99,25.99,2018-12-01,13,Germany,15525
341,78536528,B06XR6QX29,1,119.98,119.98,2018-12-01,13,Germany,15525
396,78536464,B00GE4MNQA,1,52.99,52.99,2018-12-01,12,Germany,17968
...,...,...,...,...,...,...,...,...,...
554085,78581538,B083G8NCSQ,1,17.98,17.98,2019-12-09,11,Germany,14446
554089,78581538,B00EO4A7L0,1,79.00,79.00,2019-12-09,11,Germany,14446
554141,78581498,B00EO4A7L0,1,79.00,79.00,2019-12-09,10,Germany,91498
554323,78581492,B01M4J5WCM,2,19.28,38.56,2019-12-09,10,Germany,91492


We know this from formatting the data for SQLite export. But here we check for duplicate InvoiceNo that differ in their date or time.

In [31]:
inv_dupl = invoices[['InvoiceNo', 'invoice_date', 'invoice_time']].copy()
inv_dupl = inv_dupl.drop_duplicates()

In [32]:
print(inv_dupl[inv_dupl['InvoiceNo'].duplicated() == True])

       InvoiceNo invoice_date  invoice_time
151777  78549245   2019-04-07            12


Found one! It belongs to an original CustomerID.

This did not delete when dropping duplicates because the invoice time is once 11 and once 12. Maybe the order went through just as the hour went from 11 to 12. No way to know without the minutes and seconds or a proper timestamp.

Because more records are associated with 11h I am changing the invoice_time of that specific InvoiceNo to 11h and overwriting 12h.

In [33]:
# This invoice number only ordered on one day so we do not need to make invoice_date a condition
invoices['invoice_time'] = np.where((invoices['InvoiceNo'] == 78549245), '11', invoices['invoice_time'])


In [34]:
invoices.loc[invoices['InvoiceNo'] == 78549245]

Unnamed: 0,InvoiceNo,ASIN,Quantity,price,total_sale,invoice_date,invoice_time,Country,CustomerID
151448,78549245,B00H2KL42Y,4,16.99,67.96,2019-04-07,11,Germany,15005
151449,78549245,B06XWMQ81P,2,10.29,20.58,2019-04-07,11,Germany,15005
151450,78549245,B07J1ZNZXF,1,175.0,175.0,2019-04-07,11,Germany,15005
151451,78549245,B0749GV5L3,6,299.0,1794.0,2019-04-07,11,Germany,15005
151452,78549245,B06XWN9Q99,2,7.49,14.98,2019-04-07,11,Germany,15005
151769,78549245,B077YNVRHN,2,15.49,30.98,2019-04-07,11,Germany,15005
151770,78549245,B01KT73IU2,4,18.49,73.96,2019-04-07,11,Germany,15005
151771,78549245,B074QM6F7X,2,72.99,145.98,2019-04-07,11,Germany,15005
151772,78549245,B01AJOX7LU,4,69.99,279.96,2019-04-07,11,Germany,15005
151773,78549245,B007T1CSFS,1,23.99,23.99,2019-04-07,11,Germany,15005


**Products data frame**

Summary:
* ASIN is a 10 character long alphanumeric string, title is a long string
* product_type is a string and a category candidate

In [35]:
products.head()

Unnamed: 0,ASIN,StockCode,title,product_type,rating,review_count
0,B07GWKDLGT,21703,Nikon D3500 W/ AF-P DX NIKKOR 18-55mm f/3.5-5....,dslr camera,4.6,1399
1,B01MTLH408,40001,"Manfrotto Element Aluminum 5-Section Monopod, ...",dslr camera,4.6,289
2,B0064FS7HI,85034A,"STK LP-E8 Battery for Canon Rebel T5i, T3i, T2...",dslr camera,4.4,333
3,B013JHU5YG,72798C,Lowepro Whistler BP 350 AW (Grey) . Profession...,dslr camera,4.3,86
4,B06XWN9Q99,20726,Samsung (MB-ME32GA/AM) 32GB 95MB/s (U1) microS...,dslr camera,4.7,15717


In [36]:
products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 6 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   ASIN          554417 non-null  object 
 1   StockCode     554417 non-null  object 
 2   title         554417 non-null  object 
 3   product_type  554417 non-null  object 
 4   rating        554417 non-null  float64
 5   review_count  554417 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 25.4+ MB


In [37]:
out = stock.astype(str).groupby('StockCode').agg(lambda x: ', '.join(x.unique()))
out = out.loc[out['ASIN'].str.len() > 10]
out#.count()

Unnamed: 0_level_0,ASIN
StockCode,Unnamed: 1_level_1
10080,"B07D9D82R7, B000L47AHG"
10133,"B074VB8VT5, B07CM7N7TG"
15058A,"B0189228CS, B07D263CHY"
15058C,"B07NQMD216, B07D263CHY"
16008,"B016NDCXKM, B000L47AHG"
...,...
90195A,"B07JCKZ3RC, B000L47AHG"
90210D,"B07L9B57G5, B000L47AHG"
DCGS0003,"B07DNV7GJL, B07PQ8S46W"
DCGS0069,"B0754DLLMK, B07PQ8S46W"


657 StockCodes share two or more ASIN.

In [38]:
# Extract all the products with a Null ASIN
null_products = products.loc[products['ASIN'] == 'Null']

# Show all products where the title is Null
null_products.loc[null_products.title == 'Null']

Unnamed: 0,ASIN,StockCode,title,product_type,rating,review_count


In [39]:
null_products.nunique()

ASIN             1
StockCode       70
title           64
product_type     5
rating          12
review_count    52
dtype: int64

In the list of products with a null ASIN all have a title. There are also no products with no or a null title.

But there are 70 different StockCodes and only 64 different titles. So some StockCodes are associated with the same title. This we also know from the previous file.

In [40]:
products['product_type'].value_counts()

smartphone     134109
keyboard       128524
mouse          101728
dslr camera     96531
processor       71551
monitor         21974
Name: product_type, dtype: int64

Let's merge the invoices and products dataframes.

In [41]:
# Drop ASIN in invoices
invoices = invoices.drop(['ASIN'], axis=1)

In [42]:
# Combine products and invoices into one dataframe
sales_db = pd.concat([products, invoices], axis=1)

<hr>

Nice! Let's look at the new and huge dataframe.

In [43]:
sales_db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554417 entries, 0 to 554416
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   ASIN          554417 non-null  object        
 1   StockCode     554417 non-null  object        
 2   title         554417 non-null  object        
 3   product_type  554417 non-null  object        
 4   rating        554417 non-null  float64       
 5   review_count  554417 non-null  int64         
 6   InvoiceNo     554417 non-null  object        
 7   Quantity      554417 non-null  int64         
 8   price         554417 non-null  float64       
 9   total_sale    554417 non-null  float64       
 10  invoice_date  554417 non-null  datetime64[ns]
 11  invoice_time  554417 non-null  object        
 12  Country       554417 non-null  object        
 13  CustomerID    554417 non-null  object        
dtypes: datetime64[ns](1), float64(3), int64(2), object(8)
memory usage: 

In [44]:
print(sales_db[sales_db.duplicated() == True])

              ASIN StockCode  \
326     B07QZXJ14V     22097   
338     B004AA2Y9Q     22911   
339     B005KP473Q     22411   
341     B06XR6QX29    84985A   
396     B00GE4MNQA     21992   
...            ...       ...   
553591  B011HMDZ0Q     21481   
553810  B07TC2BK1X     23343   
553816  B07SXNDKNM     22075   
554085  B083G8NCSQ     22068   
554089  B00EO4A7L0     22694   

                                                    title product_type  \
326     Neewer 12 SMD LED Bulb Mini Pocket-Size On-Cam...  dslr camera   
338     Polaroid Studio Series IR Light Bar â€“Recharg...  dslr camera   
339      AmazonBasics 60-Inch Lightweight Tripod with Bag  dslr camera   
341     Monopod, COMAN KX3232 73.2 inch Professional M...  dslr camera   
396     Neewer 750II TTL Flash Speedlite with LCD Disp...  dslr camera   
...                                                   ...          ...   
553591  Redragon M602 RGB Wired Gaming Mouse RGB Spect...        mouse   
553810  Raspberry Pi 4 

In [45]:
sales_db.nunique()

ASIN             4134
StockCode        3942
title            4188
product_type        6
rating             26
review_count     1375
InvoiceNo       20534
Quantity          454
price            1363
total_sale      16947
invoice_date      305
invoice_time       15
Country            37
CustomerID       6250
dtype: int64

In [46]:
# Drop the duplicates
sales_db = sales_db.drop_duplicates()

In [47]:
# Drop the two customer IDs that could not be securely attributed to a country:
# CustomerID 12455 either Spain (48) or Cyprus (46) and 
# CustomerID 96558 either Germany (1) or Hong Kong (1).

sales_db.drop(sales_db[(sales_db['CustomerID'] == 12455) | (sales_db['CustomerID'] == 96558)].index, inplace = True)


In [48]:
sales_db.dtypes

ASIN                    object
StockCode               object
title                   object
product_type            object
rating                 float64
review_count             int64
InvoiceNo               object
Quantity                 int64
price                  float64
total_sale             float64
invoice_date    datetime64[ns]
invoice_time            object
Country                 object
CustomerID              object
dtype: object

In [49]:
sales_db.nunique()

ASIN             4134
StockCode        3942
title            4188
product_type        6
rating             26
review_count     1375
InvoiceNo       20529
Quantity          454
price            1363
total_sale      16945
invoice_date      305
invoice_time       15
Country            37
CustomerID       6249
dtype: int64

Modify the datatypes

In [50]:
# combine invoice_time and invoice_date to datetime
temp = sales_db['invoice_time']
sales_db['invoice_time'] = pd.to_datetime(temp, format='%H').dt.time

sales_db['invoice_date'] = sales_db.apply(lambda r : pd.datetime.combine(r['invoice_date'],r['invoice_time']),1)

sales_db.drop('invoice_time', axis=1, inplace=True)

# Note to self: do not forget this!
# FutureWarning: The pandas.datetime class is deprecated and will be removed from 
# pandas in a future version. Import from datetime module instead.


  sales_db['invoice_date'] = sales_db.apply(lambda r : pd.datetime.combine(r['invoice_date'],r['invoice_time']),1)


In [51]:
# product_type and country to category
# and customerID to int

sales_db['product_type'] = sales_db.product_type.astype('category')
sales_db['Country'] = sales_db.Country.astype('category')

sales_db['CustomerID'] = sales_db.CustomerID.astype('int64')

Are we done yet? Nope. What about those StockCodes that had more than one ASIN? Let's see what is up with them.

In [52]:
# This might take a bit, have patience
out = sales_db.astype(str).groupby('StockCode').agg(lambda x: ', '.join(x.unique()))
out = out.loc[out['ASIN'].str.len() > 10]
out#.count()

Unnamed: 0_level_0,ASIN,title,product_type,rating,review_count,InvoiceNo,Quantity,price,total_sale,invoice_date,Country,CustomerID
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10080,"B07D9D82R7, B000L47AHG",Cooler Master MasterAccessory Large Wrist Rest...,"keyboard, dslr camera","4.5, 4.4","147, 614","78545060, 78557568, 78558614, 78558911, 785613...","2, 48, 12, 24, 1, 3, 22, 4, 26","9.99, 16.95","19.98, 479.52, 119.88, 239.76, 9.99, 29.97, 37...","2019-02-27 13:00:00, 2019-06-21 10:00:00, 2019...",Germany,"15547, 17629, 98614, 14189, 15203, 16551, 1660..."
10133,"B074VB8VT5, B07CM7N7TG",ASUS Chromebook Flip C302 2-In-1 Laptop- 12.5â...,keyboard,"4.1, 4.2","1970, 240","78536446, 78536876, 78537225, 78537155, 785371...","5, 1, 10, 3, 2, 20, 6, 4, 7, 22, 12, 40, 70, 3...","549.99, 27.41","2749.95, 549.99, 5499.9, 1649.97, 1099.98, 109...","2018-12-01 12:00:00, 2018-12-03 11:00:00, 2018...","Germany, Portugal, Switzerland, Canada, Irelan...","15983, 96876, 12748, 18118, 17259, 16519, 1527..."
15058A,"B0189228CS, B07D263CHY",Neewer Aluminum Alloy 360 Degree Rotating Swiv...,"dslr camera, mouse","4.6, 4.2","347, 83","78538200, 78545226, 78545721, 78546486, 785477...","1, 4, 2, 3, 10, 30, 6, 7","21.99, 19.99","21.99, 87.96, 43.98, 65.97, 219.9, 599.7, 131....","2018-12-10 11:00:00, 2019-03-01 09:00:00, 2019...","Germany, Finland, Sweden, France, Switzerland,...","17912, 12428, 15039, 97788, 12639, 14524, 1473..."
15058C,"B07NQMD216, B07D263CHY",Ultra-Thin 2.4G Office Wireless Mouse Mute Cha...,mouse,"4.1, 4.2","275, 83","78543126, 78545545, 78545530, 78546433, 785468...","4, 1, 10, 2, 20, 30, 3","12.99, 19.99","51.96, 12.99, 129.9, 25.98, 259.8, 599.7, 38.9...","2019-02-03 14:00:00, 2019-03-03 15:00:00, 2019...","Germany, Netherlands, France, United Kingdom","13581, 17841, 17416, 12778, 96896, 15039, 9098..."
16008,"B016NDCXKM, B000L47AHG",Adamanta 16GB (2x8GB) Apple Memory Upgrade Com...,"processor, dslr camera","4.7, 4.4","270, 614","78538073, 78538652, 78543535, 78546062, 785492...","2, 12, 20, 72, 24, 240, 1, 3, 48, 288, 4, 6, 9...","89.99, 16.95","179.98, 1079.88, 1799.8, 6479.28, 2159.76, 215...","2018-12-09 14:00:00, 2018-12-13 15:00:00, 2019...","Germany, Iceland, Israel, Norway, Switzerland,...","17816, 17890, 17537, 16221, 12347, 14298, 1747..."
...,...,...,...,...,...,...,...,...,...,...,...,...
90195A,"B07JCKZ3RC, B000L47AHG",WANSENDA USB 3.0/3.1 128GB Type C Dual High Sp...,"smartphone, dslr camera","4.1, 4.4","117, 614","78536749, 78538056, 78538206, 78540468, 785423...","1, 4, 2, 45, 6","29.99, 16.95","29.99, 119.96, 59.98, 762.75, 179.94","2018-12-02 13:00:00, 2018-12-09 13:00:00, 2018...",Germany,"17976, 17198, 17827, 90468, 13174, 95617, 9987..."
90210D,"B07L9B57G5, B000L47AHG","PAWACA [Upgrade Newest 14"" Screen Magnifier Sm...","smartphone, dslr camera","3.4, 4.4","110, 614","78538662, 78538895, 78539307, 78539223, 785590...","12, 60, 1, 28, 10","19.88, 16.95","238.56, 1192.8, 19.88, 474.6, 198.8","2018-12-13 15:00:00, 2018-12-15 09:00:00, 2018...",Germany,"15159, 14401, 13694, 99223, 99055, 99876, 1326..."
DCGS0003,"B07DNV7GJL, B07PQ8S46W","Cell Phone Stand for Desk, YOSHINE Portable Ce...","smartphone, keyboard","4.6, 3.9","93, 96","78538349, 78539451, 78542622, 78551995, 78561256","1, 7","12.96, 6.89","12.96, 48.23","2018-12-10 14:00:00, 2018-12-17 16:00:00, 2019...",Germany,"98349, 99451, 92622, 91995, 91256"
DCGS0069,"B0754DLLMK, B07PQ8S46W","Sceptre C275B-144MN 27"" Curved 144Hz Gaming LE...","monitor, keyboard",3.9,"67, 96","78542541, 78561252","1, 5","295.76, 6.89","295.76, 34.45","2019-01-28 14:00:00, 2019-07-26 11:00:00",Germany,"92541, 91252"


I guess we will just have to live with two different products sharing the same StockCode. For now. But this needs fixing in logistics.<br />
But look, there are some in that list which only have one product_type! Do we have undicovered duplicates? 

In [53]:
with pd.option_context('display.max_colwidth', None):
  display(out.title)

StockCode
10080                                                                                                                                                                                                                            Cooler Master MasterAccessory Large Wrist Rest with Low-Friction Surface, Anti-Slip Base, and Water-Resistant Coating, Professional Mini Ball Head Camera Mount
10133                                                                                                         ASUS Chromebook Flip C302 2-In-1 Laptop- 12.5â€ Full HD 4-Way NanoEdge Touchscreen, Intel Core M5, 4GB RAM, 64GB Flash Storage, All-Metal Body, Backlit Keyboard, Chrome OS- C302CA-DH54 Silver, AmazonBasics PC Programmable Gaming Mouse | Adjustable 12,000 DPI, Weight Tuning
15058A          Neewer Aluminum Alloy 360 Degree Rotating Swivel Mini Ball Head with 1/4" 3/8" Thread Base Mount for DSLR Camera Like Canon,Nikon,Sony/Camcorder/iPhone 6s/6/5S/5/4S/4,Gopro HD Hero 1/2/3/3+/4, Gaming Mous

Doesn't look very pretty but in the records with only one product_type and one StockCode there are two products just with the same product_type.


In [54]:
# Chris's cleaning function
# Removes all 'weird' symbols from the title column

products_symbols=[' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '+', ',',\
                  '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8',\
                  '9', ':', ';', '<', '>', '?', '@', 'A', 'B', 'C', 'D', 'E',\
                  'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q',\
                  'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_',\
                  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l',\
                  'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x',\
                  'y', 'z', '|', '~', '\x81', '\x90', '\x9d', '\xa0', '¡', '¥',\
                  '¦', '\xad', '°', '±', '³', '¼', 'Â', 'Ã', 'Î', 'â', 'ã', 'ï',\
                  'Œ', 'œ', 'ˆ', '˜', '—', '‘', '“', '”', '†', '‰', '‹', '€', '™']

products_clean_symbols=['\x81', '\x90', '\x9d', '\xa0', '¡', '¥',\
                  '\xad', '°', '±', '³', '¼', 'Â', 'Ã', 'Î', 'â', 'ã', 'ï',\
                  'Œ', 'œ', 'ˆ', '˜', '—', '‘', '“', '”','€','™']

#Products:
for c in products_clean_symbols:
  sales_db.title=sales_db.title.str.replace(c,'')

In [55]:
# Tableau does not like decimal separators as periods so make sure decimals export with a comma
#sales_db.to_csv('data/sales_db_tab.csv', sep='\t', decimal=",", index = False)

# Note: no need for this anymore. You can specify the comma in Tableau when you connect the
# data source

# Pandas prefers its decimals with a period.
sales_db.to_csv('data/sales_db.csv', sep='\t', index = False)


Are we done now? For now, yes, we are done.