In [678]:
import pandas as pd
from pandasql import PandaSQL
import numpy as np

In [679]:
# Load the three source tables; each file is parsed with ';' as the field separator.

man_df = pd.read_csv('../data/manufacturers.csv', sep=";")
prod_des_df = pd.read_csv('../data/product_descriptions.csv', sep=";")
prod_prop_df = pd.read_csv('../data/product_properties.csv', sep=";")

## Inspecting dataframes

In [680]:
# Preview the manufacturer table: each row pairs a manufacturer number with a manufacturer name.
man_df.head()

Unnamed: 0,Manufacturernumber,Manufacturername
0,0 601 9K8 000,BOSCH
1,71222062000,FEIN
2,0 601 6B9 000,BOSCH
3,0 601 9K6 100,BOSCH
4,0 601 482 400,BOSCH


In [681]:
# Preview the description table: each row holds the short and long description texts of a product.
prod_des_df.head()

Unnamed: 0,Articlenumber,Short description,Short description 2,Long description,Language
0,S-1124F-A,STYcutter 1124F-A – unser Einstiegsmodell mit ...,,Die STYcutter 1124F-A – unser Einstiegsmodell ...,de
1,S-1310-A,STYcutter 1310-A – Der Allrounder,,Die STYcutter 1310 – Der Allrounder\n\n\nUm d...,de
2,S-1310RW-A,STYcutter 1310RW-A – Der Allrounder,,Die STYcutter 1310RW-A – Der Allrounder\n\n\n...,de
3,S-1310SD-A,STYcutter 1310SD-A – bewährte Qualität,,Um den Anforderungen der Verarbeiter gerecht z...,de
4,S-UL55-22-K,"STYcutter UL55-22-K im Koffer – wenn mobil, da...",,"Die STYcutter UL55-22-K – wenn mobil, dann ric...",de


In [682]:
# Preview the product property table.
# Expectation (not verified here): each row is a unique product sold by a manufacturer,
# so Manufacturernumber and Articlenumber should be unique.
prod_prop_df.head()

Unnamed: 0,Manufacturernumber,Articlenumber,EAN,Technical details,Product category,Technical specifications,Picture normal reduced,Depth m,Width m,Length m,...,Delivery time days,Type of product,Price quantity,ETIM Features,ETIM,ECLASS Features,ECLASS,PROFICLASS Features,PROFICLASS,Product features
0,,S-1124F-A,,§Schnitthöhe§1130§mm|§Schnitttiefe§250§mm|§Geh...,,,'https://www.nexmart.com/media/catalog/ampshar...,0.28,0.67,1.4,...,,,1,,,,,,,
1,,S-1310-A,,§Schnitthöhe§1130§mm|§Schnitttiefe§310§mm|§Geh...,,,'https://www.nexmart.com/media/catalog/ampshar...,0.28,0.67,1.4,...,,,1,,,,,,,
2,,S-1310RW-A,,§Schnitthöhe§1130§mm|§Schnitttiefe§310§mm|§Geh...,,,'https://www.nexmart.com/media/catalog/ampshar...,0.28,0.67,1.4,...,,,1,,,,,,,
3,,S-1310SD-A,,§Schnitthöhe§1130§mm|§Schnitttiefe§310§mm|§Geh...,,,'https://www.nexmart.com/media/catalog/ampshar...,0.28,0.67,1.4,...,,,1,,,,,,,
4,,S-UL55-22-K,,§Schnitthöhe§550§mm|§Schnitttiefe§220§mm|§Gehr...,,,'https://www.nexmart.com/media/catalog/ampshar...,0.19,0.45,0.86,...,,,1,,,,,,,


In [683]:
# Column overview (dtypes and non-null counts), followed by the distinct values of 'Type of product'.
prod_prop_df.info()
prod_prop_df.loc[:, 'Type of product'].unique()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 430 entries, 0 to 429
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Manufacturernumber        285 non-null    object 
 1   Articlenumber             426 non-null    object 
 2   EAN                       342 non-null    float64
 3   Technical details         400 non-null    object 
 4   Product category          0 non-null      float64
 5   Technical specifications  0 non-null      float64
 6   Picture normal reduced    430 non-null    object 
 7   Depth m                   370 non-null    float64
 8   Width m                   372 non-null    float64
 9   Length m                  370 non-null    float64
 10  Weight kg                 425 non-null    float64
 11  Delivery time days        4 non-null      float64
 12  Type of product           218 non-null    object 
 13  Price quantity            430 non-null    int64  
 14  ETIM Featu

array([nan, 'main_product'], dtype=object)

In [684]:
# Cleaning and joining, starting with the manufacturer table.
# Keep only rows in which both the manufacturer number and the manufacturer name are present.

man_df = man_df.dropna(subset=['Manufacturernumber', 'Manufacturername'])

In [685]:
# Make the manufacturer number unique: remove exact duplicate rows first,
# then keep only the first remaining row of every Manufacturernumber.

man_df = man_df.drop_duplicates().groupby('Manufacturernumber').head(1)

In [686]:
# Clean the product description table.

# A description row without an article number cannot be joined to a product.
prod_des_df = prod_des_df[prod_des_df['Articlenumber'].notna()]

# Drop duplicates. drop_duplicates() returns a new frame, so the result has to be
# assigned back; previously it was discarded and the step had no effect.
# De-duplication is keyed on (Articlenumber, Language) instead of Articlenumber alone,
# because an article may carry one description row per language and both are kept.
prod_des_df = prod_des_df.drop_duplicates(subset=['Articlenumber', 'Language'], keep='first')

# Show every article that still occurs more than once, i.e. in several languages.
# It seems some items have descriptions in one, both, or neither language.
prod_des_df[prod_des_df['Articlenumber'].duplicated(keep=False)].sort_values(['Articlenumber', 'Language'])


Unnamed: 0,Articlenumber,Short description,Short description 2,Long description,Language
15,06012A0400,Akku-Bandsäge GCB 18V-63,,"Akku-Bandsäge GCB 18V-63, Die ergonomischste u...",de
445,06012A0400,Cordless band saw GCB 18V-63,GCB 18V-63 (C) CLC,,en
29,06012B4001,Akku-Geradschleifer GGS 18V-10 SLC,GGS 18V-10 SLC (C),"Akku-Geradschleifer GGS 18V-10 SLC, Das nächst...",de
459,06012B4001,Cordless straight grinder GGS 18V-10 SLC,GGS 18V-10 SLC (C),,en
14,0601372201,Akku-Exzenterschleifer GEX 18V-125,GEX 18V-125 (C) solo CLC,"Akku-Exzenterschleifer GEX 18V-125, Der Exzent...",de
...,...,...,...,...,...
433,S-1310SD-A,STYcutter 1310SD-A – proven quality,,,en
4,S-UL55-22-K,"STYcutter UL55-22-K im Koffer – wenn mobil, da...",,"Die STYcutter UL55-22-K – wenn mobil, dann ric...",de
434,S-UL55-22-K,"STYcutter UL55-22-K in case – when mobile, the...",,,en
5,S-UL55-22-KA,STYcutter UL55-22-KA im Koffer mit Akkus und L...,,"Die STYcutter UL55-22-KA – wenn mobil, dann ri...",de


In [687]:
# Count the English ('en') description rows.
# It seems there are 40 articles with both an 'en' and a 'de' description; both rows are kept.

prod_des_df['Language'].eq('en').sum()

np.int64(40)

In [688]:
# Flag suspiciously short texts: the mask is True only for string cells with fewer
# than 10 characters. Missing cells map to False explicitly instead of putting NaN
# into the mask, and the deprecated DataFrame.applymap is replaced by a column-wise
# Series.map.
# NOTE: the "_less5" suffix is historical; the threshold actually applied is 10.
# Indexing with a boolean frame keeps the frame's shape and sets every non-matching
# cell to NaN, so the non-null counts printed by .info() are the number of short
# values per column.
prod_des_less5 = prod_des_df[
    prod_des_df.apply(lambda col: col.map(lambda x: isinstance(x, str) and len(x) < 10))
]
prod_des_less5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 470 entries, 0 to 469
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Articlenumber        137 non-null    object
 1   Short description    25 non-null     object
 2   Short description 2  0 non-null      object
 3   Long description     0 non-null      object
 4   Language             470 non-null    object
dtypes: object(5)
memory usage: 18.5+ KB


  prod_des_less5 = prod_des_df[prod_des_df.applymap(lambda x: len(x)<10 if pd.notna(x) else x)]


In [689]:
# Rows that have a short value in 'Short description'.
# These descriptions do not look very informative, but the products can still be included.
prod_des_less5[prod_des_less5['Short description'].notna()]

Unnamed: 0,Articlenumber,Short description,Short description 2,Long description,Language
6,,EL33 Zero,,,de
137,PTX002,Oktagon,,,de
175,21933,Easy HD,,,de
358,2640.778,BXT3-13,,,de
359,2640.558,BXT3-16,,,de
360,2640.668,BXT3-19,,,de
361,2640.828,BXT3-32,,,de
362,1179.228,BPT-H32,,,de
363,1179.568,BPT-L19,,,de
364,1179.868,BST,,,de


In [690]:
# Turn every description column into a missing-value indicator (1 = missing, 0 = present).
# The identifying columns Articlenumber and Language are kept as they are and placed first.
# NOTE: prod_des_df is overwritten, so re-running this cell on the indicator frame
# would yield zeros only.
prod_des_df = pd.concat(
    [
        prod_des_df[['Articlenumber', 'Language']],
        prod_des_df.drop(columns=['Articlenumber', 'Language']).isnull().astype(int),
    ],
    axis="columns",
)

prod_des_df.head()

Unnamed: 0,Articlenumber,Language,Short description,Short description 2,Long description
0,S-1124F-A,de,0,1,0
1,S-1310-A,de,0,1,0
2,S-1310RW-A,de,0,1,0
3,S-1310SD-A,de,0,1,0
4,S-UL55-22-K,de,0,1,0


In [691]:
# Clean the product property table.

# Keep only rows that have both a manufacturer number and an article number.
prod_prop_df = prod_prop_df.dropna(subset=['Manufacturernumber', 'Articlenumber'])

# Restrict the frame to the columns needed for the analysis.
# 'Type of product' is left out: it only takes two values (main product or NA),
# which is not very informative for a buyer.
prod_prop_df = prod_prop_df[[
    'Manufacturernumber', 'Articlenumber',
    'Technical details', 'Picture normal reduced',
    'Depth m', 'Width m', 'Length m', 'Weight kg',
    'Delivery time days', 'Price quantity',
]]



In [692]:
# Check the text columns for suspiciously short values: the mask is True only for
# string cells with fewer than 10 characters (the threshold is 10, not 5 as the
# "_less5" name suggests). Missing cells map to False explicitly, and the deprecated
# DataFrame.applymap is replaced by a column-wise Series.map.
# Indexing with the boolean frame sets every non-matching cell to NaN, so the
# non-null counts printed by .info() are the number of short values per column.

prod_prop_str = prod_prop_df.select_dtypes(include=["object", "string"])
prod_prop_less5 = prod_prop_str[
    prod_prop_str.apply(lambda col: col.map(lambda x: isinstance(x, str) and len(x) < 10))
]
prod_prop_less5.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281 entries, 7 to 331
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Manufacturernumber      31 non-null     object
 1   Articlenumber           31 non-null     object
 2   Technical details       0 non-null      object
 3   Picture normal reduced  4 non-null      object
dtypes: object(4)
memory usage: 11.0+ KB


  prod_prop_less5 = prod_prop_str[prod_prop_str.applymap(lambda x: len(x)<10 if pd.notna(x) else x)]


In [693]:
# Inspect the rows that have a short value in the picture column.

prod_prop_less5[prod_prop_less5['Picture normal reduced'].notna()]

Unnamed: 0,Manufacturernumber,Articlenumber,Technical details,Picture normal reduced
308,,,,'
321,,,,'
325,,,,'
327,,,,'


In [694]:
# The short picture values are a lone apostrophe (') instead of a link,
# so they are excluded from the analysis by marking them as missing.
prod_prop_df.loc[
    prod_prop_less5['Picture normal reduced'].dropna().index,
    'Picture normal reduced',
] = np.nan

In [695]:
# Turn every property column into a missing-value indicator (1 = missing, 0 = present).
# The key columns Manufacturernumber and Articlenumber are kept as they are and placed first.
# NOTE: prod_prop_df is overwritten, so re-running this cell on the indicator frame
# would yield zeros only.
prod_prop_df = pd.concat(
    [
        prod_prop_df[['Manufacturernumber', 'Articlenumber']],
        prod_prop_df.drop(columns=['Manufacturernumber', 'Articlenumber']).isnull().astype(int),
    ],
    axis="columns",
)

prod_prop_df.head()

Unnamed: 0,Manufacturernumber,Articlenumber,Technical details,Picture normal reduced,Depth m,Width m,Length m,Weight kg,Delivery time days,Price quantity
7,0 601 6B4 000,06016B4000,0,0,0,0,0,0,1,0
8,0 601 9J4 002,06019J4002,0,0,0,0,0,0,1,0
9,0 601 9H6 000,06019H6000,0,0,0,0,0,0,1,0
10,0 601 9H6 100,06019H6100,0,0,0,0,0,0,1,0
11,0 601 9H6 300,06019H6300,0,0,0,0,0,0,1,0


In [696]:
# Combine the three cleaned tables into a single frame.
# Both joins are left joins, so every product property row is retained:
# descriptions are matched on Articlenumber, manufacturer names on Manufacturernumber.
# NOTE(review): a product without any description row gets NaN (not 1) in the
# description columns after the left join; confirm that this cannot occur in the data.

prop_des_merge = prod_prop_df.merge(prod_des_df, how="left", on="Articlenumber")
merged_df = prop_des_merge.merge(man_df, how="left", on='Manufacturernumber')

# Put the columns in reading order: identifiers, description columns, property columns.

merged_df = merged_df[[
    'Manufacturername', 'Manufacturernumber', 'Articlenumber', 'Language',
    'Short description', 'Short description 2', 'Long description',
    'Picture normal reduced', 'Width m', 'Length m', 'Depth m', 'Weight kg',
    'Technical details', 'Price quantity', 'Delivery time days',
]]

## SQL Analysis

1. Which manufacturers have the biggest improvement potential in their data quality in absolute and relative numbers?
- For each manufacturer: how many products have bad data quality (absolute), and what percentage of all their products is that (relative)?

2. What product variable/column (description or property) usually contains data of good quality per manufacturer? And what is the % of good quality records per variable/column and manufacturer?
- Which single variable has good quality when normalized across all manufacturers?
- For each manufacturer, what percentage of the records of each variable is of good quality?

3. Which other interesting insights did you find? Please give us a short explanation together with the SQL statement used to retrieve it.



### Biggest improvement potential

In [699]:
# Set up the SQL engine that runs queries against the notebook's DataFrames.
pandasql_instance = PandaSQL()

# First query: select the complete merged table.
query = "SELECT * FROM merged_df"

# Pass the current namespace so the table name merged_df resolves to the DataFrame.
result_df = pandasql_instance(query, env=locals())

result_df


Unnamed: 0,Manufacturername,Manufacturernumber,Articlenumber,Language,Short description,Short description 2,Long description,Picture normal reduced,Width m,Length m,Depth m,Weight kg,Technical details,Price quantity,Delivery time days
0,BOSCH,0 601 6B4 000,06016B4000,de,0,0,0,0,0,0,0,0,0,0,1
1,BOSCH,0 601 6B4 000,06016B4000,en,0,1,1,0,0,0,0,0,0,0,1
2,BOSCH,0 601 9J4 002,06019J4002,de,0,0,0,0,0,0,0,0,0,0,1
3,BOSCH,0 601 9J4 002,06019J4002,en,1,0,1,0,0,0,0,0,0,0,1
4,BOSCH,0 601 9H6 000,06019H6000,de,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
303,GUSTAV KLAUKE GMBH,LS100FLEXCFB,LS100FLEXCFB,de,0,1,0,1,1,1,1,0,0,0,1
304,GUSTAV KLAUKE GMBH,LS50FLEXCFB,LS50FLEXCFB,de,0,1,0,0,1,1,1,0,0,0,1
305,BOSCH,RALB1EU,RALB1EU,de,0,1,0,0,1,1,1,0,0,0,1
306,BOSCH,RALB2EU,RALB2EU,de,0,1,0,0,1,1,1,0,0,0,1
