In [1]:
import pandas as pd
import re

## First ETL

### Function to clean strings:

In [2]:
def string_cleaner(dataframe,col):
    
    """
    Remove all special characters, remove '&' and '-', remove double spaces and end and start string spaces.
    Return the cleaned Dataframe
    
    Inputs:
    
    dataframe : A pandas dataframe
    col : A collumn name as string
    
    """
    df[col] = df[col].str.lower().replace('[^a-zA-Z0-9]', '')
    df[col] = df[col].str.replace('[&-/]', ' ')
    df[col] = df[col].str.replace('  ', ' ')
    df[col] = df[col].str.strip()
    return df

In [3]:
#Original DF
df = pd.read_csv('../data/merged_database/combined.csv' , index_col=0, dtype={'Quantity': str, 'Price': str})

In [4]:
#Original size:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137406 entries, 0 to 137405
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   Manufacturer  137273 non-null  object
 1   Partnumber    136270 non-null  object
 2   Quantity      132860 non-null  object
 3   Price         135739 non-null  object
 4   supplier      137406 non-null  object
dtypes: object(5)
memory usage: 6.3+ MB


In [5]:
# Cleaning the two string cols:
string_cleaner(df,'Manufacturer')
string_cleaner(df,'Partnumber')

  df[col] = df[col].str.replace('[&-/]', ' ')


Unnamed: 0,Manufacturer,Partnumber,Quantity,Price,supplier
0,frisart,40004,1,27.34,metal
1,sanfil,5045,1,52.58,metal
2,branil,bcht004a,1,12.3,metal
3,branil,bcho108,1,22.09,metal
4,branil,bchv101rv,1,53.15,metal
...,...,...,...,...,...
137401,3 rho,7735,3,155.2132,sueyasu
137402,3 rho,7737,5,146.234,sueyasu
137403,3 rho,7744,2,153.9712,sueyasu
137404,3 rho,7748,1,188.93120000000002,sueyasu


### Transforming the price col:

In [6]:
df['Price'] = df['Price'].astype(str)
df['Price'] = df['Price'].apply(lambda x: x.replace(',','.'))
df['Price'] = pd.to_numeric(df['Price'],errors='coerce').round(2)

### Dics to alter the quantity col:

In [7]:
compel = {"Indisponível" : 0,
'A': 3,
'B' : 15,
 'C': 20}

In [8]:
real = {'A' : 4,
'B' : 11}

In [9]:
# Changing compel
df[df.supplier == 'compel'] = df[df.supplier == 'compel'].replace(compel, regex=True)

In [10]:
# Changing real
df[df.supplier == 'real'] = df[df.supplier == 'real'].replace(compel, regex=True)

### Modifying the Quantity col:

In [11]:
# Replace '>' and white spaces
df.Quantity = df.Quantity.str.replace('>', '')
df.Quantity = df.Quantity.str.strip()

In [12]:
#transforming to numeric
df.Quantity =pd.to_numeric(df['Quantity'],errors='coerce')

In [13]:
# Creating the key col:
df['key'] = df.Manufacturer+df.Partnumber

In [14]:
#Creating the pointer col:
df['pointer'] = df.supplier+df.Manufacturer+df.Partnumber

In [15]:
# Size of the first etl dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 137406 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Manufacturer  137273 non-null  object 
 1   Partnumber    136270 non-null  object 
 2   Quantity      99980 non-null   float64
 3   Price         135554 non-null  float64
 4   supplier      137406 non-null  object 
 5   key           136167 non-null  object 
 6   pointer       136167 non-null  object 
dtypes: float64(2), object(5)
memory usage: 8.4+ MB


In [16]:
df.to_csv('../data/etl/first_etl.csv')

## Second ETL (for each step i show the size of the modifield dataframe)

In [17]:
#parts having a quantity equal to 0

df = df[df['Quantity'] !=0]

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 133378 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Manufacturer  133245 non-null  object 
 1   Partnumber    132544 non-null  object 
 2   Quantity      95952 non-null   float64
 3   Price         131841 non-null  float64
 4   supplier      133378 non-null  object 
 5   key           132441 non-null  object 
 6   pointer       132441 non-null  object 
dtypes: float64(2), object(5)
memory usage: 8.1+ MB


In [19]:
#parts having the quantity blank

df = df[df['Quantity'].isna() == False]

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95952 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  95915 non-null  object 
 1   Partnumber    95127 non-null  object 
 2   Quantity      95952 non-null  float64
 3   Price         94613 non-null  float64
 4   supplier      95952 non-null  object 
 5   key           95120 non-null  object 
 6   pointer       95120 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.9+ MB


In [21]:
#parts having a negative quantity

df = df[df['Quantity'] > 0]

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95952 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  95915 non-null  object 
 1   Partnumber    95127 non-null  object 
 2   Quantity      95952 non-null  float64
 3   Price         94613 non-null  float64
 4   supplier      95952 non-null  object 
 5   key           95120 non-null  object 
 6   pointer       95120 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.9+ MB


In [23]:
#parts having a price equal to 0

df = df[df['Price'] != 0.]

In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95117 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  95087 non-null  object 
 1   Partnumber    94301 non-null  object 
 2   Quantity      95117 non-null  float64
 3   Price         93778 non-null  float64
 4   supplier      95117 non-null  object 
 5   key           94294 non-null  object 
 6   pointer       94294 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.8+ MB


In [25]:
#parts having the price blank
df = df[df['Price'].isna() == False]

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93778 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  93748 non-null  object 
 1   Partnumber    92962 non-null  object 
 2   Quantity      93778 non-null  float64
 3   Price         93778 non-null  float64
 4   supplier      93778 non-null  object 
 5   key           92955 non-null  object 
 6   pointer       92955 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.7+ MB


In [27]:
#parts having the manufacturer blank
df = df[df['Manufacturer'].isna() == False]

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 93748 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  93748 non-null  object 
 1   Partnumber    92955 non-null  object 
 2   Quantity      93748 non-null  float64
 3   Price         93748 non-null  float64
 4   supplier      93748 non-null  object 
 5   key           92955 non-null  object 
 6   pointer       92955 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.7+ MB


In [29]:
# parts have the partnumber blank
df = df[df['Partnumber'].isna() == False]

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92955 entries, 0 to 137405
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Manufacturer  92955 non-null  object 
 1   Partnumber    92955 non-null  object 
 2   Quantity      92955 non-null  float64
 3   Price         92955 non-null  float64
 4   supplier      92955 non-null  object 
 5   key           92955 non-null  object 
 6   pointer       92955 non-null  object 
dtypes: float64(2), object(5)
memory usage: 5.7+ MB


In [32]:
df.to_csv('../data/etl/second_etl.csv')