In [None]:
import pandas as pd
import re

## First ETL

### Function to clean strings:

In [None]:
def string_cleaner(dataframe,col):
    
    """
    Normalise one text column of ``dataframe`` in place and return the frame.

    Steps, in order:
      1. lower-case the text;
      2. replace every character in the ASCII range '&'..'/'
         (``& ' ( ) * + , - . /``) with a single space;
      3. collapse any run of two or more spaces into one space;
      4. strip leading and trailing whitespace.

    Missing values stay missing.

    Inputs:

    dataframe : A pandas dataframe (its ``col`` column is overwritten)
    col : A column name as string

    Returns:

    The same ``dataframe`` object that was passed in.
    """
    # Work on the frame that was passed in, never on a notebook-level global.
    cleaned = dataframe[col].str.lower()
    # An earlier version chained Series.replace('[^a-zA-Z0-9]', '') here. Without
    # regex=True that is an exact-value match which can never hit a lower-cased
    # string, so it did nothing and has been dropped.
    # regex=True is spelled out because the default of Series.str.replace
    # differs between pandas versions.
    cleaned = cleaned.str.replace('[&-/]', ' ', regex=True)
    # ' {2,}' collapses whole runs; replacing the literal '  ' once would leave
    # a double space behind for inputs such as "a - b".
    cleaned = cleaned.str.replace(' {2,}', ' ', regex=True)
    dataframe[col] = cleaned.str.strip()
    return dataframe

In [None]:
# Load the combined CSV. Quantity and Price are read as strings so they can be
# cleaned as text before being converted to numbers further down.
df = pd.read_csv(
    '../data/merged_database/combined.csv',
    index_col=0,
    dtype={'Quantity': str, 'Price': str},
)

In [None]:
# Size of the frame as loaded (row count, per-column non-null counts, dtypes),
# before any cleaning is applied.
df.info()

In [None]:
# Clean the two free-text columns. A loop avoids repeating the call, and the
# cell no longer ends on an expression that would render the whole DataFrame.
for text_col in ('Manufacturer', 'Partnumber'):
    df = string_cleaner(df, text_col)

### Transforming the price col:

In [None]:
# Swap commas for dots in the price text, then parse it as a number.
# Anything that still cannot be parsed becomes NaN; values are rounded to 2 decimals.
price_text = df['Price'].astype(str).map(lambda raw: raw.replace(',', '.'))
df['Price'] = pd.to_numeric(price_text, errors='coerce').round(2)

### Dicts to alter the quantity col:

In [None]:
# Text codes used by the 'compel' supplier and the quantity text each one is replaced with.
compel = {
    "Indisponível": '0',
    'A': '3',
    'B': '15',
    'C': '20',
}

In [None]:
# Text codes used by the 'real' supplier and the quantity text each one is replaced with.
real = {
    'A': '4',
    'B': '11',
}

In [None]:
# Translate the compel quantity codes. The replacement is limited to the
# Quantity column: running a regex replace over every column of these rows
# would also rewrite any 'A'/'B'/'C' found in unrelated text columns.
compel_rows = df.supplier == 'compel'
df.loc[compel_rows, 'Quantity'] = df.loc[compel_rows, 'Quantity'].replace(compel, regex=True)

In [None]:
# Translate the real quantity codes using the `real` mapping (this cell used to
# apply the `compel` mapping by mistake). The replacement is limited to the
# Quantity column so that other text columns of these rows are left untouched.
real_rows = df.supplier == 'real'
df.loc[real_rows, 'Quantity'] = df.loc[real_rows, 'Quantity'].replace(real, regex=True)

### Modifying the Quantity col:

In [None]:
# Remove every '>' character from the quantity text, then trim surrounding whitespace.
df['Quantity'] = df['Quantity'].str.replace('>', '').str.strip()

In [None]:
# Convert the quantity text to numbers; values that cannot be parsed become NaN.
df['Quantity'] = pd.to_numeric(df.Quantity, errors='coerce')

In [None]:
# Build the key column by concatenating manufacturer and part number.
df['key'] = df['Manufacturer'] + df['Partnumber']

In [None]:
# Build the pointer column by concatenating supplier, manufacturer and part number.
df['pointer'] = df['supplier'] + df['Manufacturer'] + df['Partnumber']

In [None]:
# Size of the frame after the key and pointer columns have been added
# (row count, per-column non-null counts, dtypes).
df.info()

In [None]:
# Force every part number to be a string.
df['Partnumber'] = df['Partnumber'].astype(str)

In [None]:
# Remove literal dots from the part number. regex=False is explicit: read as a
# regular expression "." matches every character, and the default of
# Series.str.replace differs between pandas versions.
df.Partnumber = df.Partnumber.str.replace(".", "", regex=False)

In [None]:
# Trim surrounding whitespace from the part number, then delete every remaining space.
df['Partnumber'] = df['Partnumber'].str.strip().str.replace(" ", "")

In [None]:
# Force every manufacturer name to be a string.
df['Manufacturer'] = df['Manufacturer'].astype(str)

In [None]:
# Remove literal dots from the manufacturer name. regex=False is explicit: read
# as a regular expression "." matches every character, and the default of
# Series.str.replace differs between pandas versions.
df.Manufacturer = df.Manufacturer.str.replace(".", "", regex=False)

In [None]:
# Trim surrounding whitespace from the manufacturer name, then delete every remaining space.
df['Manufacturer'] = df['Manufacturer'].str.strip().str.replace(" ", "")

In [None]:
# Use 0 as the placeholder for missing data in every column: empty strings,
# the literal text "nan" and real NaN values are all turned into 0.
df = df.replace("", 0).replace("nan", 0).fillna(0)

In [None]:
# Drop rows whose manufacturer is the 0 placeholder (i.e. was empty).
has_manufacturer = df['Manufacturer'].ne(0)
df = df[has_manufacturer]

In [None]:
# Write the current frame to the first-ETL CSV (the index is written too, since index=False is not passed).
df.to_csv('../data/etl/first_etl.csv')

## Second ETL (for each step I show the size of the modified dataframe)

In [None]:
# Drop parts whose quantity is exactly 0.
nonzero_quantity = df['Quantity'].ne(0)
df = df[nonzero_quantity]

In [None]:
# Size of the frame after dropping zero-quantity parts.
df.info()

In [None]:
# Number of remaining rows per supplier after dropping zero-quantity parts.
df.supplier.value_counts()

In [None]:
# Drop parts whose quantity is missing (NaN).
df = df[df['Quantity'].notna()]

In [None]:
# Size of the frame after dropping parts with a missing quantity.
df.info()

In [None]:
# Number of remaining rows per supplier after dropping parts with a missing quantity.
df.supplier.value_counts()

In [None]:
# Keep only parts with a strictly positive quantity (this removes negative quantities).
positive_quantity = df['Quantity'].gt(0)
df = df[positive_quantity]

In [None]:
# Size of the frame after keeping only positive quantities.
df.info()

In [None]:
# Drop parts whose price is exactly 0.
nonzero_price = df['Price'].ne(0.)
df = df[nonzero_price]

In [None]:
# Size of the frame after dropping zero-price parts.
df.info()

In [None]:
# Drop parts whose price is missing (NaN).
df = df[df['Price'].notna()]

In [None]:
# Size of the frame after dropping parts with a missing price.
df.info()

In [None]:
# Drop parts whose manufacturer is missing (NaN).
df = df[df['Manufacturer'].notna()]

In [None]:
# Size of the frame after dropping parts with a missing manufacturer.
df.info()

In [None]:
# Drop parts whose part number is blank. The first ETL replaced empty / "nan" /
# NaN cells with the placeholder 0, so a NaN test alone can never match; the
# placeholder is excluded as well.
has_partnumber = df['Partnumber'].notna() & df['Partnumber'].ne(0)
df = df[has_partnumber]

In [None]:
# Size of the frame after the blank-part-number filter.
df.info()

In [None]:
# Read the manufacturer synonyms workbook. The cell below uses its
# `name_to_be_replaced` column and its second column (the replacement value).
syn = pd.read_excel("../data/synonyms/manufacturers_synonyms.xlsx")

In [None]:
# For each synonym entry, replace cells equal to `name_to_be_replaced` with the
# value from the second column of the synonyms sheet, one entry after another.
# NOTE(review): DataFrame.replace is applied to every column, not only Manufacturer.
for old_name, new_name in zip(syn.name_to_be_replaced, syn.iloc[:, 1]):
    df = df.replace(old_name, new_name)

In [None]:
# Write the filtered, synonym-normalised frame to the second-ETL CSV (index included).
df.to_csv('../data/etl/second_etl.csv')