# Challenge 2: Exploratory Data Analysis (EDA)

In [50]:
# Data manipulation
import pandas as pd # data manipulation and dataframes
import numpy as np # arrays manipulation and mathematical operations

# Fuzzy string matching
from fuzzywuzzy import fuzz

In [51]:
# Pandas display configuration: None lifts the limit, so every column and the
# full content of every cell are shown
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [52]:
# Loads both datasets, using the first CSV column of each file as the index
initial_csv_path = '../data/datamarket-dia.csv'
scrapped_csv_path = '../data/scrapeo(2023-07-10 14-50-32).csv'

initial_csv = pd.read_csv(initial_csv_path, index_col=0)
scrapped_csv = pd.read_csv(scrapped_csv_path, index_col=0)

# Index: the first CSV column was loaded as the index (named by url); move it
# back into a regular column and fall back to a default integer index
initial_csv.reset_index(drop=False, inplace=True)

# Nulls (inspection only: the result is not stored)
initial_csv.isnull().sum()  # 3053473 description, 5480 reference_unit

# Drops
# description: all values are null; product_id: not used information
initial_csv.drop(columns=['description', 'product_id'], inplace=True)
# subcategory: not used information
scrapped_csv.drop(columns=['subcategory'], inplace=True)

# Checks duplicated values (inspection only: the results are not stored)
initial_csv['name'].duplicated().sum()  # 3041311
scrapped_csv['name'].duplicated().sum()  # 119

# Gets YYYY-MM-DD format: keeps only the text before the first space.
# Splitting once (n=1) and taking the first piece avoids expanding every
# token into its own column of a temporary DataFrame.
initial_csv['insert_date'] = initial_csv['insert_date'].str.split(' ', n=1).str[0]

# Matches reference_unit column content between both dfs:
# each unit name found in the scraped data is replaced using the table below
unit_abbreviations = {
    'kilo': 'kg',
    'unidad': 'ud',
    'litro': 'l',
    'metro': 'm',
    'lavado': 'lavado',  # kept unchanged
}

units = scrapped_csv['reference_unit'].tolist()

# Any value without an entry in the table (missing values included) becomes NaN
new_units = [unit_abbreviations.get(u, np.nan) for u in units]

scrapped_csv['reference_unit'] = new_units

# Adjusts column dtypes
# insert_date to datetime, in both dataframes
for frame in (initial_csv, scrapped_csv):
    frame['insert_date'] = pd.to_datetime(frame['insert_date'])

# Corrects a supermarket labelling error before the numeric conversion.
# NOTE(review): the row is addressed by its position (2698), which is tied to
# this particular scrape file - confirm if the input file changes
price_position = scrapped_csv.columns.get_loc('reference_price')
scrapped_csv.iloc[2698, price_position] = '13.82'
scrapped_csv['reference_price'] = pd.to_numeric(scrapped_csv['reference_price'])  # to float

# No url info for scraped csv
scrapped_csv['url'] = np.nan
