# Olist Brazilian E-Commerce - Data Consistency and Wrangling Procedures

## This script contains the following:

### 1. Import Libraries
### 2. olist_orders_dataset
### 3. olist_order_items_dataset
### 4. olist_products_dataset
### 5. product_category_name_translation
### 6. olist_order_payments_dataset
### 7. olist_sellers_dataset
### 8. olist_customers_dataset
### 9. olist_geolocation_dataset
### 10. olist_order_reviews_dataset

# 1. Import Libraries

In [1]:
# Import libraries 
import pandas as pd
import numpy as np
import os

# 2. olist_orders_dataset

In [7]:
# define a path

path = r'C:\Users\junio\OneDrive\Career Foundry Project\Achievement 6\02-2024 Brazilian E-Commerce'

## 2.1. Import Data Set

In [71]:
# Import olist_orders_data_set

df_ords = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_orders_dataset.csv'), index_col = False)

In [None]:
# Check head

df_ords.head(10)

## 2.2. Dropping Columns

In [72]:
# Dropping 'order_approved_at' column from 'olist_orders_data_set', since this column will not be relevant for the analysis.

df_ords = df_ords.drop(columns = ['order_approved_at'])

In [None]:
# Dropping 'order_approved_at' column from 'olist_orders_data_set', since this column will not be relevant for the analysis.

df_ords = df_ords.drop(columns = ['order_approved_at'])

## 2.3. Consistency Checks

In [None]:
# Check head

df_ords.head(10)

In [34]:
# Check shape

df_ords.shape

(99441, 8)

In [None]:
# Check info

df_ords.info()

## 2.4. Changing Data Types

In [23]:
# Select the timestamp columns to convert to datetime

timestamp_cols = ['order_purchase_timestamp', 'order_approved_at', 'order_delivered_carrier_date', 'order_delivered_customer_date', 'order_estimated_delivery_date']

In [24]:
# Change data type to datetime for the selected columns

df_ords[timestamp_cols] = df_ords[timestamp_cols].apply(pd.to_datetime)

In [None]:
# Check data type

df_ords.dtypes

In [None]:
# Check data type

df_ords.describe()

## 2.5. Looking for missing values

In [46]:
 # DataFrame with True where missing values are present
    
missing_values_ords = df_ords.isnull() 

# Count of missing values in each column

missing_counts_ords = df_ords.isnull().sum()  


In [None]:
missing_counts_ords

## 2.6. Dropping  missing values

In [90]:
# Drop missing values, the missing values account for no more than 5% of the data, thus, I decide to drop them

df_ords_cleaned = df_ords.dropna(subset=['order_delivered_carrier_date', 'order_delivered_customer_date'])


In [93]:
missing_values_ords2 = df_ords_cleaned.isnull()

# Count of missing values in each column

missing_counts_ords2 = df_ords_cleaned.isnull().sum()  


In [None]:
# check missing values again

missing_counts_ords2

In [97]:
# Check the sahpe
df_ords_cleaned.shape

(96475, 7)

## 2.7. Looking for Duplicates

In [98]:
# Series with True where rows are duplicates

duplicate_rows_ords = df_ords_cleaned.duplicated() 

# Count of duplicate rows
duplicate_count_ords = df_ords_cleaned.duplicated().sum()  

In [99]:
duplicate_count_ords

0

## 2.8. Export Wrangled 'olist_orders_dataset'

In [100]:
# Exporting data set

df_ords_cleaned.to_csv(os.path.join(path, '02 Data','Prepared Data', 'olist_orders_dataset_wrangled.csv'))


# 3. olist_order_items_dataset


## 3.1 Import Data Set

In [107]:
# Importing 'olist_order_items_dataset'

df_ords_items = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_order_items_dataset.csv'), index_col = False)

In [125]:
# Check head

df_ords_items.head(10)

Unnamed: 0,order_id,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value
0,00010242fe8c5a6d1ba2dd792cb16214,1,4244733e06e7ecb4970a6e2683c13e61,48436dade18ac8b2bce089ec2a041202,2017-09-19 09:45:35,58.9,13.29
1,00018f77f2f0320c557190d7a144bdd3,1,e5f2d52b802189ee658865ca93d83a8f,dd7ddc04e1b6c2c614352b383efe2d36,2017-05-03 11:05:13,239.9,19.93
2,000229ec398224ef6ca0657da4fc703e,1,c777355d18b72b67abbeef9df44fd0fd,5b51032eddd242adc84c38acab88f23d,2018-01-18 14:48:30,199.0,17.87
3,00024acbcdf0a6daa1e931b038114c75,1,7634da152a4610f1595efa32f14722fc,9d7a1d34a5052409006425275ba1c2b4,2018-08-15 10:10:18,12.99,12.79
4,00042b26cf59d7ce69dfabb4e55b4fd9,1,ac6c3623068f30de03045865e4e10089,df560393f3a51e74553ab94004ba5c87,2017-02-13 13:57:51,199.9,18.14
5,00048cc3ae777c65dbb7d2a0634bc1ea,1,ef92defde845ab8450f9d70c526ef70f,6426d21aca402a131fc0a5d0960a3c90,2017-05-23 03:55:27,21.9,12.69
6,00054e8431b9d7675808bcb819fb4a32,1,8d4f2bb7e93e6710a28f34fa83ee7d28,7040e82f899a04d1b434b795a43b4617,2017-12-14 12:10:31,19.9,11.85
7,000576fe39319847cbb9d288c5617fa6,1,557d850972a7d6f792fd18ae1400d9b6,5996cddab893a4652a15592fb58ab8db,2018-07-10 12:30:45,810.0,70.75
8,0005a1a1728c9d785b8e2b08b904576c,1,310ae3c140ff94b03219ad0adc3c778f,a416b6a846a11724393025641d4edd5e,2018-03-26 18:31:29,145.95,11.65
9,0005f50442cb953dcd1d21e1fb923495,1,4535b0e1091c278dfd193e5a1d63b39f,ba143b05f0110f0dc71ad71b4466ce92,2018-07-06 14:10:56,53.99,11.4


In [3]:
# Set display options to show all rows and columns

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

## 3.2. Consistency Checks

In [114]:
# Check shape

df_ords_items.shape

(112650, 7)

In [126]:
df_ords_items.describe()

Unnamed: 0,order_item_id,shipping_limit_date,price,freight_value
count,112650.0,112650,112650.0,112650.0
mean,1.197834,2018-01-07 15:36:52.192685312,120.653739,19.99032
min,1.0,2016-09-19 00:15:34,0.85,0.0
25%,1.0,2017-09-20 20:57:27.500000,39.9,13.08
50%,1.0,2018-01-26 13:59:35,74.99,16.26
75%,1.0,2018-05-10 14:34:00.750000128,134.9,21.15
max,21.0,2020-04-09 22:35:08,6735.0,409.68
std,0.705124,,183.633928,15.806405


In [122]:
# Check info

df_ords_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  object        
 1   order_item_id        112650 non-null  int64         
 2   product_id           112650 non-null  object        
 3   seller_id            112650 non-null  object        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 6.0+ MB


## 3.3. Changing Data Types

In [117]:
# change 'shipping_limit_date' into a datetime data-type

df_ords_items['shipping_limit_date'] = df_ords_items['shipping_limit_date'].astype('datetime64[ns]')


## 3.4. Looking for missing values

In [118]:
# dataframe where missing values are present

missing_values_ords_items = df_ords_items.isnull()

# counts of missing values

missing_counts_ords_items = df_ords_items.isnull() .sum()

In [119]:
missing_counts_ords_items

order_id               0
order_item_id          0
product_id             0
seller_id              0
shipping_limit_date    0
price                  0
freight_value          0
dtype: int64

## 3.5. Looking for Duplicates

In [120]:
# create a df where rows are duplicates

duplicate_ords_items = df_ords_items.duplicated() 

# Count of duplicate rows
duplicate_counts_ords_items = df_ords_items.duplicated().sum()  

In [121]:
# No duplicates found

duplicate_counts_ords_items

0

## 3.6. Export Wrangled 'olist_order_items_dataset'

In [124]:
# Exporting cleaned version of data set

df_ords_items.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'olist_order_items_wrangled.csv'))


# 4. olist_products_dataset

## 4.1. Import data set

In [9]:
# Importing products data set

df_products = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_products_dataset.csv'), index_col = False)

## 4.2. Consistency Checks

In [11]:
# Check head

df_products.head(10)

Unnamed: 0,product_id,product_category_name,product_name_lenght,product_description_lenght,product_photos_qty,product_weight_g,product_length_cm,product_height_cm,product_width_cm
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,40.0,287.0,1.0,225.0,16.0,10.0,14.0
1,3aa071139cb16b67ca9e5dea641aaa2f,artes,44.0,276.0,1.0,1000.0,30.0,18.0,20.0
2,96bd76ec8810374ed1b65e291975717f,esporte_lazer,46.0,250.0,1.0,154.0,18.0,9.0,15.0
3,cef67bcfe19066a932b7673e239eb23d,bebes,27.0,261.0,1.0,371.0,26.0,4.0,26.0
4,9dc1a7de274444849c219cff195d0b71,utilidades_domesticas,37.0,402.0,4.0,625.0,20.0,17.0,13.0
5,41d3672d4792049fa1779bb35283ed13,instrumentos_musicais,60.0,745.0,1.0,200.0,38.0,5.0,11.0
6,732bd381ad09e530fe0a5f457d81becb,cool_stuff,56.0,1272.0,4.0,18350.0,70.0,24.0,44.0
7,2548af3e6e77a690cf3eb6368e9ab61e,moveis_decoracao,56.0,184.0,2.0,900.0,40.0,8.0,40.0
8,37cc742be07708b53a98702e77a21a02,eletrodomesticos,57.0,163.0,1.0,400.0,27.0,13.0,17.0
9,8c92109888e8cdf9d66dc7e463025574,brinquedos,36.0,1156.0,1.0,600.0,17.0,10.0,12.0


In [12]:
# Check Shape

df_products.shape

(32951, 9)

In [67]:
# check counts

df_products.describe()

Unnamed: 0,product_id,product_category_name
count,32951,32341
unique,32951,73
top,1e9e8ef04dbcff4541ed26657ea517e5,cama_mesa_banho
freq,1,3029


In [None]:
# check counts of 'product_category_name'

df_products['product_category_name'].value_counts(dropna = False)

In [20]:
# Check data types

df_products.dtypes

product_id                     object
product_category_name          object
product_name_lenght           float64
product_description_lenght    float64
product_photos_qty            float64
product_weight_g              float64
product_length_cm             float64
product_height_cm             float64
product_width_cm              float64
dtype: object

## 4.3. Dropping Unnecessary Columns

In [21]:
# Drop columns that will be not relevant for the analysis

df_products = df_products.drop(columns = ['product_name_lenght', 'product_description_lenght', 'product_photos_qty', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm'])

In [22]:
# Check Shape

df_products.shape

(32951, 2)

In [28]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32951 entries, 0 to 32950
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   product_id             32951 non-null  object
 1   product_category_name  32341 non-null  object
dtypes: object(2)
memory usage: 515.0+ KB


## 4.4. Looking for missing values

In [30]:
# dataframe where missing values are present

missing_values_products = df_products.isnull()

# counts of missing values

missing_counts_products = df_products.isnull() .sum()

In [31]:
missing_counts_products

product_id                 0
product_category_name    610
dtype: int64

In [35]:
# Drop missing values

df_prods_cleaned = df_products.dropna(subset=['product_category_name'])

In [79]:
# check new shape

df_prods_cleaned.shape

(32341, 2)

In [None]:
df_products.isnull() .sum()

In [52]:
# Check for missing values

missing_values_prods2 = df_prods_cleaned.isnull().any()

# Print columns with missing values, if any
if missing_values_prods2.any():
    print(missing_values_prods2[missing_values_prods2])
else: 
    print('No missing values found in the DataFrame.')


No missing values found in the DataFrame.


## 4.4. Duplicates

In [58]:
# check duplicates values

duplicates_prods = df_prods_cleaned.duplicated()

 # Count of duplicate rows
duplicate_counts_ords_items = df_prods_cleaned.duplicated().sum()  

In [59]:
#check counts

# no duplicates found

duplicate_counts_ords_items

0

## 4.5. Export Wrangled 'olist_products_dataset'

In [60]:
# Exporting wrangled df

df_prods_cleaned.to_csv(os.path.join(path, '02 Data', 'Prepared Data','olist_products_wrangled.csv'))

# 5. product_category_name_translation

## 5.1. Import data set

In [64]:
# Import product_category_name_translation

df_prods_category_translation = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'product_category_name_translation.csv'), index_col = False)

In [74]:
# check info

df_prods_category_translation.shape

(71, 2)

In [72]:
# check head

df_prods_category_translation.head()

Unnamed: 0,product_category_name,product_category_name_english
0,beleza_saude,health_beauty
1,informatica_acessorios,computers_accessories
2,automotivo,auto
3,cama_mesa_banho,bed_bath_table
4,moveis_decoracao,furniture_decor


## 5.2.  Merging products_name data set with the products_name_eng_translation

In [76]:
# Merge products category names with the 'name english translation'

df_prods_merged = df_prods_cleaned.merge(df_prods_category_translation, on = 'product_category_name', indicator = True)

In [77]:
# check shape

df_prods_merged.shape

(32328, 4)

In [83]:
# check how the merge procedure worked

df_prods_merged['_merge'].value_counts()

_merge
both          32328
left_only         0
right_only        0
Name: count, dtype: int64

In [85]:
# check head

df_prods_merged.head()

Unnamed: 0,product_id,product_category_name,product_category_name_english,_merge
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumaria,perfumery,both
1,6a2fb4dd53d2cdb88e0432f1284a004c,perfumaria,perfumery,both
2,0d009643171aee696f4733340bc2fdd0,perfumaria,perfumery,both
3,b1eae565a61935e0011ee7682fef9dc9,perfumaria,perfumery,both
4,8da90b37f0fb171b4877c124f965b1f6,perfumaria,perfumery,both


## 5.3. Dropping Unnecessary Columns

In [87]:
# drop _merge and product_category_name columns

df_prods_category_eng = df_prods_merged.drop(columns = ['product_category_name', '_merge'])

In [88]:
# check head

df_prods_category_eng.head()

Unnamed: 0,product_id,product_category_name_english
0,1e9e8ef04dbcff4541ed26657ea517e5,perfumery
1,6a2fb4dd53d2cdb88e0432f1284a004c,perfumery
2,0d009643171aee696f4733340bc2fdd0,perfumery
3,b1eae565a61935e0011ee7682fef9dc9,perfumery
4,8da90b37f0fb171b4877c124f965b1f6,perfumery


In [89]:
# check shape
df_prods_category_eng.shape

(32328, 2)

## 5.4. Export merged products category name(english) data set

In [90]:
# Exporting english version of products categerory name

df_prods_category_eng.to_csv(os.path.join(path, '02 Data', 'Prepared Data','olist_products_eng_merged.csv'))

# 6. olist_order_payments_dataset

## 6.1. Import Data Set

In [91]:
# Importing 'order payments' data set

df_ords_payments = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_order_payments_dataset.csv'), index_col = False)

## 6.2. Consistency Checks

In [93]:
#check head

df_ords_payments.head()

Unnamed: 0,order_id,payment_sequential,payment_type,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,1,credit_card,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,credit_card,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,credit_card,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,1,credit_card,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,1,credit_card,2,128.45


In [94]:
# Check shape

df_ords_payments.shape

(103886, 5)

In [None]:
#check info

df_ords_payments.info()

In [None]:
df_ords_payments.describe()

## 6.3. Dropping Unnecessary Columns

In [98]:
# Drop columns that will not be relevant for the analysis

df_ords_payments_drop = df_ords_payments.drop(columns = ['payment_sequential', 'payment_type'])

In [99]:
# Check head

df_ords_payments_drop.head()

Unnamed: 0,order_id,payment_installments,payment_value
0,b81ef226f3fe1789b1e8b2acac839d17,8,99.33
1,a9810da82917af2d9aefd1278f1dcfa0,1,24.39
2,25e8ea4e93396b6fa0d3dd708e76c1bd,1,65.71
3,ba78997921bbcdc1373bb41e913ab953,8,107.78
4,42fdf880ba16b47b59251dd489d4441a,2,128.45


In [None]:
#check counts of 'payment_installments'

df_ords_payments_drop['payment_installments'].value_counts()

In [104]:
df_ords_payments_drop.shape

(103886, 3)

## 6.4. Looking for missing values

In [105]:
# Look for missing values

# dataframe where missing values are present

df_ords_payments_missing = df_ords_payments_drop.isnull()

# counts of missing values

df_ords_payments_counts_missing = df_ords_payments_drop.isnull() .sum()

In [128]:
df_ords_payments_counts_missing

order_id                0
payment_installments    0
payment_value           0
dtype: int64

## 6.5. Looking for Duplicates

In [111]:
# check duplicates values

duplicates_payments = df_ords_payments_drop.duplicated()

 # Count of duplicate rows
counts_payments = df_ords_payments_drop.duplicated().sum()  

In [120]:
counts_payments

614

In [129]:
duplicates_payments.value_counts(dropna = False)

False    103272
True        614
Name: count, dtype: int64

In [130]:
# Check for full duplicates
full_duplicates_payments = df_ords_payments_drop.duplicated(keep=False)

# Count of full duplicate rows
full_duplicates_count = full_duplicates_payments.sum()

if full_duplicates_count > 0:
    print("There are full duplicates in the DataFrame.")
else:
    print("There are no full duplicates in the DataFrame.")

There are full duplicates in the DataFrame.


In [135]:
full_duplicates_count

921

In [132]:
# Remove full duplicates (keeping the first occurrence)
df_ords_payments_unique = df_ords_payments_drop.drop_duplicates(keep=False)

# Check if duplicates are removed
if len(df_ords_payments_unique) == len(df_ords_payments_drop):
    print("No full duplicates remain after removal.")
else:
    print(f"{len(df_ords_payments_drop) - len(df_ords_payments_unique)} full duplicates removed.")


921 full duplicates removed.


In [136]:
# Check new shape

df_ords_payments_unique.shape

(102965, 3)

## 6.6. Export Wrangled Data Set

In [145]:
df_ords_payments_unique.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'olist_ords_payments_wrangled.csv'))

# 7. olist_sellers_dataset

## 7.1. Import Data Set

In [146]:
# Importing Sellers data set

df_sellers = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_sellers_dataset.csv'), index_col = False)

## 7.2. Consistency Checks

In [None]:
# check head

df_sellers.head(10)

In [149]:
# check shape

df_sellers.shape

(3095, 4)

In [None]:
# check info 

df_sellers.info()

## 7.3. Looking for missing values

In [None]:
# Look for missing values

# dataframe where missing values are present

missing_sellers = df_sellers.isnull()

# counts of missing values

missing_counts_sellers = df_sellers.isnull() .sum()

In [None]:
# check counts of missing values (if there's)
missing_counts_sellers

## 7.4. Looking for Duplicates

In [159]:
# check duplicates values

duplicates_sellers = df_sellers.duplicated()

 # Count of duplicate rows
counts_duplicates_sellers = df_sellers.duplicated().sum()  

In [160]:
counts_duplicates_sellers

0

## 7.5. Export Wrangled Data Set

In [162]:
# export the wrangled version of dataframe

df_sellers.to_csv(os.path.join(path, '02 Data', 'Prepared Data','olist_sellers_wrangled.csv'))

# 8. olist_customers_dataset

## 8.1.  Import Dataset

In [164]:
# Import customers data set

df_customers = pd.read_csv(os.path.join(path,'02 Data', 'Original Data', 'olist_customers_dataset.csv'), index_col = False)

## 8.2. Consistency Checks

In [165]:
# check shape

df_customers.shape

(99441, 5)

In [None]:
# check head 

df_customers.head(10)

In [None]:
# check info

df_customers.info()

## 8.3. Looking for missing values

In [169]:
# Look for missing values

#df where missing values are present

missing_customer = df_customers.isnull()

# counts of missing

missing_counts_customers = df_customers.isnull().sum()

In [None]:
missing_counts_customers

## 8.4. Looking for Duplicates

In [175]:
# check duplicates values

duplicates_customers = df_customers.duplicated()

 # Count of duplicate rows
counts_duplicates_customers = df_customers.duplicated().sum()  

In [176]:
counts_duplicates_customers

0

## 8.5. Export Wrangled  Dataset

In [178]:
# Export wrangled version os customers dataset

df_customers.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'olist_customers_wrangled.csv'))

# 9. olist_geolocation_dataset

## 9.1. Import Dataset

In [180]:
# Import data set containing location data

df_location = pd.read_csv(os.path.join(path, '02 Data', 'Original Data', 'olist_geolocation_dataset.csv'), index_col = False)

## 9.2. Consistency Checks

In [206]:
# check head

df_location.head()

Unnamed: 0,zip_code_prefix,geolocation_lat,geolocation_lng,geolocation_city,geolocation_state
0,1037,-23.545621,-46.639292,sao paulo,SP
1,1046,-23.546081,-46.64482,sao paulo,SP
2,1046,-23.546129,-46.642951,sao paulo,SP
3,1041,-23.544392,-46.639499,sao paulo,SP
4,1035,-23.541578,-46.641607,sao paulo,SP


In [182]:
# check shape

df_location.shape

(1000163, 5)

In [183]:
# check info

df_location.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000163 entries, 0 to 1000162
Data columns (total 5 columns):
 #   Column                       Non-Null Count    Dtype  
---  ------                       --------------    -----  
 0   geolocation_zip_code_prefix  1000163 non-null  int64  
 1   geolocation_lat              1000163 non-null  float64
 2   geolocation_lng              1000163 non-null  float64
 3   geolocation_city             1000163 non-null  object 
 4   geolocation_state            1000163 non-null  object 
dtypes: float64(2), int64(1), object(2)
memory usage: 38.2+ MB


## 9.3. Changing columns name

In [188]:
# change columns name of geolocation_zip_code_prefix to 'zip_code_prefix' for merging this data with other datasets later

df_location.rename(columns={'geolocation_zip_code_prefix' : 'zip_code_prefix'}, inplace= True)

In [191]:
# check head

df_location.columns

Index(['zip_code_prefix', 'geolocation_lat', 'geolocation_lng',
       'geolocation_city', 'geolocation_state'],
      dtype='object')

In [None]:
# check counts of states

df_location['geolocation_state'].value_counts(dropna = False)

## 9.4. Looking for missing values

In [193]:
# check for missing values

# dataframe with missing

missing_location = df_location.isnull()

# counts of missing values

missing_counts_location = df_location.isnull().sum()

In [194]:
missing_counts_location

zip_code_prefix      0
geolocation_lat      0
geolocation_lng      0
geolocation_city     0
geolocation_state    0
dtype: int64

## 9.5. Looking for duplicates

In [196]:
# Look for duplicates

duplicates_location = df_location.duplicated()

 # Count of duplicate rows
counts_duplicates_location = df_location.duplicated().sum()  

In [197]:
# counts of duplicates

counts_duplicates_location

261831

In [201]:
# Check for full duplicates
full_duplicates_location = df_location.duplicated(keep=False)

# Count of full duplicate rows
full_duplicates_counts_location = full_duplicates_location.sum()

if full_duplicates_counts_location > 0:
    print("There are full duplicates in the DataFrame.")
else:
    print("There are no full duplicates in the DataFrame.")

There are full duplicates in the DataFrame.


In [202]:
full_duplicates_counts_location

390005

In [207]:
# Check for duplicated rows
duplicated_rows = df_location[df_location.duplicated(keep=False)]

# Count the number of duplicated rows
print("Number of duplicated rows:", len(duplicated_rows))

# Inspect a sample of duplicated rows
if not duplicated_rows.empty:
    print("\nSample of duplicated rows:")
    print(duplicated_rows.head())
else:
    print("\nNo duplicated rows found.")


Number of duplicated rows: 390005

Sample of duplicated rows:
   zip_code_prefix  geolocation_lat  geolocation_lng geolocation_city  \
0             1037       -23.545621       -46.639292        sao paulo   
1             1046       -23.546081       -46.644820        sao paulo   
2             1046       -23.546129       -46.642951        sao paulo   
6             1047       -23.546273       -46.641225        sao paulo   
7             1013       -23.546923       -46.634264        sao paulo   

  geolocation_state  
0                SP  
1                SP  
2                SP  
6                SP  
7                SP  


#### for now I will left it unchangend

## 9.6. Export Wrangled Dataset

In [210]:
df_location.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'olist_geolocation.csv'))

# 10. olist_order_reviews_dataset

## 10.1. Import Dataset

In [10]:
# Importing order reviews dataset

df_ords_reviews = pd.read_csv(os.path.join(path,'02 Data', 'Original Data', 'olist_order_reviews_dataset.csv'), index_col = False)

## 10.2. Consistency Checks

In [14]:
# Check head

df_ords_reviews.head()

Unnamed: 0,review_id,order_id,review_score,review_comment_title,review_comment_message,review_creation_date,review_answer_timestamp
0,7bc2406110b926393aa56f80a40eba40,73fc7af87114b39712e6da79b0a377eb,4,,,2018-01-18 00:00:00,2018-01-18 21:46:59
1,80e641a11e56f04c1ad469d5645fdfde,a548910a1c6147796b98fdf73dbeba33,5,,,2018-03-10 00:00:00,2018-03-11 03:05:13
2,228ce5500dc1d8e020d8d1322874b6f0,f9e4b658b201a9f2ecdecbb34bed034b,5,,,2018-02-17 00:00:00,2018-02-18 14:36:24
3,e64fb393e7b32834bb789ff8bb30750e,658677c97b385a9be170737859d3511b,5,,Recebi bem antes do prazo estipulado.,2017-04-21 00:00:00,2017-04-21 22:02:06
4,f7c4243c7fe1938f181bec41a392bdeb,8e6bfb81e283fa7e4f11123a3fb894f1,5,,Parabéns lojas lannister adorei comprar pela I...,2018-03-01 00:00:00,2018-03-02 10:26:53


In [15]:
# Check shape

df_ords_reviews.shape

(99224, 7)

In [None]:
# check info

df_ords_reviews.describe()

## 10.2. Dropping Unnecessary Columns

In [18]:
# Drop Uncecessary columns for the analysis

df_reviews_dropped = df_ords_reviews.drop(columns = ['review_id','review_creation_date', 'review_answer_timestamp'])

In [20]:
# check shape

df_reviews_dropped.shape

(99224, 4)

## 10.3. Looking for Missing Values

In [24]:
# look for missing values
# dataframe with missing values

reviews_missing = df_reviews_dropped.isnull()

#data frame with missing counts
reviews_missing_counts = df_reviews_dropped.isnull().sum()

In [25]:
# check counts of missing

reviews_missing_counts

order_id                      0
review_score                  0
review_comment_title      87656
review_comment_message    58247
dtype: int64

#### There are a lot of missing values, about 90% review_comment_tittle variable and 60% for review_comment_message. Since anwsering to these surveys are voluntary, great values like these are expect, so they will remain unchangend. 

## 10.4. Looking for Duplicates

In [28]:
# look for duplicates

reviews_duplicates = df_reviews_dropped.duplicated()

# counts of duplicates

reviews_duplicates_counts = df_reviews_dropped.duplicated().sum()

In [29]:
reviews_duplicates_counts

227

In [37]:
# Drop duplicate rows from the DataFrame

df_reviews_without_duplicates = df_reviews_dropped.drop_duplicates()


In [38]:
# check shape

df_reviews_without_duplicates.shape

(98997, 4)

## 10.5. Export Wrangled Dataset

In [40]:
# Exporting wrangled orders reviews dataset

df_reviews_without_duplicates.to_csv(os.path.join(path, '02 Data', 'Prepared Data', 'olist_order_reviews_wrangled.csv'))