In [1]:
# In notebook (01-data_exploration.ipynb)
import pandas as pd
import sys
import os
import numpy as np

# Add the parent directory to sys.path
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from superstore_sales.config import RAW_DATA_FILE
import pandas as pd

df_raw = pd.read_csv(RAW_DATA_FILE, encoding='ISO-8859-1')
df_clean = df_raw.copy()

In [2]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Row ID         9994 non-null   int64  
 1   Order ID       9994 non-null   object 
 2   Order Date     9994 non-null   object 
 3   Ship Date      9994 non-null   object 
 4   Ship Mode      9994 non-null   object 
 5   Customer ID    9994 non-null   object 
 6   Customer Name  9994 non-null   object 
 7   Segment        9994 non-null   object 
 8   Country        9994 non-null   object 
 9   City           9994 non-null   object 
 10  State          9994 non-null   object 
 11  Postal Code    9994 non-null   int64  
 12  Region         9994 non-null   object 
 13  Product ID     9994 non-null   object 
 14  Category       9994 non-null   object 
 15  Sub-Category   9994 non-null   object 
 16  Product Name   9994 non-null   object 
 17  Sales          9994 non-null   float64
 18  Quantity

In [3]:
display(df_clean.iloc[:,0:9].sample(5))
display(df_clean.iloc[:,10:].sample(5))

Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country
6110,6111,CA-2016-124590,11/12/2016,11/16/2016,Standard Class,SP-20920,Susan Pistek,Consumer,United States
4021,4022,CA-2014-154963,6/22/2014,6/27/2014,Standard Class,AA-10645,Anna Andreadi,Consumer,United States
2,3,CA-2016-138688,6/12/2016,6/16/2016,Second Class,DV-13045,Darrin Van Huff,Corporate,United States
1236,1237,CA-2016-144344,10/28/2016,10/28/2016,Same Day,PG-18820,Patrick Gardner,Consumer,United States
7931,7932,US-2016-168095,7/15/2016,7/20/2016,Standard Class,MC-17425,Mark Cousins,Corporate,United States


Unnamed: 0,State,Postal Code,Region,Product ID,Category,Sub-Category,Product Name,Sales,Quantity,Discount,Profit
9804,New York,10024,East,FUR-FU-10002364,Furniture,Furnishings,"Eldon Expressions Wood Desk Accessories, Oak",7.38,1,0.0,2.1402
5216,Michigan,48066,Central,OFF-EN-10000461,Office Supplies,Envelopes,"#10- 4 1/8"" x 9 1/2"" Recycled Envelopes",17.48,2,0.0,8.2156
6967,South Carolina,29203,South,OFF-PA-10001461,Office Supplies,Paper,HP Office Paper (20Lb. and 87 Bright),60.12,9,0.0,28.8576
8532,Michigan,48227,Central,OFF-ST-10001370,Office Supplies,Storage,Sensible Storage WireTech Storage Systems,496.86,7,0.0,24.843
3526,Mississippi,39212,South,TEC-PH-10000148,Technology,Phones,Cyber Acoustics AC-202b Speech Recognition Ste...,38.97,3,0.0,0.7794


In [4]:
df_clean.describe()

Unnamed: 0,Row ID,Postal Code,Sales,Quantity,Discount,Profit
count,9994.0,9994.0,9994.0,9994.0,9994.0,9994.0
mean,4997.5,55190.379428,229.858001,3.789574,0.156203,28.656896
std,2885.163629,32063.69335,623.245101,2.22511,0.206452,234.260108
min,1.0,1040.0,0.444,1.0,0.0,-6599.978
25%,2499.25,23223.0,17.28,2.0,0.0,1.72875
50%,4997.5,56430.5,54.49,3.0,0.2,8.6665
75%,7495.75,90008.0,209.94,5.0,0.2,29.364
max,9994.0,99301.0,22638.48,14.0,0.8,8399.976


In [5]:
df_clean.describe(include='object')

Unnamed: 0,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,State,Region,Product ID,Category,Sub-Category,Product Name
count,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994,9994
unique,5009,1237,1334,4,793,793,3,1,531,49,4,1862,3,17,1850
top,CA-2017-100111,9/5/2016,12/16/2015,Standard Class,WB-21850,William Brown,Consumer,United States,New York City,California,West,OFF-PA-10001970,Office Supplies,Binders,Staple envelope
freq,14,38,35,5968,37,37,5191,9994,915,2001,3203,19,6026,1523,48


In [6]:
df_clean.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit'],
      dtype='object')

In [7]:
# Change numbers to str 
df_clean[['Row ID', 'Postal Code']]=df_clean[['Row ID', 'Postal Code']].astype('str')
df_clean['Postal Code']=df_clean['Postal Code'].str.zfill(5) # fill the postal codes with zeros to the left 

# Transform date columns to date type
df_clean[['Order Date', 'Ship Date']] = df_clean[['Order Date', 'Ship Date']].apply(pd.to_datetime)

# Change columns to categories
df_clean['Ship Mode'] = pd.Categorical(df_clean['Ship Mode'], categories=df_clean['Ship Mode'].unique(), ordered=False)
df_clean['Segment'] = pd.Categorical(df_clean['Segment'], categories=df_clean['Segment'].unique(), ordered=False)
df_clean['Country'] = pd.Categorical(df_clean['Country'], categories=['United States','International'], ordered=False)
df_clean['Region'] = pd.Categorical(df_clean['Region'], categories=df_clean['Segment'].unique(), ordered=False)
df_clean['Category'] = pd.Categorical(df_clean['Category'], categories=df_clean['Category'].unique(), ordered=False)
df_clean['Sub-Category'] = pd.Categorical(df_clean['Sub-Category'], categories=df_clean['Sub-Category'].unique(), ordered=False)

# Check if there is a one-one relation for IDs or other

# Check if every 'Sub-Category' appearns only in one 'Category'
subcat_cat_count = df_clean.groupby('Sub-Category',observed=True)['Category'].nunique() # For each sub-category it is count how many appears in different categories
display(subcat_cat_count ) # If one count is larger than 1 it means it appears in two (o more) different Categories. 



Sub-Category
Bookcases      1
Chairs         1
Labels         1
Tables         1
Storage        1
Furnishings    1
Art            1
Phones         1
Binders        1
Appliances     1
Paper          1
Accessories    1
Envelopes      1
Fasteners      1
Supplies       1
Machines       1
Copiers        1
Name: Category, dtype: int64

# check names with double ids

In [None]:
ids='Product ID'
name='Product Name'
fl='Product ID dup'

In [None]:
# Create a Series of flag of the values with a duplicated id
def dup_flag(df, id_col, value_col):
    unique_combinations = df[[id_col, value_col]].drop_duplicates().copy()
    dup_prod = df.groupby(id_col)[value_col].nunique()
    
    dup_ids = dup_prod[dup_prod > 1].index.to_list()
    
    dup_flags=df[id_col].isin(dup_ids)
    return dup_flags, unique_combinations

def update_id(df, id_col, value_col):

    flags=dup_flag(df, id_col, value_col)
    df[id_col+' dup']=flags[0]
    suffixes=flags[1]
    suffixes[id_col+' suffix'] = suffixes.groupby(id_col).cumcount() + 1
    new_col = id_col+' updated'
    suffixes[new_col] = suffixes[id_col].astype(str) + "_" + suffixes[id_col+' suffix'].astype(str).str.zfill(2)
    
    # Columns to check
    columns_to_remove = [id_col+' suffix', new_col]
    
    # Drop only if they exist
    df = df.drop(columns=[col for col in columns_to_remove if col in df.columns], axis=1)
    
    df_new = df.merge(suffixes,on=[id_col, value_col], how='left')
    
    df_new[new_col] = np.where(
        df_new[id_col + " dup"], 
        df_new[new_col], # If 'id_col dup' is True, i.e. there are duplicates
        df_new[id_col]  # If 'id_col dup' is False, i.e. no-duplicates
    )

    return df_new

In [12]:
df_clean=update_id(df_clean,'Product ID', 'Product Name')
df_clean=update_id(df_clean,'Customer ID', 'Customer Name')

In [13]:
df_clean.columns

Index(['Row ID', 'Order ID', 'Order Date', 'Ship Date', 'Ship Mode',
       'Customer ID', 'Customer Name', 'Segment', 'Country', 'City', 'State',
       'Postal Code', 'Region', 'Product ID', 'Category', 'Sub-Category',
       'Product Name', 'Sales', 'Quantity', 'Discount', 'Profit',
       'Product ID dup', 'Product ID suffix', 'Product ID updated',
       'Customer ID dup', 'Customer ID suffix', 'Customer ID updated'],
      dtype='object')

In [14]:
df_clean[df_clean['Customer ID dup']]


Unnamed: 0,Row ID,Order ID,Order Date,Ship Date,Ship Mode,Customer ID,Customer Name,Segment,Country,City,...,Sales,Quantity,Discount,Profit,Product ID dup,Product ID suffix,Product ID updated,Customer ID dup,Customer ID suffix,Customer ID updated
