# **AAL Australia: 4Q Sales Report**


## **1. Data Wrangling**


### **1.1 Data Inspection**

_It's the process of getting familiar with the data in order to identify quality and structure issues._


In [None]:
import pandas as pd

# Load the Q4-2020 sales export; the inspection below and the later cleaning
# and transformation cells all operate on this DataFrame.
df = pd.read_csv("./Data/AAL_Q4-2020_Sales.csv")

print("First 5 rows of the DataFrame:")
print(df.head(5))

print("\nLast 5 rows of the DataFrame:")
print(df.tail(5))

print("\nDataFrame Summary")
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()

# True if at least one cell anywhere in the frame is missing.
has_null = df.isnull().values.any()
print("\nDoes the DataFrame have any null value?:", has_null)

# Counts rows that are exact repeats of an earlier row (all columns equal).
number_of_duplicates = df.duplicated().sum()
print(f"\nNumber of duplicates: {number_of_duplicates}")

# The two column lists below are reused by the cleaning step.
print("\nList of numerical columns:")
numerical_columns = df.select_dtypes(include=["number"]).columns
print(numerical_columns)

# "Categorical" here means every column whose dtype is not numeric.
print("\nList of categorical columns:")
categorical_columns = df.select_dtypes(exclude=["number"]).columns
print(categorical_columns)

### **1.2 Data Cleaning**

_It's the process of handling missing data, removing duplicates, converting data types, trimming whitespace, correcting inconsistencies, standardizing formats, dealing with outliers, and validating accuracy._


In [None]:
# Trimming unnecessary whitespace:
# Done BEFORE de-duplication so that rows differing only by leading/trailing
# spaces in a text column are recognised as the same record. "Date" is left
# untouched here.
for column in categorical_columns:
    if column != "Date":
        df[column] = df[column].str.strip()

# Handling duplicates:
# Dropped unconditionally: the duplicate count taken during inspection was
# computed on the raw, untrimmed values and may undercount.
df = df.drop_duplicates(keep="first")

# Handling missing data for numerical columns:
# Missing values are replaced by the column median.
for column in numerical_columns:
    if df[column].isnull().any():
        column_median = df[column].median()
        df[column] = df[column].fillna(column_median)

# Handling missing data for categorical columns:
# NOTE(review): this also applies to "Date"; an "Unknown" placeholder there
# would presumably be rejected by a strict date parse later on - confirm.
for column in categorical_columns:
    if df[column].isnull().any():
        df[column] = df[column].fillna("Unknown")

# Identify misspelled words:
# Listing the distinct values of each text column makes typos and
# inconsistent labels visible for manual review.
for column in categorical_columns:
    if column != "Date":
        unique_values = df[column].unique()
        print(f"{column} unique values: {unique_values}")

# Handling Outliers:
# Interquartile-range rule: values more than 1.5 * IQR below the first
# quartile or above the third quartile are treated as outliers.
SALES_COLUMN = "Sales"

q1 = df[SALES_COLUMN].quantile(0.25)
q3 = df[SALES_COLUMN].quantile(0.75)
iqr = q3 - q1

lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Tagging all the 'sales outliers' with True or False
# `condition` is True for in-range rows, so its negation marks the outliers.
# Rows are only flagged, never removed.
condition = (df[SALES_COLUMN] >= lower_bound) & (df[SALES_COLUMN] <= upper_bound)
df["Sales_Outlier"] = ~condition

### **1.3 Data Transformation**

_It involves converting data from its original form into a format that is more suitable for analysis._


In [None]:
from pandas import DataFrame

# Transforming Date column to YYYY-MM-DD:
# The raw values are parsed with the day-abbreviated month-year pattern
# "%d-%b-%Y" and stored back as pandas datetimes.
DATE_COLUMN = "Date"
df[DATE_COLUMN] = df[DATE_COLUMN].pipe(pd.to_datetime, format="%d-%b-%Y")


# Encoding categorical data into numerical:
def encode(df: DataFrame, original_column: str, new_column: str):
    """Add an integer-coded copy of a categorical column to ``df`` in place.

    Each distinct value in ``df[original_column]`` is mapped to an integer
    code by ``pd.factorize`` (codes follow order of first appearance) and the
    codes are written to ``df[new_column]``. The list of distinct values is
    discarded and nothing is returned.
    """
    codes = pd.factorize(df[original_column])[0]
    df[new_column] = codes


# Text columns to encode; each one gets a "Numerical_"-prefixed counterpart
# holding its integer codes, paired with it by position.
ORIGINAL_COLUMNS = ["Time", "State", "Group"]
NEW_COLUMNS = [f"Numerical_{name}" for name in ORIGINAL_COLUMNS]

column_pairs = zip(ORIGINAL_COLUMNS, NEW_COLUMNS)
for original_column, new_column in column_pairs:
    encode(df, original_column, new_column)

# Binning Sales column: