In [37]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [38]:
# Read the quarterly balance-sheet CSV into a DataFrame
df = pd.read_csv("AMZN_quarterly_balance_sheet.csv")

# Normalize the header text in a single pass: trim leading/trailing whitespace,
# then collapse every internal run of whitespace into one space
df.columns = (
    df.columns
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Step 2: Overview of Dataset

In [39]:
# Print a structural summary of the frame: index range, column names,
# non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 38 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   fiscalDateEnding                        61 non-null     object 
 1   reportedCurrency                        61 non-null     object 
 2   totalAssets                             61 non-null     int64  
 3   totalCurrentAssets                      61 non-null     int64  
 4   cashAndCashEquivalentsAtCarryingValue   61 non-null     int64  
 5   cashAndShortTermInvestments             61 non-null     int64  
 6   inventory                               61 non-null     int64  
 7   currentNetReceivables                   61 non-null     int64  
 8   totalNonCurrentAssets                   61 non-null     int64  
 9   propertyPlantEquipment                  61 non-null     int64  
 10  accumulatedDepreciationAmortizationPPE  11 non-null     float64


In [40]:
# Preview the first 20 rows of the freshly loaded data
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,totalAssets,totalCurrentAssets,cashAndCashEquivalentsAtCarryingValue,cashAndShortTermInvestments,inventory,currentNetReceivables,totalNonCurrentAssets,propertyPlantEquipment,...,currentLongTermDebt,longTermDebtNoncurrent,shortLongTermDebtTotal,otherCurrentLiabilities,otherNonCurrentLiabilities,totalShareholderEquity,treasuryStock,retainedEarnings,commonStock,commonStockSharesOutstanding
0,2024-06-30,USD,554818000000,173307000000,71178000000,89092000000,34109000000,50106000000,160798000000,220717000000,...,7429000000.0,54889000000.0,62759000000.0,42747000000,27226000000.0,236447000000,7837000000,137534000000,110000000,10490000000
1,2024-03-31,USD,530969000000,163989000000,72852000000,85074000000,31147000000,47768000000,157036000000,209950000000,...,8848000000.0,57634000000.0,66902000000.0,44915000000,26657000000.0,216661000000,7837000000,124049000000,109000000,10403000000
2,2023-12-31,USD,527854000000,172351000000,73387000000,86780000000,33318000000,52253000000,159019000000,204177000000,...,8494000000.0,58314000000.0,67329000000.0,45764000000,25451000000.0,201875000000,7837000000,113618000000,109000000,10383000000
3,2023-09-30,USD,486883000000,142995000000,49605000000,64169000000,35406000000,43420000000,147431000000,196468000000,...,5995000000.0,61098000000.0,67638000000.0,42340000000,21707000000.0,182973000000,7837000000,102994000000,108000000,10330000000
4,2023-06-30,USD,477607000000,140482000000,49529000000,63970000000,36587000000,39925000000,143356000000,193784000000,...,3997000000.0,63092000000.0,68572000000.0,49232000000,21853000000.0,168602000000,7837000000,93115000000,108000000,10313000000
5,2023-03-31,USD,464378000000,136221000000,49343000000,64405000000,34170000000,37646000000,137417000000,190754000000,...,2000000000.0,67084000000.0,70572000000.0,66382000000,20931000000.0,154526000000,7837000000,86365000000,108000000,10258000000
6,2022-12-31,USD,462675000000,146791000000,53888000000,70026000000,34405000000,42360000000,135273000000,186715000000,...,2999000000.0,67150000000.0,71742000000.0,62566000000,21121000000.0,146043000000,7837000000,83193000000,108000000,10242000000
7,2022-09-30,USD,428362000000,131463000000,34947000000,58662000000,36647000000,36154000000,119711000000,177195000000,...,4247000000.0,58919000000.0,64641000000.0,59974000000,22259000000.0,137489000000,7837000000,82915000000,107000000,10198000000
8,2022-06-30,USD,419728000000,133667000000,37478000000,60710000000,38153000000,34804000000,112370000000,173706000000,...,4998000000.0,58053000000.0,63670000000.0,56254000000,23458000000.0,131402000000,7837000000,80043000000,107000000,10183000000
9,2022-03-31,USD,410767000000,133876000000,36393000000,66385000000,34987000000,32504000000,108438000000,168468000000,...,2681000000.0,47556000000.0,50553000000.0,58141000000,23971000000.0,134001000000,4503000000,82071000000,5000000,509000000


# Step 3: EDA - Missing Values Analysis 

## Step 3)i): EDA - Show Missing Values in each Column

In [41]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a report of the columns that contain null values.

    The report consists of a fixed three-line header followed by either the
    per-column null counts (largest first, columns without nulls omitted) or
    a message stating that no column has nulls.

    Parameters:
    - df (pd.DataFrame): The dataframe to be checked for null values.

    Returns:
    - None: The report is written to standard output.
    """
    # Null count per column, keeping only the columns that actually have nulls
    null_counts = df.isnull().sum()
    with_nulls = null_counts[null_counts > 0].sort_values(ascending=False)

    # The header is the same whether or not any nulls were found
    rule = '-' * 64
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)

    if with_nulls.empty:
        print("No columns have null values.")
        return

    print(with_nulls)

In [42]:
# Share of null entries per column as a percentage, largest first.
# The fraction is rounded to 4 decimals before scaling to percent.
null_values_percentage = (
    df.isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)
print('-' * 44, "Percentage(%) of null values in each column", '-' * 44, sep='\n')
print(null_values_percentage)
print('\n')

# Absolute null counts, restricted to the columns that contain nulls
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
accumulatedDepreciationAmortizationPPE    81.97
longTermInvestments                       81.97
capitalLeaseObligations                   63.93
investments                               36.07
deferredRevenue                           36.07
intangibleAssetsExcludingGoodwill         27.87
shortTermDebt                             22.95
currentLongTermDebt                       21.31
currentDebt                               21.31
otherNonCurrentLiabilities                 9.84
longTermDebtNoncurrent                     9.84
longTermDebt                               9.84
shortLongTermDebtTotal                     8.20
currentAccountsPayable                     1.64
totalShareholderEquity                     0.00
otherCurrentLiabilities                    0.00
treasuryStock                              0.00
retainedEarnings                           0.00
co

## Step 3)ii): EDA - Handling Missing Values

In [43]:
# Drop the sparsely populated columns. Each column listed here is missing in
# more than 20% of the 61 rows (null counts from the report above are noted
# next to each name); the columns that remain are missing in under 10% of rows.
sparse_columns = [
    "accumulatedDepreciationAmortizationPPE",  # 50 nulls
    "longTermInvestments",                     # 50 nulls
    "capitalLeaseObligations",                 # 39 nulls
    "investments",                             # 22 nulls
    "deferredRevenue",                         # 22 nulls
    "intangibleAssetsExcludingGoodwill",       # 17 nulls
    "shortTermDebt",                           # 14 nulls
    "currentDebt",                             # 13 nulls
    "currentLongTermDebt",                     # 13 nulls
]

# Reassign instead of mutating in place, and skip labels that are already
# gone, so re-running this cell does not raise KeyError on the second pass.
df = df.drop(columns=sparse_columns, errors="ignore")

# Get total null values in each column
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
longTermDebt                  6
longTermDebtNoncurrent        6
otherNonCurrentLiabilities    6
shortLongTermDebtTotal        5
currentAccountsPayable        1
dtype: int64


In [44]:
# Impute the remaining gaps: every numeric column has its nulls replaced by
# that column's mean; non-numeric columns are left as they are
numeric_df = df.select_dtypes(include=[np.number])  # numeric columns only
numeric_df = numeric_df.fillna(numeric_df.mean())   # per-column mean imputation
df[numeric_df.columns] = numeric_df                 # write the filled columns back

# Confirm that no nulls are left
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


In [45]:
# Preview the first 20 rows after the column drop and mean imputation
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,totalAssets,totalCurrentAssets,cashAndCashEquivalentsAtCarryingValue,cashAndShortTermInvestments,inventory,currentNetReceivables,totalNonCurrentAssets,propertyPlantEquipment,...,longTermDebt,longTermDebtNoncurrent,shortLongTermDebtTotal,otherCurrentLiabilities,otherNonCurrentLiabilities,totalShareholderEquity,treasuryStock,retainedEarnings,commonStock,commonStockSharesOutstanding
0,2024-06-30,USD,554818000000,173307000000,71178000000,89092000000,34109000000,50106000000,160798000000,220717000000,...,62683000000.0,54889000000.0,62759000000.0,42747000000,27226000000.0,236447000000,7837000000,137534000000,110000000,10490000000
1,2024-03-31,USD,530969000000,163989000000,72852000000,85074000000,31147000000,47768000000,157036000000,209950000000,...,66852000000.0,57634000000.0,66902000000.0,44915000000,26657000000.0,216661000000,7837000000,124049000000,109000000,10403000000
2,2023-12-31,USD,527854000000,172351000000,73387000000,86780000000,33318000000,52253000000,159019000000,204177000000,...,67182000000.0,58314000000.0,67329000000.0,45764000000,25451000000.0,201875000000,7837000000,113618000000,109000000,10383000000
3,2023-09-30,USD,486883000000,142995000000,49605000000,64169000000,35406000000,43420000000,147431000000,196468000000,...,67472000000.0,61098000000.0,67638000000.0,42340000000,21707000000.0,182973000000,7837000000,102994000000,108000000,10330000000
4,2023-06-30,USD,477607000000,140482000000,49529000000,63970000000,36587000000,39925000000,143356000000,193784000000,...,67472000000.0,63092000000.0,68572000000.0,49232000000,21853000000.0,168602000000,7837000000,93115000000,108000000,10313000000
5,2023-03-31,USD,464378000000,136221000000,49343000000,64405000000,34170000000,37646000000,137417000000,190754000000,...,69472000000.0,67084000000.0,70572000000.0,66382000000,20931000000.0,154526000000,7837000000,86365000000,108000000,10258000000
6,2022-12-31,USD,462675000000,146791000000,53888000000,70026000000,34405000000,42360000000,135273000000,186715000000,...,70542000000.0,67150000000.0,71742000000.0,62566000000,21121000000.0,146043000000,7837000000,83193000000,108000000,10242000000
7,2022-09-30,USD,428362000000,131463000000,34947000000,58662000000,36647000000,36154000000,119711000000,177195000000,...,63541000000.0,58919000000.0,64641000000.0,59974000000,22259000000.0,137489000000,7837000000,82915000000,107000000,10198000000
8,2022-06-30,USD,419728000000,133667000000,37478000000,60710000000,38153000000,34804000000,112370000000,173706000000,...,63435000000.0,58053000000.0,63670000000.0,56254000000,23458000000.0,131402000000,7837000000,80043000000,107000000,10183000000
9,2022-03-31,USD,410767000000,133876000000,36393000000,66385000000,34987000000,32504000000,108438000000,168468000000,...,50553000000.0,47556000000.0,50553000000.0,58141000000,23971000000.0,134001000000,4503000000,82071000000,5000000,509000000


# Step 4: EDA - Duplicate Values Analysis 

## Step 4)i): EDA - Show Duplicate Values Rows

In [46]:
# Count rows that are exact repeats of an earlier row and express that count
# as a share of all rows
total_rows = df.shape[0]
duplicate_rows = df.duplicated().sum()
duplicate_percentage = duplicate_rows / total_rows * 100

# Report the percentage
print('-' * 48, "Percentage(%) of duplicate rows in the DataFrame", '-' * 48, sep='\n')
print(f"{duplicate_percentage:.2f}%")
print('\n')

# Report the absolute count
print('-' * 30, "Total number of duplicate rows", '-' * 30, sep='\n')
print(duplicate_rows)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


## Step 4)ii): EDA - Handling Duplicate Values Rows

In [47]:
# NOTE(review): this cell is entirely commented out. The duplicate check
# above reported 0 duplicate rows, so presumably it was disabled as
# unnecessary -- confirm, and re-enable if the input data changes.
# # Drop all duplicate rows
# df_cleaned = df.drop_duplicates()

# print('-' * 22)
# print("Duplicate rows dropped")
# print('-' * 22)
# print(f"Original number of rows: {total_rows}")
# print(f"Number of rows after dropping duplicates: {len(df_cleaned)}")

# Step 5: EDA - Save Cleaned Dataset

In [48]:
# Save the cleaned dataset to CSV.
# index=False keeps the positional RangeIndex out of the file; with the
# default (index=True) it is written as an unnamed first column that shows up
# as "Unnamed: 0" when the file is read back.
# NOTE(review): the output was previously named
# 'DC_AMZN_quarterly_balance_sheet.csv.csv' (doubled extension) -- update any
# consumer that still reads the old file name.
df.to_csv("DC_AMZN_quarterly_balance_sheet.csv", index=False)