In [1]:
import numpy as np
import pandas as pd

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas/numpy deprecation and runtime warnings raised by
# later cells — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [2]:
# Read the quarterly income statement from the CSV file that sits next to the notebook
df = pd.read_csv("AMZN_quarterly_income_statement.csv")

# Normalize the column names in one pass: trim leading/trailing whitespace, then
# collapse every run of whitespace inside a name down to a single space
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

# Step 2: Overview of Dataset

In [3]:
# Print the frame's structure: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65 entries, 0 to 64
Data columns (total 26 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   fiscalDateEnding                   65 non-null     object 
 1   reportedCurrency                   65 non-null     object 
 2   grossProfit                        65 non-null     int64  
 3   totalRevenue                       65 non-null     int64  
 4   costOfRevenue                      65 non-null     int64  
 5   costofGoodsAndServicesSold         65 non-null     int64  
 6   operatingIncome                    65 non-null     int64  
 7   sellingGeneralAndAdministrative    65 non-null     int64  
 8   researchAndDevelopment             65 non-null     int64  
 9   operatingExpenses                  65 non-null     int64  
 10  investmentIncomeNet                61 non-null     float64
 11  netInterestIncome                  60 non-null     float64
 

In [4]:
# Preview the first 20 rows of the raw data (before any missing-value handling)
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,grossProfit,totalRevenue,costOfRevenue,costofGoodsAndServicesSold,operatingIncome,sellingGeneralAndAdministrative,researchAndDevelopment,operatingExpenses,...,depreciation,depreciationAndAmortization,incomeBeforeTax,incomeTaxExpense,interestAndDebtExpense,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome
0,2024-06-30,USD,59553000000,147839000000,88286000000,73785000000,14672000000,13553000000,22304000000,14501000000,...,,948000000,15252000000,1767000000,589000000.0,13485000000.0,13090000000.0,14672000000,15620000000,13485000000
1,2024-03-31,USD,56617000000,142595000000,85978000000,72633000000,15307000000,12404000000,16960000000,13345000000,...,,941000000,12898000000,2467000000,644000000.0,10431000000.0,9873000000.0,13542000000,14483000000,10431000000
2,2023-12-31,USD,39401000000,169328000000,129927000000,92553000000,13209000000,36212000000,22038000000,39274000000,...,,2082000000,13666000000,3042000000,713000000.0,10644000000.0,12587000000.0,14379000000,16461000000,10624000000
3,2023-09-30,USD,52610000000,142183000000,89573000000,75022000000,11188000000,13112000000,21203000000,14551000000,...,,1439000000,12185000000,2306000000,806000000.0,9879000000.0,8556000000.0,12991000000,14430000000,9879000000
4,2023-06-30,USD,48693000000,133552000000,84859000000,69373000000,7681000000,13947000000,18829000000,15486000000,...,,1539000000,7554000000,804000000,840000000.0,6750000000.0,7043000000.0,8394000000,9933000000,6750000000
5,2023-03-31,USD,43403000000,126605000000,83202000000,67791000000,4774000000,13215000000,17420000000,15411000000,...,,1546000000,4120000000,948000000,823000000.0,3172000000.0,3686000000.0,4943000000,6489000000,3172000000
6,2022-12-31,USD,-1360000000,148376000000,1360000000,85640000000,2737000000,36751000000,3653000000,1360000000,...,,2118000000,-949000000,-1227000000,694000000.0,-2720000000.0,2906000000.0,-255000000,1863000000,278000000
7,2022-09-30,USD,40542000000,126381000000,85839000000,70268000000,2525000000,14075000000,16787000000,15571000000,...,,1496000000,2941000000,69000000,617000000.0,2872000000.0,539000000.0,3558000000,5054000000,2872000000
8,2022-06-30,USD,39590000000,120533000000,80943000000,66424000000,3317000000,12989000000,15468000000,14519000000,...,,1530000000,-2665000000,-637000000,584000000.0,-2028000000.0,-4445000000.0,-2081000000,-551000000,-2028000000
9,2022-03-31,USD,37013000000,115986000000,78973000000,66499000000,3669000000,10914000000,12270000000,12474000000,...,,1560000000,-5266000000,-1422000000,472000000.0,-3844000000.0,-4833000000.0,-4794000000,-3234000000,-3844000000


# Step 3: EDA - Missing Values Analysis 

## Step 3)i): EDA - Show Missing Values in each Column

In [5]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a null-count report restricted to the columns that contain nulls.

    A three-line banner is always printed first. It is followed either by the
    per-column null counts (largest first) or, when no column contains a null,
    by a short message saying so.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect for null values.

    Returns:
    - None: The report is written to standard output only.
    """
    rule = '-' * 64

    # Count nulls per column, keep only the affected columns, largest count first
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0].sort_values(ascending=False)

    # The banner is the same whether or not any nulls were found
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)

    print("No columns have null values." if affected.empty else affected)

In [6]:
# Percentage of missing entries per column: the null fraction is rounded to 4 decimals
# before being scaled to percent, then ordered from most to least affected
null_values_percentage = (
    df.isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

pct_rule = '-' * 44
print(pct_rule)
print("Percentage(%) of null values in each column")
print(pct_rule)
print(null_values_percentage)
print('\n')

# Absolute null counts, limited to the columns that actually contain nulls
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
depreciation                         100.00
interestIncome                        67.69
nonInterestIncome                     53.85
netInterestIncome                      7.69
comprehensiveIncomeNetOfTax            6.15
netIncomeFromContinuingOperations      6.15
interestAndDebtExpense                 6.15
investmentIncomeNet                    6.15
fiscalDateEnding                       0.00
otherNonOperatingIncome                0.00
ebitda                                 0.00
ebit                                   0.00
incomeTaxExpense                       0.00
incomeBeforeTax                        0.00
depreciationAndAmortization            0.00
interestExpense                        0.00
reportedCurrency                       0.00
operatingExpenses                      0.00
researchAndDevelopment                 0.00
sellingGeneralAndAdministrativ

## Step 3)ii): EDA - Handling Missing Values

In [7]:
# Drop the columns in which most values are missing (per the percentages printed above:
# depreciation 100%, interestIncome 67.69%, nonInterestIncome 53.85%).
# errors="ignore" makes the cell safe to re-run: once the columns are gone, a second
# execution is a no-op instead of raising KeyError.
df = df.drop(
    columns=["depreciation", "interestIncome", "nonInterestIncome"],
    errors="ignore",
)

# Get total null values in each column
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
netInterestIncome                    5
investmentIncomeNet                  4
interestAndDebtExpense               4
netIncomeFromContinuingOperations    4
comprehensiveIncomeNetOfTax          4
dtype: int64


In [8]:
# Impute the remaining gaps: every numeric column has its nulls replaced by that
# column's mean; non-numeric columns are left untouched
numeric_df = df.select_dtypes(include=[np.number])  # numeric columns only
numeric_df = numeric_df.fillna(numeric_df.mean())   # per-column mean imputation
df[numeric_df.columns] = numeric_df                 # write the filled columns back into df

# Confirm that no nulls remain
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


In [9]:
# Preview the first 20 rows again, now that mostly-null columns are dropped and gaps are filled
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,grossProfit,totalRevenue,costOfRevenue,costofGoodsAndServicesSold,operatingIncome,sellingGeneralAndAdministrative,researchAndDevelopment,operatingExpenses,...,otherNonOperatingIncome,depreciationAndAmortization,incomeBeforeTax,incomeTaxExpense,interestAndDebtExpense,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome
0,2024-06-30,USD,59553000000,147839000000,88286000000,73785000000,14672000000,13553000000,22304000000,14501000000,...,-18000000,948000000,15252000000,1767000000,589000000.0,13485000000.0,13090000000.0,14672000000,15620000000,13485000000
1,2024-03-31,USD,56617000000,142595000000,85978000000,72633000000,15307000000,12404000000,16960000000,13345000000,...,-2673000000,941000000,12898000000,2467000000,644000000.0,10431000000.0,9873000000.0,13542000000,14483000000,10431000000
2,2023-12-31,USD,39401000000,169328000000,129927000000,92553000000,13209000000,36212000000,22038000000,39274000000,...,289000000,2082000000,13666000000,3042000000,713000000.0,10644000000.0,12587000000.0,14379000000,16461000000,10624000000
3,2023-09-30,USD,52610000000,142183000000,89573000000,75022000000,11188000000,13112000000,21203000000,14551000000,...,1031000000,1439000000,12185000000,2306000000,806000000.0,9879000000.0,8556000000.0,12991000000,14430000000,9879000000
4,2023-06-30,USD,48693000000,133552000000,84859000000,69373000000,7681000000,13947000000,18829000000,15486000000,...,61000000,1539000000,7554000000,804000000,840000000.0,6750000000.0,7043000000.0,8394000000,9933000000,6750000000
5,2023-03-31,USD,43403000000,126605000000,83202000000,67791000000,4774000000,13215000000,17420000000,15411000000,...,-443000000,1546000000,4120000000,948000000,823000000.0,3172000000.0,3686000000.0,4943000000,6489000000,3172000000
6,2022-12-31,USD,-1360000000,148376000000,1360000000,85640000000,2737000000,36751000000,3653000000,1360000000,...,-3450000000,2118000000,-949000000,-1227000000,694000000.0,-2720000000.0,2906000000.0,-255000000,1863000000,278000000
7,2022-09-30,USD,40542000000,126381000000,85839000000,70268000000,2525000000,14075000000,16787000000,15571000000,...,759000000,1496000000,2941000000,69000000,617000000.0,2872000000.0,539000000.0,3558000000,5054000000,2872000000
8,2022-06-30,USD,39590000000,120533000000,80943000000,66424000000,3317000000,12989000000,15468000000,14519000000,...,-5545000000,1530000000,-2665000000,-637000000,584000000.0,-2028000000.0,-4445000000.0,-2081000000,-551000000,-2028000000
9,2022-03-31,USD,37013000000,115986000000,78973000000,66499000000,3669000000,10914000000,12270000000,12474000000,...,-8570000000,1560000000,-5266000000,-1422000000,472000000.0,-3844000000.0,-4833000000.0,-4794000000,-3234000000,-3844000000


# Step 4: EDA - Duplicate Values Analysis 

## Step 4)i): EDA - Show Duplicate Rows

In [10]:
# Share of fully duplicated rows (rows that repeat an earlier row in every column)
total_rows = df.shape[0]
duplicate_rows = df.duplicated().sum()
duplicate_percentage = duplicate_rows / total_rows * 100

pct_banner = '-' * 48
print(pct_banner, "Percentage(%) of duplicate rows in the DataFrame", pct_banner, sep='\n')
print(f"{duplicate_percentage:.2f}%")
print('\n')

# Absolute number of duplicated rows
count_banner = '-' * 30
print(count_banner, "Total number of duplicate rows", count_banner, sep='\n')
print(duplicate_rows)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


## Step 4)ii): EDA - Handling Duplicate Rows

In [11]:
# NOTE: the check in Step 4)i) reported 0 duplicate rows, so the drop below is left disabled.
# # Drop all duplicate rows
# df_cleaned = df.drop_duplicates()

# print('-' * 22)
# print("Duplicate rows dropped")
# print('-' * 22)
# print(f"Original number of rows: {total_rows}")
# print(f"Number of rows after dropping duplicates: {len(df_cleaned)}")

# Step 5: EDA - Save Cleaned Dataset

In [12]:
# Persist the cleaned frame as a CSV file next to the notebook.
# NOTE(review): with pandas' default settings the row index is written as an extra,
# unnamed first column; pass index=False if downstream readers should not see it —
# confirm with the consumers of this file before changing.
cleaned_csv_path = 'DC_AMZN_quarterly_income_statement.csv'
df.to_csv(cleaned_csv_path)