In [8]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [9]:
# Read the quarterly cash-flow CSV into a DataFrame
df = pd.read_csv("AMZN_quarterly_cash_flow.csv")

# Normalise the column labels in one pass:
#   1. trim leading/trailing whitespace
#   2. collapse any run of whitespace inside a label to a single space
df.columns = (
    df.columns
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# Step 2: Overview of Dataset

In [10]:
# Print the column names, dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 29 columns):
 #   Column                                                     Non-Null Count  Dtype  
---  ------                                                     --------------  -----  
 0   fiscalDateEnding                                           61 non-null     object 
 1   reportedCurrency                                           61 non-null     object 
 2   operatingCashflow                                          61 non-null     int64  
 3   paymentsForOperatingActivities                             58 non-null     float64
 4   proceedsFromOperatingActivities                            0 non-null      float64
 5   changeInOperatingLiabilities                               61 non-null     int64  
 6   changeInOperatingAssets                                    61 non-null     int64  
 7   depreciationDepletionAndAmortization                       61 non-null     int64  
 8   capitalExpen

In [11]:
# Preview the first 20 rows of the frame
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,operatingCashflow,paymentsForOperatingActivities,proceedsFromOperatingActivities,changeInOperatingLiabilities,changeInOperatingAssets,depreciationDepletionAndAmortization,capitalExpenditures,changeInReceivables,...,dividendPayoutCommonStock,dividendPayoutPreferredStock,proceedsFromIssuanceOfCommonStock,proceedsFromIssuanceOfLongTermDebtAndCapitalSecuritiesNet,proceedsFromIssuanceOfPreferredStock,proceedsFromRepurchaseOfEquity,proceedsFromSaleOfTreasuryStock,changeInCashAndCashEquivalents,changeInExchangeRate,netIncome
0,2024-06-30,USD,25281000000,2916000000.0,,6412000000,6140000000,12038000000,17620000000,,...,,,,0.0,,0.0,,,,13485000000
1,2024-03-31,USD,18989000000,3406000000.0,,-9714000000,925000000,11684000000,14925000000,,...,,,,0.0,,0.0,,,,10431000000
2,2023-12-31,USD,42465000000,2840000000.0,,14517000000,9622000000,13820000000,14588000000,,...,,,,0.0,,0.0,,,,10624000000
3,2023-09-30,USD,21217000000,2768000000.0,,2795000000,-808000000,12131000000,12479000000,,...,,,,0.0,,0.0,,,,9879000000
4,2023-06-30,USD,16476000000,2605000000.0,,3185000000,2373000000,11589000000,11455000000,-5167000000.0,...,,,,0.0,,0.0,,264000000.0,,6750000000
5,2023-03-31,USD,4788000000,2548000000.0,,-10446000000,-371000000,11123000000,14207000000,1521000000.0,...,,,,0.0,,0.0,,-4664000000.0,,3172000000
6,2022-12-31,USD,29173000000,2449000000.0,,1300000000,-3180000000,12685000000,16592000000,-8788000000.0,...,,,,8235000000.0,,0.0,,18438000000.0,,278000000
7,2022-09-30,USD,11404000000,1901000000.0,,-1172000000,-732000000,10327000000,16378000000,-4794000000.0,...,,,,107000000.0,,0.0,,-1188000000.0,,2872000000
8,2022-06-30,USD,8965000000,2183000000.0,,4020000000,3890000000,9716000000,15724000000,-6799000000.0,...,,,,12824000000.0,,-3334000000.0,,1513000000.0,,-2028000000
9,2022-03-31,USD,-2790000000,2474000000.0,,-8044000000,2614000000,9193000000,14951000000,-1516000000.0,...,,,,0.0,,-2666000000.0,,106000000.0,,-3844000000


# Step 3: EDA - Missing Values Analysis 

## Step 3(i): EDA - Show Missing Values in Each Column

In [12]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a report of the columns in the dataframe that contain null values.

    Columns are listed together with their null counts, sorted from the
    highest count to the lowest; columns without nulls are left out. If no
    column contains a null, a short message is printed instead.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect for null values.

    Returns:
    - None: The report is written to standard output.
    """
    # Per-column null counts, restricted to columns with at least one null
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0].sort_values(ascending=False)

    # The banner is the same whether or not anything is missing
    rule = '-' * 64
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)

    if affected.empty:
        print("No columns have null values.")
        return

    print(affected)

In [13]:
# Share of null entries per column, expressed as a percentage, largest first
null_values_percentage = (
    df.isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

print(
    '-' * 44,
    "Percentage(%) of null values in each column",
    '-' * 44,
    sep='\n',
)
print(null_values_percentage)
print('\n')

# Absolute null counts for the columns that have any
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
proceedsFromOperatingActivities                              100.00
proceedsFromSaleOfTreasuryStock                              100.00
proceedsFromIssuanceOfPreferredStock                         100.00
proceedsFromIssuanceOfCommonStock                            100.00
dividendPayoutPreferredStock                                 100.00
dividendPayoutCommonStock                                    100.00
dividendPayout                                               100.00
paymentsForRepurchaseOfPreferredStock                        100.00
proceedsFromRepaymentsOfShortTermDebt                         63.93
proceedsFromIssuanceOfLongTermDebtAndCapitalSecuritiesNet     40.98
changeInExchangeRate                                          16.39
proceedsFromRepurchaseOfEquity                                13.11
changeInCashAndCashEquivalents                    

## Step 3(ii): EDA - Handling Missing Values

In [14]:
# Drop sparsely populated columns.
#
# Null counts (out of 61 rows) that motivated the selection:
#   proceedsFromOperatingActivities                              61
#   paymentsForRepurchaseOfPreferredStock                        61
#   dividendPayout                                               61
#   dividendPayoutCommonStock                                    61
#   dividendPayoutPreferredStock                                 61
#   proceedsFromIssuanceOfCommonStock                            61
#   proceedsFromIssuanceOfPreferredStock                         61
#   proceedsFromSaleOfTreasuryStock                              61
#   proceedsFromRepaymentsOfShortTermDebt                        39
#   proceedsFromIssuanceOfLongTermDebtAndCapitalSecuritiesNet    25
#   changeInExchangeRate                                         10
#
# The first eight columns are completely empty; the last three are only
# partially empty but are removed as well.
columns_to_drop = [
    "proceedsFromOperatingActivities",
    "paymentsForRepurchaseOfPreferredStock",
    "dividendPayout",
    "dividendPayoutCommonStock",
    "dividendPayoutPreferredStock",
    "proceedsFromIssuanceOfCommonStock",
    "proceedsFromIssuanceOfPreferredStock",
    "proceedsFromSaleOfTreasuryStock",
    "proceedsFromRepaymentsOfShortTermDebt",
    "proceedsFromIssuanceOfLongTermDebtAndCapitalSecuritiesNet",
    "changeInExchangeRate",
]

# Reassign rather than mutate in place, and ignore labels that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=columns_to_drop, errors="ignore")

# Get total null values in each column
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
proceedsFromRepurchaseOfEquity    8
changeInReceivables               4
changeInCashAndCashEquivalents    4
paymentsForOperatingActivities    3
dtype: int64


In [15]:
# Impute the remaining gaps: each numeric column has its nulls replaced by
# that column's mean.
# NOTE(review): this puts the same synthetic value into every missing quarter
# of a column — confirm that is acceptable for the downstream analysis.
numeric_df = df.select_dtypes(include=[np.number])   # numeric columns only
numeric_df = numeric_df.fillna(numeric_df.mean())    # column-wise mean fill
df[numeric_df.columns] = numeric_df                  # write the filled columns back

# Confirm that no nulls are left
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


In [16]:
# Preview the first 20 rows of the frame again
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedCurrency,operatingCashflow,paymentsForOperatingActivities,changeInOperatingLiabilities,changeInOperatingAssets,depreciationDepletionAndAmortization,capitalExpenditures,changeInReceivables,changeInInventory,profitLoss,cashflowFromInvestment,cashflowFromFinancing,paymentsForRepurchaseOfCommonStock,paymentsForRepurchaseOfEquity,proceedsFromRepurchaseOfEquity,changeInCashAndCashEquivalents,netIncome
0,2024-06-30,USD,25281000000,2916000000.0,6412000000,6140000000,12038000000,17620000000,-1304035000.0,3085000000,13485000000,-22138000000,-4490000000,0,0,0.0,442596500.0,13485000000
1,2024-03-31,USD,18989000000,3406000000.0,-9714000000,925000000,11684000000,14925000000,-1304035000.0,-1776000000,10431000000,-17862000000,-1256000000,0,0,0.0,442596500.0,10431000000
2,2023-12-31,USD,42465000000,2840000000.0,14517000000,9622000000,13820000000,14588000000,-1304035000.0,-2643000000,10644000000,-12601000000,-6746000000,0,0,0.0,442596500.0,10624000000
3,2023-09-30,USD,21217000000,2768000000.0,2795000000,-808000000,12131000000,12479000000,-1304035000.0,-808000000,9879000000,-11753000000,-8948000000,0,0,0.0,442596500.0,9879000000
4,2023-06-30,USD,16476000000,2605000000.0,3185000000,2373000000,11589000000,11455000000,-5167000000.0,2373000000,6750000000,-9673000000,-6539000000,0,0,0.0,264000000.0,6750000000
5,2023-03-31,USD,4788000000,2548000000.0,-10446000000,-371000000,11123000000,14207000000,1521000000.0,-371000000,3172000000,-15806000000,6354000000,0,0,0.0,-4664000000.0,3172000000
6,2022-12-31,USD,29173000000,2449000000.0,1300000000,-3180000000,12685000000,16592000000,-8788000000.0,-3180000000,-2720000000,-10821000000,86000000,0,0,0.0,18438000000.0,278000000
7,2022-09-30,USD,11404000000,1901000000.0,-1172000000,-732000000,10327000000,16378000000,-4794000000.0,-732000000,2872000000,-15608000000,3016000000,0,0,0.0,-1188000000.0,2872000000
8,2022-06-30,USD,8965000000,2183000000.0,4020000000,3890000000,9716000000,15724000000,-6799000000.0,3890000000,-2028000000,-12078000000,4626000000,3334000000,3334000000,-3334000000.0,1513000000.0,-2028000000
9,2022-03-31,USD,-2790000000,2474000000.0,-8044000000,2614000000,9193000000,14951000000,-1516000000.0,2614000000,-3844000000,906000000,1990000000,2666000000,2666000000,-2666000000.0,106000000.0,-3844000000


# Step 4: EDA - Duplicate Values Analysis 

## Step 4(i): EDA - Show Duplicate Rows

In [17]:
# Count fully duplicated rows and express them as a share of all rows
total_rows = len(df)
duplicate_rows = df.duplicated().sum()
duplicate_percentage = (duplicate_rows / total_rows) * 100

# Report the percentage
print(
    '-' * 48,
    "Percentage(%) of duplicate rows in the DataFrame",
    '-' * 48,
    f"{duplicate_percentage:.2f}%",
    sep='\n',
)
print('\n')

# Report the absolute count
print(
    '-' * 30,
    "Total number of duplicate rows",
    '-' * 30,
    sep='\n',
)
print(duplicate_rows)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


## Step 4(ii): EDA - Handling Duplicate Rows

In [18]:
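# No duplicate rows were found in the check above, so nothing is dropped here.
# The code below is kept disabled for datasets that do contain duplicates.

# # Drop all duplicate rows
# df_cleaned = df.drop_duplicates()

# print('-' * 22)
# print("Duplicate rows dropped")
# print('-' * 22)
# print(f"Original number of rows: {total_rows}")
# print(f"Number of rows after dropping duplicates: {len(df_cleaned)}")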

# Step 5: EDA - Save Cleaned Dataset

In [19]:
# Save the cleaned frame to CSV.
# index=False keeps the RangeIndex out of the file; otherwise it is written as
# an unnamed first column and comes back as "Unnamed: 0" when the file is re-read.
df.to_csv('DC_AMZN_quarterly_cash_flow.csv', index=False)