In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [2]:
# Load the pre-processed quarterly AMZN dataset (prices, technical indicators
# and economic indicators).
# The path uses forward slashes so it resolves on Windows, macOS and Linux
# alike; the earlier backslash form only worked on Windows and depended on
# invalid escape sequences such as "\E" being passed through unchanged.
df = pd.read_csv("Data/EDA Preprocessed Data/EDA_AMZN_Historical_Quarterly_With_Economic_Indicators.csv")

# Normalize column names: trim leading/trailing whitespace, then collapse any
# internal run of whitespace into a single space.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)


# Step 2: Overview of Dataset

In [3]:
# Summarize the frame: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Date                            64 non-null     object 
 1   Open                            64 non-null     float64
 2   High                            64 non-null     float64
 3   Low                             64 non-null     float64
 4   Close                           64 non-null     float64
 5   Volume                          64 non-null     int64  
 6   MA_21                           64 non-null     float64
 7   RSI                             64 non-null     float64
 8   MACD                            64 non-null     float64
 9   Stochastic_Oscillator           64 non-null     float64
 10  ATR                             64 non-null     float64
 11  Momentum_21                     64 non-null     float64
 12  OBV                             64 non

In [4]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,...,OBV,Cumulative_Return,Volatility,Price_Gap,Consumer_Price_Index_Quarterly,Durable_Goods_Orders_Quarterly,Federal_Funds_Rate_Quarterly,Retail_Sales_Quarterly,Treasury_Yield_Quarterly,Unemployment_Rate_Quarterly
0,2009-03-31,2.5675,3.7805,2.3815,3.672,12370496000,3.070679,58.512097,0.136847,65.430878,...,6131718000,1.350993,0.211514,0.00145,212.015,145856.666667,0.183333,276177.0,3.453333,145856.666667
1,2009-06-30,3.651,4.428,3.5855,4.183,9494416000,3.910458,55.309698,0.023607,58.383477,...,11657492000,1.538999,0.161083,-0.001254,214.263,147576.333333,0.18,301857.333333,4.17,147576.333333
2,2009-09-30,4.221,4.725,3.7705,4.668,8324122000,4.18242,53.239807,0.126727,53.727712,...,15869958000,1.717439,0.157503,-0.003992,215.718,156525.666667,0.156667,303807.666667,4.323333,156525.666667
3,2009-12-31,4.625,7.2955,4.4135,6.726,12727852000,5.718096,63.826041,0.116955,69.004807,...,22967222000,2.474614,0.328069,0.027391,216.152,162970.333333,0.12,322315.0,4.33,162970.333333
4,2010-03-31,6.8125,6.9095,5.691,6.7885,11980988000,6.306039,51.020327,0.142294,48.488807,...,28442446000,2.497609,0.214745,0.006992,217.019667,177515.333333,0.133333,291971.666667,4.62,177515.333333


# Step 3: EDA - Missing Values Analysis 

## Step 3)i): EDA - Show Missing Values in each Column

In [5]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a report of the columns in ``df`` that contain null values.

    The report starts with a fixed three-line banner. It is followed either by
    the per-column null counts (largest first, columns without nulls omitted)
    or by a single message stating that no column has null values.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect.

    Returns:
    - None: The report is written to standard output.
    """
    # Null count per column, keeping only the columns that actually have nulls.
    null_counts = df.isna().sum()
    affected = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

    # The banner is the same whether or not any column is affected.
    rule = '-' * 64
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)

    if affected.empty:
        print("No columns have null values.")
    else:
        print(affected)

In [6]:
# Share of null entries per column: the fraction is rounded to 4 decimals,
# scaled to a percentage, and listed from most to least affected.
null_values_percentage = (
    df.isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

# Banner, the percentage table, then a blank-line separator, all in one call.
print(
    '-' * 44,
    "Percentage(%) of null values in each column",
    '-' * 44,
    null_values_percentage,
    '\n',
    sep='\n',
)

# Absolute null counts, restricted to the columns that have any.
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
Durable_Goods_Orders_Quarterly    1.56
Consumer_Price_Index_Quarterly    1.56
Treasury_Yield_Quarterly          1.56
Unemployment_Rate_Quarterly       1.56
Retail_Sales_Quarterly            1.56
Federal_Funds_Rate_Quarterly      1.56
Volume                            0.00
Close                             0.00
Low                               0.00
High                              0.00
Open                              0.00
Date                              0.00
MA_21                             0.00
RSI                               0.00
Cumulative_Return                 0.00
OBV                               0.00
Momentum_21                       0.00
ATR                               0.00
Stochastic_Oscillator             0.00
MACD                              0.00
Volatility                        0.00
Price_Gap                         0.00
dtype: f

## Step 3)ii): EDA - Handling Missing Values

In [7]:
# Drop every row that contains at least one missing value.
# In the report above only the six economic-indicator columns showed nulls
# (1.56% each); all other columns were complete.
df = df.dropna()


# Step 4: EDA - Duplicate Values Analysis 

## Step 4)i): EDA - Show Duplicate Values Rows

In [8]:
# Count fully duplicated rows and express them as a share of all rows.
total_rows = df.shape[0]
duplicate_rows = df.duplicated().sum()
duplicate_percentage = (duplicate_rows / total_rows) * 100

# Percentage report, followed by a blank-line separator.
print(
    '-' * 48,
    "Percentage(%) of duplicate rows in the DataFrame",
    '-' * 48,
    f"{duplicate_percentage:.2f}%",
    '\n',
    sep='\n',
)

# Absolute count report.
print(
    '-' * 30,
    "Total number of duplicate rows",
    '-' * 30,
    duplicate_rows,
    sep='\n',
)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


## Step 4)ii): EDA - Handling Duplicate Values Rows

In [9]:
# # Drop all duplicate rows
# df_cleaned = df.drop_duplicates()

# print('-' * 22)
# print("Duplicate rows dropped")
# print('-' * 22)
# print(f"Original number of rows: {total_rows}")
# print(f"Number of rows after dropping duplicates: {len(df_cleaned)}")

# Step 5: Aggregating Fundamental Data

- **`Fundamental Data`**: 

  - **Balance Sheet**
  - **Cash Flow**
  - **Earnings**
  - **Income Statement**

## Step 5)i) Balance Sheet

In [10]:
# Load the quarterly balance sheet and discard the saved index column
# ('Unnamed: 0') and the 'reportedCurrency' column.
# Forward slashes keep the relative path portable; the backslash form only
# resolved on Windows and relied on invalid escape sequences ("\F", "\B", "\D").
balance_sheet_quarterly_df = pd.read_csv(
    "Data/Fundamental Data/Balance Sheet/DC_AMZN_quarterly_balance_sheet.csv"
).drop(columns=['Unnamed: 0', 'reportedCurrency'])

balance_sheet_quarterly_df.head()


Unnamed: 0,fiscalDateEnding,totalAssets,totalCurrentAssets,cashAndCashEquivalentsAtCarryingValue,cashAndShortTermInvestments,inventory,currentNetReceivables,totalNonCurrentAssets,propertyPlantEquipment,intangibleAssets,...,longTermDebt,longTermDebtNoncurrent,shortLongTermDebtTotal,otherCurrentLiabilities,otherNonCurrentLiabilities,totalShareholderEquity,treasuryStock,retainedEarnings,commonStock,commonStockSharesOutstanding
0,2024-06-30,554818000000,173307000000,71178000000,89092000000,34109000000,50106000000,160798000000,220717000000,22879000000,...,62683000000.0,54889000000.0,62759000000.0,42747000000,27226000000.0,236447000000,7837000000,137534000000,110000000,10490000000
1,2024-03-31,530969000000,163989000000,72852000000,85074000000,31147000000,47768000000,157036000000,209950000000,22770000000,...,66852000000.0,57634000000.0,66902000000.0,44915000000,26657000000.0,216661000000,7837000000,124049000000,109000000,10403000000
2,2023-12-31,527854000000,172351000000,73387000000,86780000000,33318000000,52253000000,159019000000,204177000000,30476000000,...,67182000000.0,58314000000.0,67329000000.0,45764000000,25451000000.0,201875000000,7837000000,113618000000,109000000,10383000000
3,2023-09-30,486883000000,142995000000,49605000000,64169000000,35406000000,43420000000,147431000000,196468000000,22749000000,...,67472000000.0,61098000000.0,67638000000.0,42340000000,21707000000.0,182973000000,7837000000,102994000000,108000000,10330000000
4,2023-06-30,477607000000,140482000000,49529000000,63970000000,36587000000,39925000000,143356000000,193784000000,22785000000,...,67472000000.0,63092000000.0,68572000000.0,49232000000,21853000000.0,168602000000,7837000000,93115000000,108000000,10313000000


## Step 5)ii) Cash Flow

In [11]:
# Load the quarterly cash-flow statement and discard the saved index column
# ('Unnamed: 0') and the 'reportedCurrency' column.
# Forward slashes keep the relative path portable; the backslash form only
# resolved on Windows and relied on invalid escape sequences ("\F", "\C", "\D").
cash_flow_quarterly_df = pd.read_csv(
    "Data/Fundamental Data/Cash Flow/DC_AMZN_quarterly_cash_flow.csv"
).drop(columns=['Unnamed: 0', 'reportedCurrency'])

cash_flow_quarterly_df.head()


Unnamed: 0,fiscalDateEnding,operatingCashflow,paymentsForOperatingActivities,changeInOperatingLiabilities,changeInOperatingAssets,depreciationDepletionAndAmortization,capitalExpenditures,changeInReceivables,changeInInventory,profitLoss,cashflowFromInvestment,cashflowFromFinancing,paymentsForRepurchaseOfCommonStock,paymentsForRepurchaseOfEquity,proceedsFromRepurchaseOfEquity,changeInCashAndCashEquivalents,netIncome
0,2024-06-30,25281000000,2916000000.0,6412000000,6140000000,12038000000,17620000000,-1304035000.0,3085000000,13485000000,-22138000000,-4490000000,0,0,0.0,442596500.0,13485000000
1,2024-03-31,18989000000,3406000000.0,-9714000000,925000000,11684000000,14925000000,-1304035000.0,-1776000000,10431000000,-17862000000,-1256000000,0,0,0.0,442596500.0,10431000000
2,2023-12-31,42465000000,2840000000.0,14517000000,9622000000,13820000000,14588000000,-1304035000.0,-2643000000,10644000000,-12601000000,-6746000000,0,0,0.0,442596500.0,10624000000
3,2023-09-30,21217000000,2768000000.0,2795000000,-808000000,12131000000,12479000000,-1304035000.0,-808000000,9879000000,-11753000000,-8948000000,0,0,0.0,442596500.0,9879000000
4,2023-06-30,16476000000,2605000000.0,3185000000,2373000000,11589000000,11455000000,-5167000000.0,2373000000,6750000000,-9673000000,-6539000000,0,0,0.0,264000000.0,6750000000


## Step 5)iii) Earnings

In [12]:
# Load the quarterly earnings data and discard the saved index column
# ('Unnamed: 0') together with the 'reportedDate' and 'reportTime' columns.
# Forward slashes keep the relative path portable; the backslash form only
# resolved on Windows and relied on invalid escape sequences ("\F", "\E", "\D").
earnings_quarterly_df = pd.read_csv(
    "Data/Fundamental Data/Earnings/DC_AMZN_quarterly_earnings.csv"
).drop(columns=['Unnamed: 0', 'reportedDate', 'reportTime'])

earnings_quarterly_df.head()


Unnamed: 0,fiscalDateEnding,reportedEPS,estimatedEPS,surprise,surprisePercentage
0,2024-06-30,1.26,1.03,0.23,22.3301
1,2024-03-31,0.98,0.82,0.16,19.5122
2,2023-12-31,1.0,0.8,0.2,25.0
3,2023-09-30,0.94,0.58,0.36,62.069
4,2023-06-30,0.65,0.35,0.3,85.7143


## Step 5)iv) Income Statement

In [13]:
# Load the quarterly income statement and discard the saved index column
# ('Unnamed: 0') and the 'reportedCurrency' column.
# Forward slashes keep the relative path portable; the backslash form only
# resolved on Windows and relied on invalid escape sequences ("\F", "\I", "\D").
income_statement_df = pd.read_csv(
    "Data/Fundamental Data/Income Statement/DC_AMZN_quarterly_income_statement.csv"
).drop(columns=['Unnamed: 0', 'reportedCurrency'])

income_statement_df.head()

Unnamed: 0,fiscalDateEnding,grossProfit,totalRevenue,costOfRevenue,costofGoodsAndServicesSold,operatingIncome,sellingGeneralAndAdministrative,researchAndDevelopment,operatingExpenses,investmentIncomeNet,...,otherNonOperatingIncome,depreciationAndAmortization,incomeBeforeTax,incomeTaxExpense,interestAndDebtExpense,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome
0,2024-06-30,59553000000,147839000000,88286000000,73785000000,14672000000,13553000000,22304000000,14501000000,1180000000.0,...,-18000000,948000000,15252000000,1767000000,589000000.0,13485000000.0,13090000000.0,14672000000,15620000000,13485000000
1,2024-03-31,56617000000,142595000000,85978000000,72633000000,15307000000,12404000000,16960000000,13345000000,993000000.0,...,-2673000000,941000000,12898000000,2467000000,644000000.0,10431000000.0,9873000000.0,13542000000,14483000000,10431000000
2,2023-12-31,39401000000,169328000000,129927000000,92553000000,13209000000,36212000000,22038000000,39274000000,901000000.0,...,289000000,2082000000,13666000000,3042000000,713000000.0,10644000000.0,12587000000.0,14379000000,16461000000,10624000000
3,2023-09-30,52610000000,142183000000,89573000000,75022000000,11188000000,13112000000,21203000000,14551000000,776000000.0,...,1031000000,1439000000,12185000000,2306000000,806000000.0,9879000000.0,8556000000.0,12991000000,14430000000,9879000000
4,2023-06-30,48693000000,133552000000,84859000000,69373000000,7681000000,13947000000,18829000000,15486000000,661000000.0,...,61000000,1539000000,7554000000,804000000,840000000.0,6750000000.0,7043000000.0,8394000000,9933000000,6750000000


# Step 6: Merge Fundamental Data and Save to CSV

In [14]:
# Parse the join keys as datetimes so 'Date' and 'fiscalDateEnding' compare by
# value rather than as strings.
df['Date'] = pd.to_datetime(df['Date'])
for fundamentals in (
    balance_sheet_quarterly_df,
    cash_flow_quarterly_df,
    earnings_quarterly_df,
    income_statement_df,
):
    fundamentals['fiscalDateEnding'] = pd.to_datetime(fundamentals['fiscalDateEnding'])

# Left-merge each fundamentals table onto df in a fixed order (balance sheet,
# cash flow, earnings, income statement), matching df['Date'] against
# 'fiscalDateEnding' and dropping the redundant key after every merge.
# The order matters: 'netIncome' exists in both the cash-flow and the income
# statement tables, so pandas suffixes them as netIncome_x / netIncome_y.
for fundamentals in (
    balance_sheet_quarterly_df,
    cash_flow_quarterly_df,
    earnings_quarterly_df,
    income_statement_df,
):
    df = df.merge(
        fundamentals,
        left_on='Date',
        right_on='fiscalDateEnding',
        how='left',
    ).drop(columns=['fiscalDateEnding'])

# Preview the combined frame.
df.head()


Unnamed: 0,Date,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,...,otherNonOperatingIncome,depreciationAndAmortization,incomeBeforeTax,incomeTaxExpense,interestAndDebtExpense,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome_y
0,2009-03-31,2.5675,3.7805,2.3815,3.672,12370496000,3.070679,58.512097,0.136847,65.430878,...,4000000.0,87000000.0,246000000.0,69000000.0,255967200.0,2199525000.0,2174295000.0,258000000.0,331000000.0,177000000.0
1,2009-06-30,3.651,4.428,3.5855,4.183,9494416000,3.910458,55.309698,0.023607,58.383477,...,19000000.0,84000000.0,181000000.0,39000000.0,7000000.0,284000000.0,227000000.0,188000000.0,272000000.0,142000000.0
2,2009-09-30,4.221,4.725,3.7705,4.668,8324122000,4.18242,53.239807,0.126727,53.727712,...,11000000.0,95000000.0,259000000.0,60000000.0,7000000.0,199000000.0,222000000.0,266000000.0,361000000.0,199000000.0
3,2009-12-31,4.625,7.2955,4.4135,6.726,12727852000,5.718096,63.826041,0.116955,69.004807,...,-6000000.0,112000000.0,469000000.0,85000000.0,8000000.0,386000000.0,363000000.0,477000000.0,589000000.0,384000000.0
4,2010-03-31,6.8125,6.9095,5.691,6.7885,11980988000,6.306039,51.020327,0.142294,48.488807,...,3000000.0,120000000.0,399000000.0,100000000.0,7000000.0,299000000.0,210000000.0,406000000.0,526000000.0,299000000.0


In [15]:
# Re-check for nulls after the left merges: list every column that now
# contains missing values, together with its null count.
display_columns_with_null_values(df)

----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
totalAssets                              2
totalCurrentAssets                       2
cashAndCashEquivalentsAtCarryingValue    2
cashAndShortTermInvestments              2
inventory                                2
                                        ..
netIncomeFromContinuingOperations        1
comprehensiveIncomeNetOfTax              1
ebit                                     1
ebitda                                   1
netIncome_y                              1
Length: 68, dtype: int64


In [16]:
# Persist the merged frame without the index column.
# Forward slashes keep the relative path portable; the backslash form only
# resolved on Windows and relied on the invalid escape sequence "\E".
df.to_csv("Data/EDA Preprocessed Data/EDA_AMZN_Historical_Quarterly_With_Fundamental_Data_Economic_Indicators.csv", index=False)