In [8]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [9]:
# Read the quarterly earnings data
df = pd.read_csv("AMZN_quarterly_earnings.csv")

# Normalise the column names in one pass: trim surrounding whitespace, then
# collapse every internal run of whitespace into a single space
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

# Step 2: Overview of Dataset

In [10]:
# Summary of the frame: column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   fiscalDateEnding    109 non-null    object 
 1   reportedDate        109 non-null    object 
 2   reportedEPS         109 non-null    float64
 3   estimatedEPS        109 non-null    float64
 4   surprise            109 non-null    float64
 5   surprisePercentage  109 non-null    float64
 6   reportTime          109 non-null    object 
dtypes: float64(4), object(3)
memory usage: 6.1+ KB


In [11]:
# Preview up to the first 20 rows
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedDate,reportedEPS,estimatedEPS,surprise,surprisePercentage,reportTime
0,2024-06-30,2024-08-01,1.26,1.03,0.23,22.3301,post-market
1,2024-03-31,2024-04-30,0.98,0.82,0.16,19.5122,post-market
2,2023-12-31,2024-02-01,1.0,0.8,0.2,25.0,post-market
3,2023-09-30,2023-10-26,0.94,0.58,0.36,62.069,post-market
4,2023-06-30,2023-08-03,0.65,0.35,0.3,85.7143,post-market
5,2023-03-31,2023-04-27,0.31,0.21,0.1,47.619,post-market
6,2022-12-31,2023-02-02,0.25,0.18,0.07,38.8889,post-market
7,2022-09-30,2022-10-27,0.17,0.22,-0.05,-22.7273,post-market
8,2022-06-30,2022-07-28,0.18,0.14,0.04,28.5714,post-market
9,2022-03-31,2022-04-28,0.37,0.42,-0.05,-11.9048,post-market


# Step 3: EDA - Missing Values Analysis 

## Step 3(i): EDA - Show Missing Values in Each Column

In [12]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Report which columns of a dataframe contain nulls, and how many.

    A fixed banner is printed first. Columns with at least one null are then
    printed together with their null counts, largest count first. If no
    column has nulls, a single message saying so is printed instead.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect.

    Returns:
    - None: Everything is written to stdout.
    """
    separator = '-' * 64

    # Per-column null counts, restricted to columns that actually have nulls
    null_counts = df.isnull().sum()
    affected = null_counts[null_counts > 0].sort_values(ascending=False)

    # The banner is the same whatever the outcome, so emit it once up front
    print(separator)
    print("Total null values in each column (only columns with null values)")
    print(separator)

    # Nothing to list: say so and stop
    if affected.empty:
        print("No columns have null values.")
        return

    print(affected)

In [13]:
# Share of null entries per column, expressed as a percentage, highest first
null_values_percentage = (
    df.isnull()
    .mean()
    .round(4)
    .mul(100)
    .sort_values(ascending=False)
)

print('-' * 44, "Percentage(%) of null values in each column", '-' * 44, sep='\n')
print(null_values_percentage)
print('\n')

# Absolute null counts, limited to the columns that contain nulls
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
fiscalDateEnding      0.0
reportedDate          0.0
reportedEPS           0.0
estimatedEPS          0.0
surprise              0.0
surprisePercentage    0.0
reportTime            0.0
dtype: float64


----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


## Step 3(ii): EDA - Handling Missing Values

In [14]:
# NOTE: left commented out -- the null check above reports no missing values, so there is nothing to fill.
# # Fill null values in the numeric columns with the column mean
# numeric_df = df.select_dtypes(include=[np.number]) # Select only numeric columns
# numeric_df.fillna(numeric_df.mean(), inplace=True)  # Fill missing values in numeric columns with the column mean
# df[numeric_df.columns] = numeric_df # Merge back with non-numeric columns if needed

# # Get total null values in each column
# display_columns_with_null_values(df)

In [15]:
# Preview up to the first 20 rows again; the fill step above is commented out, so df has not been modified
df.head(20)

Unnamed: 0,fiscalDateEnding,reportedDate,reportedEPS,estimatedEPS,surprise,surprisePercentage,reportTime
0,2024-06-30,2024-08-01,1.26,1.03,0.23,22.3301,post-market
1,2024-03-31,2024-04-30,0.98,0.82,0.16,19.5122,post-market
2,2023-12-31,2024-02-01,1.0,0.8,0.2,25.0,post-market
3,2023-09-30,2023-10-26,0.94,0.58,0.36,62.069,post-market
4,2023-06-30,2023-08-03,0.65,0.35,0.3,85.7143,post-market
5,2023-03-31,2023-04-27,0.31,0.21,0.1,47.619,post-market
6,2022-12-31,2023-02-02,0.25,0.18,0.07,38.8889,post-market
7,2022-09-30,2022-10-27,0.17,0.22,-0.05,-22.7273,post-market
8,2022-06-30,2022-07-28,0.18,0.14,0.04,28.5714,post-market
9,2022-03-31,2022-04-28,0.37,0.42,-0.05,-11.9048,post-market


# Step 4: EDA - Duplicate Values Analysis 

## Step 4(i): EDA - Show Duplicate Rows

In [16]:
# Count the rows that exactly repeat an earlier row (the first occurrence is not counted)
total_rows = len(df)
duplicate_rows = df.duplicated().sum()

# Share of duplicate rows as a percentage.
# Guarded so that an empty DataFrame reports 0.00% instead of dividing by zero.
duplicate_percentage = (duplicate_rows / total_rows) * 100 if total_rows else 0.0

print('-' * 48)
print("Percentage(%) of duplicate rows in the DataFrame")
print('-' * 48)
print(f"{duplicate_percentage:.2f}%")
print('\n')

# Get total number of duplicate rows
print('-' * 30)
print("Total number of duplicate rows")
print('-' * 30)
print(duplicate_rows)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


## Step 4(ii): EDA - Handling Duplicate Rows

In [17]:
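# NOTE: left commented out -- the duplicate check above found no duplicate rows.
# NOTE(review): if this is re-enabled, the save step below still writes `df`, not `df_cleaned`.
# # Drop all duplicate rows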
# df_cleaned = df.drop_duplicates()

# print('-' * 22)
# print("Duplicate rows dropped")
# print('-' * 22)
# print(f"Original number of rows: {total_rows}")
# print(f"Number of rows after dropping duplicates: {len(df_cleaned)}")

# Step 5: EDA - Save Cleaned Dataset

In [18]:
# Save the dataset to CSV.
# index=False keeps the default RangeIndex out of the file; without it the index
# is written as an unnamed first column that comes back as "Unnamed: 0" on reload.
df.to_csv('DC_AMZN_quarterly_earnings.csv', index=False)