In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [2]:
# Load the quarterly AMZN dataset.
# The path uses a forward slash: the previous 'Data\FE_...' literal only worked
# because "\F" is not a recognised escape sequence (newer Python versions warn
# about it) and because Windows accepts "\" as a separator. "/" works everywhere
# and matches the export paths used at the end of the notebook.
df = pd.read_csv('Data/FE_Classification_AMZN_Historical_Quarterly_With_Fundamental_Data_Economic_Indicators.csv')

# Normalise column names: trim leading/trailing whitespace, then collapse any
# run of whitespace inside a name to a single space.
df.columns = df.columns.str.strip().str.replace(r'\s+', ' ', regex=True)

# 'Date' is intentionally kept: a later cell parses it to split the data by year.




# Step 2: Overview of Dataset

In [3]:
# Report the row count, then let pandas summarise columns, null counts and dtypes
num_of_rows = df.shape[0]
print(f"The number of rows is {num_of_rows}")
print('\n')

df.info()

The number of rows is 63


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 92 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Date                                   63 non-null     object 
 1   Open                                   63 non-null     float64
 2   High                                   63 non-null     float64
 3   Low                                    63 non-null     float64
 4   Close                                  63 non-null     float64
 5   Volume                                 63 non-null     int64  
 6   MA_21                                  63 non-null     float64
 7   RSI                                    63 non-null     float64
 8   MACD                                   63 non-null     float64
 9   Stochastic_Oscillator                  63 non-null     float64
 10  ATR                                    63 non-nul

In [4]:
# Preview the first rows of the loaded frame (rendered as the cell output)
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,...,incomeBeforeTax,incomeTaxExpense,interestAndDebtExpense,netIncomeFromContinuingOperations,comprehensiveIncomeNetOfTax,ebit,ebitda,netIncome_y,Forward_Return,Price_Movement_Class
0,2009-03-31,2.5675,3.7805,2.3815,3.672,12370496000,3.070679,58.512097,0.136847,65.430878,...,246000000.0,69000000.0,255967200.0,2199525000.0,2174295000.0,258000000.0,331000000.0,177000000.0,13.916127,4
1,2009-06-30,3.651,4.428,3.5855,4.183,9494416000,3.910458,55.309698,0.023607,58.383477,...,181000000.0,39000000.0,7000000.0,284000000.0,227000000.0,188000000.0,272000000.0,142000000.0,11.594552,4
2,2009-09-30,4.221,4.725,3.7705,4.668,8324122000,4.18242,53.239807,0.126727,53.727712,...,259000000.0,60000000.0,7000000.0,199000000.0,222000000.0,266000000.0,361000000.0,199000000.0,44.087393,4
3,2009-12-31,4.625,7.2955,4.4135,6.726,12727852000,5.718096,63.826041,0.116955,69.004807,...,469000000.0,85000000.0,8000000.0,386000000.0,363000000.0,477000000.0,589000000.0,384000000.0,0.92923,2
4,2010-03-31,6.8125,6.9095,5.691,6.7885,11980988000,6.306039,51.020327,0.142294,48.488807,...,399000000.0,100000000.0,7000000.0,299000000.0,210000000.0,406000000.0,526000000.0,299000000.0,-19.525669,0


# Step 3: EDA - Missing Values Analysis 

## Step 3)i): EDA - Show Missing Values in each Column

In [5]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a report of how many null values each column contains.

    Only columns with at least one null are listed, ordered from the most
    nulls to the fewest. When no column has nulls, a short message is printed
    in place of the listing.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect.

    Returns:
    - None: The report is written to standard output.
    """
    rule = '-' * 64

    # Per-column null counts, narrowed to the columns that actually have some
    null_counts = df.isnull().sum()
    affected = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

    # The banner is the same whether or not anything was found
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)
    print("No columns have null values." if affected.empty else affected)

In [6]:
# Share of missing entries per column, expressed as a percentage, largest first
null_values_percentage = (
    df.isnull()
      .mean()
      .round(4)
      .mul(100)
      .sort_values(ascending=False)
)
for line in ('-' * 44, "Percentage(%) of null values in each column", '-' * 44):
    print(line)
print(null_values_percentage)
print('\n')

# Absolute null counts, limited to the columns that have any
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
Forward_Return                 1.59
Date                           0.00
profitLoss                     0.00
estimatedEPS                   0.00
reportedEPS                    0.00
                               ... 
totalNonCurrentAssets          0.00
currentNetReceivables          0.00
inventory                      0.00
cashAndShortTermInvestments    0.00
Price_Movement_Class           0.00
Length: 92, dtype: float64


----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
Forward_Return    1
dtype: int64


## Step 3)ii): EDA - Handling Missing Values

In [7]:
# Impute missing entries in the numeric columns with each column's own mean
numeric_df = df.select_dtypes(include=[np.number])  # numeric columns only
numeric_df = numeric_df.fillna(numeric_df.mean())   # column-wise mean imputation
df[numeric_df.columns] = numeric_df                 # write the filled columns back next to the non-numeric ones

# Re-run the null report to confirm nothing is left to fill
display_columns_with_null_values(df)


----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


# Step 4: EDA - Duplicate Values Analysis 

## Step 4)i): EDA - Show Duplicate Values Rows

In [8]:
# Get percentage of duplicate rows
total_rows = len(df)
duplicate_rows = df.duplicated().sum()
duplicate_percentage = (duplicate_rows / total_rows) * 100

print('-' * 48)
print("Percentage(%) of duplicate rows in the DataFrame")
print('-' * 48)
print(f"{duplicate_percentage:.2f}%")
print('\n')

# Get total number of duplicate rows
print('-' * 30)
print("Total number of duplicate rows")
print('-' * 30)
print(duplicate_rows)


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


# Step 5: EDA - Outliers Detection and Removal

## Step 5)i): EDA - Outliers Detection

In [9]:
def create_box_plots(df, columns):
    """
    Create horizontal box plots for the specified columns of a pandas DataFrame.

    Args:
    df (pd.DataFrame): The DataFrame containing the data.
    columns (list of str): The list of columns for which to create box plots.

    Returns:
    plotly.graph_objs._figure.Figure: The figure object containing the box plots.
    """
    # Passing the column list as `x` makes Plotly Express draw horizontal boxes:
    # the values run along the x-axis and each column gets its own row on the
    # y-axis, so the axis titles below are labelled accordingly.
    fig = px.box(df, x=columns)
    fig.update_layout(
        title="Box Plot",
        xaxis_title="Values",
        yaxis_title="Columns",
        showlegend=False
    )
    return fig


# Plot every numeric column of the frame in a single box-plot figure
columns = list(df.select_dtypes(include=['number']).columns)

fig = create_box_plots(df, columns=columns)
fig.show()


## Step 5)ii): EDA - Outliers Removal

In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
import pandas as pd

def plot_outliers_before_after_removal(feature_name, before_removal_data, after_removal_data):
    """
    Show two stacked horizontal box plots of one feature: the data before
    outlier filtering on top, the data after filtering underneath.

    Args:
        feature_name (str): Feature name, used in the figure title.
        before_removal_data (pd.Series): Values before removing outliers.
        after_removal_data (pd.Series): Values after removing outliers.

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    # One (title, data) pair per subplot row, top to bottom
    panels = (
        ("Before Filtering", before_removal_data),
        ("After Filtering", after_removal_data),
    )

    fig = make_subplots(
        rows=2,
        cols=1,
        subplot_titles=tuple(label for label, _ in panels),
    )

    # Each panel gets a horizontal box trace in its own row of the single column
    for row_number, (label, values) in enumerate(panels, start=1):
        fig.add_trace(
            go.Box(x=values, name=label, orientation='h'),
            row=row_number, col=1
        )

    fig.update_layout(
        height=600,
        title_text=f"Comparison of <b>{feature_name}</b> Before and After Filtering Outliers",
    )

    fig.show()

# Step 6: EDA - Feature Selection

## Step 6)i): EDA - Numerical Feature Selection

In [11]:
import pandas as pd
import plotly.figure_factory as ff

def heatmap_correlations(df: pd.DataFrame,
                         targetVariable: str,
                         colorscale: str = "Viridis",
                         width: int = 800,
                         height: int = 800,
                         txt_output: str = "correlation_results.txt",
                         image_output: str = "heatmap_correlations.png",
                         threshold: float = 0.1):
    """
    Compute correlations between the numeric columns, save the target's
    correlations to a text file, build a correlation heatmap figure, and
    return the features whose absolute correlation with the target is below
    the threshold.

    Parameters:
    - df (pd.DataFrame): Data to analyse; non-numeric columns are ignored.
    - targetVariable (str): The dependent variable for correlations.
    - colorscale (str): Colorscale for the heatmap.
    - width (int): Width of the plot.
    - height (int): Height of the plot.
    - txt_output (str): Path to save the correlation results as a text file.
    - image_output (str): Path for the heatmap image (currently unused, because
      saving the image is disabled below).
    - threshold (float): Features with |correlation| strictly below this value
      are reported for removal.

    Returns:
    - List of feature names with correlations below the threshold. The target
      itself is never included. An empty list is returned (after printing a
      message) when the target is not among the numeric columns, so the result
      can always be passed straight to ``DataFrame.drop(columns=...)``.
    """
    # Filter only numeric columns
    df_numeric = df.select_dtypes(include=['number'])

    if targetVariable not in df_numeric.columns:
        # Reached both for a non-numeric target and for a column that does not exist
        print(f"The target variable {targetVariable} is not numeric.")
        return []

    # Compute the correlation matrix once; it feeds both the target ranking
    # and the heatmap.
    corr_matrix = df_numeric.corr()
    target_corr = corr_matrix[targetVariable].sort_values(ascending=False)

    # Write the correlations to a text file (the target's own row is included)
    with open(txt_output, 'w', encoding='utf-8') as f:
        f.write('-' * 56 + '\n')
        f.write(f"Correlation of {targetVariable} with other Independent variables\n")
        f.write('-' * 56 + '\n')
        f.write(target_corr.to_string() + '\n')

    print(f"Correlation results saved to {txt_output}")

    # Create heatmap
    fig = ff.create_annotated_heatmap(
        z=corr_matrix.values,
        x=list(corr_matrix.columns),
        y=list(corr_matrix.index),
        annotation_text=corr_matrix.round(2).values,
        colorscale=colorscale
    )

    fig.update_layout(
        title="Correlation Heatmap of Variables",
        width=width,
        height=height
    )

    # NOTE(review): with the export below disabled, the figure is neither
    # displayed, saved nor returned - confirm whether it should be shown.
    # # Save the heatmap image
    # fig.write_image(image_output)
    # print(f"Heatmap image saved to {image_output}")

    # Identify features with correlations below the threshold. The target's own
    # entry is dropped first so it can never be nominated for removal. NaN
    # correlations compare as False and are therefore not reported.
    feature_corr = target_corr.drop(labels=targetVariable)
    low_correlation_features = feature_corr[feature_corr.abs() < threshold].index.tolist()

    # Return the list of features to remove
    return low_correlation_features

# Settings for the correlation-based feature screen
heatmap_options = dict(
    targetVariable='Price_Movement_Class',
    colorscale='RdYlGn',
    width=2500,
    height=2500,
    txt_output='0) correlation_results.txt',
    image_output='heatmap_correlations.png',
    threshold=0.10,  # Default threshold = 0.10
)

features_to_remove = heatmap_correlations(df, **heatmap_options)

print(f"Features to remove: {features_to_remove}")


Correlation results saved to 0) correlation_results.txt
Features to remove: ['cashflowFromInvestment', 'proceedsFromRepurchaseOfEquity', 'surprise', 'commonStockSharesOutstanding', 'investmentIncomeNet', 'Treasury_Yield_Quarterly', 'cashflowFromFinancing', 'totalNonCurrentAssets', 'Federal_Funds_Rate_Quarterly', 'netInterestIncome', 'treasuryStock', 'operatingIncome', 'interestAndDebtExpense', 'depreciationAndAmortization', 'commonStock', 'cashAndCashEquivalentsAtCarryingValue', 'intangibleAssets', 'shortLongTermDebtTotal', 'interestExpense', 'ebit', 'goodwill', 'ebitda', 'paymentsForRepurchaseOfEquity', 'incomeBeforeTax', 'comprehensiveIncomeNetOfTax', 'sellingGeneralAndAdministrative', 'operatingCashflow', 'researchAndDevelopment', 'otherNonCurrentAssets', 'longTermDebtNoncurrent', 'paymentsForRepurchaseOfCommonStock', 'paymentsForOperatingActivities', 'reportedEPS', 'operatingExpenses', 'netIncome_x', 'changeInCashAndCashEquivalents', 'currentNetReceivables', 'depreciationDepletionA

In [12]:
# Drop the features with low correlations.
# errors="ignore" keeps this cell re-runnable: on a second run the columns are
# already gone from `df`, which would otherwise raise a KeyError.
df = df.drop(columns=features_to_remove, errors="ignore")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,...,capitalExpenditures,changeInReceivables,changeInInventory,surprisePercentage,grossProfit,costOfRevenue,costofGoodsAndServicesSold,incomeTaxExpense,Forward_Return,Price_Movement_Class
0,2009-03-31,2.5675,3.7805,2.3815,3.672,12370496000,3.070679,58.512097,0.136847,65.430878,...,5245475000.0,-1304035000.0,551016400.0,0.0,451000000.0,4438000000.0,3741000000.0,69000000.0,13.916127,4
1,2009-06-30,3.651,4.428,3.5855,4.183,9494416000,3.910458,55.309698,0.023607,58.383477,...,78000000.0,-16000000.0,23000000.0,0.0,1133000000.0,3808000000.0,3518000000.0,39000000.0,11.594552,4
2,2009-09-30,4.221,4.725,3.7705,4.668,8324122000,4.18242,53.239807,0.126727,53.727712,...,103000000.0,155000000.0,276000000.0,0.0,1273000000.0,4504000000.0,4176000000.0,60000000.0,44.087393,4
3,2009-12-31,4.625,7.2955,4.4135,6.726,12727852000,5.718096,63.826041,0.116955,69.004807,...,137000000.0,509000000.0,339000000.0,0.0,1977000000.0,8029000000.0,7543000000.0,85000000.0,0.92923,2
4,2010-03-31,6.8125,6.9095,5.691,6.7885,11980988000,6.306039,51.020327,0.142294,48.488807,...,140000000.0,-454000000.0,-321000000.0,0.0,1206000000.0,5918000000.0,5501000000.0,100000000.0,-19.525669,0


In [13]:
# Summarise the columns that remain after dropping the low-correlation features
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63 entries, 0 to 62
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Date                            63 non-null     object 
 1   Open                            63 non-null     float64
 2   High                            63 non-null     float64
 3   Low                             63 non-null     float64
 4   Close                           63 non-null     float64
 5   Volume                          63 non-null     int64  
 6   MA_21                           63 non-null     float64
 7   RSI                             63 non-null     float64
 8   MACD                            63 non-null     float64
 9   Stochastic_Oscillator           63 non-null     float64
 10  ATR                             63 non-null     float64
 11  Momentum_21                     63 non-null     float64
 12  OBV                             63 non

In [14]:
# Parse 'Date' as datetime and derive the calendar year used for the split
df['Date'] = pd.to_datetime(df['Date'])
df['Year'] = df['Date'].dt.year

# Two row subsets: years 2009-2022 (inclusive) and everything after 2022
in_2009_2022 = df['Year'].between(2009, 2022)
after_2022 = df['Year'] > 2022
df_2009_2022 = df[in_2009_2022]
df_2023_onwards = df[after_2022]

# Write each subset to its own CSV file, without the index column
for subset, csv_path in (
    (df_2009_2022, r"Data/FS_Classification_AMZN_Historical_Quarterly_2009_2022_With_Fundamental_Data_Economic_Indicators.csv"),
    (df_2023_onwards, r"Data/FS_Classification_AMZN_Historical_Quarterly_2023_Onwards_With_Fundamental_Data_Economic_Indicators.csv"),
):
    subset.to_csv(csv_path, index=False)
