In [21]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.animation as animation


import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff


# Step 1: Loading Dataset

In [22]:
# Load the quarterly AMZN dataset (the "2023 onwards" file, per its name).
# Forward slashes are used in the relative path on purpose: a backslash
# followed by "F" is an invalid escape sequence in a normal string literal
# and only resolves as a directory separator on Windows.
# Alternative 2009-2022 file, kept for reference:
# df = pd.read_csv('Data/FS_Classification_AMZN_Historical_Quarterly_2009_2022_With_Fundamental_Data_Economic_Indicators.csv')
df = pd.read_csv('Data/FS_Classification_AMZN_Historical_Quarterly_2023_Onwards_With_Fundamental_Data_Economic_Indicators.csv')

# Normalise column names: strip leading/trailing spaces ...
df.columns = df.columns.str.strip()

# ... and collapse every run of whitespace into a single space
df.columns = df.columns.str.replace(r'\s+', ' ', regex=True)

# Drop columns that are not needed (reassignment instead of inplace=True so the
# statement reads as a plain transformation of the freshly loaded frame)
df = df.drop(columns=["Date", "Year"])



# Step 2: Overview of Dataset

In [23]:
# Report the row count, followed by the blank spacer lines, then the
# per-column dtype / non-null summary produced by DataFrame.info()
num_of_rows = df.shape[0]
print(f"The number of rows is {num_of_rows}", '\n', sep='\n')

df.info()

The number of rows is 7


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 36 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Open                            7 non-null      float64
 1   High                            7 non-null      float64
 2   Low                             7 non-null      float64
 3   Close                           7 non-null      float64
 4   Volume                          7 non-null      int64  
 5   MA_21                           7 non-null      float64
 6   RSI                             7 non-null      float64
 7   MACD                            7 non-null      float64
 8   Stochastic_Oscillator           7 non-null      float64
 9   ATR                             7 non-null      float64
 10  Momentum_21                     7 non-null      float64
 11  OBV                             7 non-null      int64  
 12  Cumulative_Ret

In [24]:
# Preview the first rows of the cleaned frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,ATR,...,capitalExpenditures,changeInReceivables,changeInInventory,surprisePercentage,grossProfit,costOfRevenue,costofGoodsAndServicesSold,incomeTaxExpense,Forward_Return,Price_Movement_Class
0,85.459999,114.0,81.43,103.290001,4237140900,94.574424,54.283704,1.411384,57.035177,2.624101,...,14207000000.0,1521000000.0,-371000000.0,47.619,43403000000.0,83202000000.0,67791000000.0,948000000.0,26.207764,4
1,102.300003,131.490005,97.709999,130.360001,3899624000,109.537804,62.979858,3.396109,74.22263,2.199527,...,11455000000.0,-5167000000.0,2373000000.0,85.7143,48693000000.0,84859000000.0,69373000000.0,804000000.0,-2.485423,1
2,130.820007,145.860001,123.040001,127.120003,3389452900,133.134233,53.048983,-2.481221,52.517575,2.589117,...,12479000000.0,-1304035000.0,-808000000.0,62.069,52610000000.0,89573000000.0,75022000000.0,2306000000.0,19.524858,4
3,127.279999,155.630005,118.349998,151.940002,3181680200,137.242933,57.185846,2.782032,69.167466,2.602857,...,14588000000.0,-1304035000.0,-2643000000.0,25.0,39401000000.0,129927000000.0,92553000000.0,3042000000.0,18.717916,4
4,151.539993,181.699997,144.050003,180.380005,2700335000,162.643059,59.673912,2.546693,71.302744,2.503677,...,14925000000.0,-1304035000.0,-1776000000.0,19.5122,56617000000.0,85978000000.0,72633000000.0,2467000000.0,7.134934,4


# Step 3: EDA - Missing Values Analysis 

## Step 3)i): EDA - Show Missing Values in each Column

In [25]:
def display_columns_with_null_values(df: pd.DataFrame):
    """
    Print a per-column count of null values, restricted to the columns
    that contain at least one null.

    The report always starts with a three-line banner. Below it comes either
    the null counts (largest first) or a single line stating that no column
    has null values.

    Parameters:
    - df (pd.DataFrame): The dataframe to inspect.

    Returns:
    - None: The report is written to standard output.
    """
    # Null count per column, keeping only the columns with at least one null
    null_counts = df.isna().sum()
    offenders = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)

    # The banner is the same for both outcomes, so emit it once up front
    rule = '-' * 64
    print(rule)
    print("Total null values in each column (only columns with null values)")
    print(rule)

    if offenders.empty:
        print("No columns have null values.")
        return

    print(offenders)

In [26]:
# Share of missing entries per column, expressed as a percentage and
# listed from the highest share to the lowest
null_fraction = df.isnull().mean()
null_values_percentage = (null_fraction.round(4) * 100).sort_values(ascending=False)

pct_rule = '-' * 44
print(pct_rule, "Percentage(%) of null values in each column", pct_rule, sep='\n')
print(null_values_percentage)
print('\n')

# Absolute null counts, limited to the columns that actually contain nulls
display_columns_with_null_values(df)


--------------------------------------------
Percentage(%) of null values in each column
--------------------------------------------
Open                              0.0
High                              0.0
inventory                         0.0
shortTermInvestments              0.0
otherCurrentAssets                0.0
retainedEarnings                  0.0
changeInOperatingLiabilities      0.0
changeInOperatingAssets           0.0
capitalExpenditures               0.0
changeInReceivables               0.0
changeInInventory                 0.0
surprisePercentage                0.0
grossProfit                       0.0
costOfRevenue                     0.0
costofGoodsAndServicesSold        0.0
incomeTaxExpense                  0.0
Forward_Return                    0.0
cashAndShortTermInvestments       0.0
Unemployment_Rate_Quarterly       0.0
Retail_Sales_Quarterly            0.0
Stochastic_Oscillator             0.0
Low                               0.0
Close                         

## Step 3)ii): EDA - Handling Missing Values

In [27]:
# Mean-impute every numeric column; non-numeric columns are left untouched.
# NOTE(review): the numeric selection is not restricted to features, so a
# numeric label column such as Price_Movement_Class would also be mean-filled
# if it ever contained nulls — confirm that this is intended.
numeric_cols = df.select_dtypes(include=[np.number]).columns
numeric_df = df[numeric_cols].fillna(df[numeric_cols].mean())
df[numeric_cols] = numeric_df  # write the imputed values back into the full frame

# Re-run the null report to confirm nothing is missing any more
display_columns_with_null_values(df)


----------------------------------------------------------------
Total null values in each column (only columns with null values)
----------------------------------------------------------------
No columns have null values.


# Step 4: EDA - Duplicate Values Analysis 

## Step 4)i): EDA - Show Duplicate Rows

In [28]:
# Duplicate-row statistics: absolute count and its share of all rows
total_rows = df.shape[0]
duplicate_rows = df.duplicated().sum()
duplicate_percentage = (duplicate_rows / total_rows) * 100

# Share of duplicated rows, followed by the blank spacer lines
pct_banner = '-' * 48
print(
    pct_banner,
    "Percentage(%) of duplicate rows in the DataFrame",
    pct_banner,
    f"{duplicate_percentage:.2f}%",
    '\n',
    sep='\n',
)

# Absolute number of duplicated rows
count_banner = '-' * 30
print(count_banner, "Total number of duplicate rows", count_banner, duplicate_rows, sep='\n')


------------------------------------------------
Percentage(%) of duplicate rows in the DataFrame
------------------------------------------------
0.00%


------------------------------
Total number of duplicate rows
------------------------------
0


# Step 5: EDA - Backtesting

In [29]:
import joblib
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

# Load the saved model.
# NOTE: joblib.load unpickles arbitrary Python objects — only load model
# artifacts that come from a trusted source.
joblib_file = "Model/random_forest_model_pipeline.joblib"
loaded_model = joblib.load(joblib_file)


# Extract features and actual labels.
# The prediction column written at the end of this cell is excluded as well
# (errors='ignore' covers the first run, when it does not exist yet), so that
# re-running the cell does not feed earlier predictions back into the model.
features = df.drop(
    columns=['Price_Movement_Class', 'Price_Movement_Class_Prediction'],
    errors='ignore',
)
actual_labels = df['Price_Movement_Class']

# NOTE(review): 'Forward_Return' stays in the feature set; if the class label
# is derived from it, this leaks the target — confirm against the columns the
# pipeline was trained on before changing it.

# Make predictions using the loaded model
predicted_labels = loaded_model.predict(features)

# Add predictions to the original DataFrame as a new column
df['Price_Movement_Class_Prediction'] = predicted_labels

# Show actual vs. predicted class side by side (up to 15 rows)
df.head(15)

Unnamed: 0,Open,High,Low,Close,Volume,MA_21,RSI,MACD,Stochastic_Oscillator,ATR,...,changeInReceivables,changeInInventory,surprisePercentage,grossProfit,costOfRevenue,costofGoodsAndServicesSold,incomeTaxExpense,Forward_Return,Price_Movement_Class,Price_Movement_Class_Prediction
0,85.459999,114.0,81.43,103.290001,4237140900,94.574424,54.283704,1.411384,57.035177,2.624101,...,1521000000.0,-371000000.0,47.619,43403000000.0,83202000000.0,67791000000.0,948000000.0,26.207764,4,4
1,102.300003,131.490005,97.709999,130.360001,3899624000,109.537804,62.979858,3.396109,74.22263,2.199527,...,-5167000000.0,2373000000.0,85.7143,48693000000.0,84859000000.0,69373000000.0,804000000.0,-2.485423,1,0
2,130.820007,145.860001,123.040001,127.120003,3389452900,133.134233,53.048983,-2.481221,52.517575,2.589117,...,-1304035000.0,-808000000.0,62.069,52610000000.0,89573000000.0,75022000000.0,2306000000.0,19.524858,4,4
3,127.279999,155.630005,118.349998,151.940002,3181680200,137.242933,57.185846,2.782032,69.167466,2.602857,...,-1304035000.0,-2643000000.0,25.0,39401000000.0,129927000000.0,92553000000.0,3042000000.0,18.717916,4,4
4,151.539993,181.699997,144.050003,180.380005,2700335000,162.643059,59.673912,2.546693,71.302744,2.503677,...,-1304035000.0,-1776000000.0,19.5122,56617000000.0,85978000000.0,72633000000.0,2467000000.0,7.134934,4,4
5,180.789993,199.839996,166.320007,193.25,2622598800,182.068685,53.876961,2.60801,60.945101,2.844399,...,-1304035000.0,3085000000.0,22.3301,59553000000.0,88286000000.0,73785000000.0,1767000000.0,-3.580853,1,0
6,193.490005,201.199997,151.610001,186.330002,2604113800,182.434695,50.993386,3.399506,57.869881,3.823794,...,-1304035000.0,551016400.0,32.289969,22049190000.0,38655210000.0,33353480000.0,386338700.0,7.778962,0,4


In [30]:
# Calculate performance metrics for multi-class classification.
# zero_division=0 makes the handling of undefined per-class scores explicit
# (e.g. precision for a class that is never predicted): they count as 0,
# which is the same value the default produces, but without relying on a
# warning that this notebook silences globally.
accuracy = accuracy_score(actual_labels, predicted_labels)
precision = precision_score(actual_labels, predicted_labels, average='macro', zero_division=0)  # Macro average over classes
recall = recall_score(actual_labels, predicted_labels, average='macro', zero_division=0)
conf_matrix = confusion_matrix(actual_labels, predicted_labels)

# Display the results.
# The scalar metrics above are not printed separately: the classification
# report below already lists accuracy and the macro averages.
# Confusion matrix layout: rows are actual classes, columns are predicted
# classes, in sorted label order.
print('Confusion Matrix:')
print(conf_matrix)
print('\nClassification Report:')
print(classification_report(actual_labels, predicted_labels, zero_division=0))


Confusion Matrix:
[[0 0 1]
 [2 0 0]
 [0 0 4]]

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         2
           4       0.80      1.00      0.89         4

    accuracy                           0.57         7
   macro avg       0.27      0.33      0.30         7
weighted avg       0.46      0.57      0.51         7

