In [1]:
import pandas as pd
import numpy as np

In [3]:
def clean_data(df):
    """
    Cleans the dataset by handling missing values and preparing it for further analysis.

    Parameters:
    df (DataFrame): The original dataset

    Returns:
    DataFrame: The cleaned dataset
    """
    # Convert the 'Date' column to datetime format if it exists in the dataset
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
    
    # Handling missing values
    # Fill missing values in RSI, %K, %R using forward fill first, and backward fill as a backup
    for col in ['RSI', '%K', '%R']:
        if col in df.columns:
            df[col].fillna(method='ffill', inplace=True)
            df[col].fillna(method='bfill', inplace=True)

    # Fill moving averages (MA20, MA50, MA100) using forward fill first, and backward fill as a backup
    for col in ['MA20', 'MA50', 'MA100']:
        if col in df.columns:
            df[col].fillna(method='ffill', inplace=True)
            df[col].fillna(method='bfill', inplace=True)

    # Fill lag features (Close_lag_1 to Close_lag_5) using backward fill as they represent past data
    lag_features = ['Close_lag_1', 'Close_lag_2', 'Close_lag_3', 'Close_lag_4', 'Close_lag_5']
    for col in lag_features:
        if col in df.columns:
            df[col].fillna(method='bfill', inplace=True)

    # Fill Bollinger Bands (BB_Middle, BB_Std_Dev, BB_Upper, BB_Lower) using forward fill first, and backward fill as a backup
    bollinger_features = ['BB_Middle', 'BB_Std_Dev', 'BB_Upper', 'BB_Lower']
    for col in bollinger_features:
        if col in df.columns:
            df[col].fillna(method='ffill', inplace=True)
            df[col].fillna(method='bfill', inplace=True)

    # Fill Ichimoku indicators using forward fill first, and backward fill as a backup
    ichimoku_features = ['tenkan_sen', 'kijun_sen', 'senkou_span_a', 'senkou_span_b']
    for col in ichimoku_features:
        if col in df.columns:
            df[col].fillna(method='ffill', inplace=True)
            df[col].fillna(method='bfill', inplace=True)

    # Fill 'Returns' with 0 as it represents no change in price
    if 'Returns' in df.columns:
        df['Returns'].fillna(0, inplace=True)

    return df




## The rationale behind the cleaning process: 
Forward Fill (ffill) for RSI, %K, %R, Moving Averages, Bollinger Bands, and Ichimoku Indicators:
These indicators are momentum or trend-based, and forward fill allows the use of the most recent computed value to fill in missing data, which is helpful for maintaining trend continuity.

Backward Fill (bfill) for Lag Features:
Lag features (Close_lag_1, Close_lag_2, etc.) represent past data, so backward filling makes sense to utilize existing previous values when missing.

Remaining Missing Values:
Some columns still contain missing values, specifically RSI, %K, %R, MA20, MA50, MA100, BB_*, and Ichimoku features. These missing values exist due to the rolling calculations used in their derivation and may require further filling or removal depending on the modeling approach.

1. Convert the 'Date' column to datetime format
2. Handle missing values in RSI, %K, %R using forward fill first, and backward fill as a backup
3. Fill moving averages (MA20, MA50, MA100) using forward fill first, and backward fill as a backup
4. Fill lag features (Close_lag_1 to Close_lag_5) using backward fill as they represent past data
5. Fill Bollinger Bands (BB_Middle, BB_Std_Dev, BB_Upper, BB_Lower) using forward fill first, and backward fill as a backup
6. Fill Ichimoku indicators using forward fill first, and backward fill as a backup
7. Fill 'Returns' with 0 as it represents no change in price


In [4]:
df = pd.read_csv('MSFT.csv')
cleaned_data = clean_data(df)

# Check if there are any missing values left after cleaning
final_missing_values = cleaned_data.isnull().sum()

# Display the count of missing values per column after final cleaning
final_missing_values


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(method='ffill', inplace=True)
  df[col].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(method='bfill', inplace=True)
  df[col].fillna(method='bfill', inplace=True)
The behavior will change in pandas 3.0. This inp

Date              0
Open              0
High              0
Low               0
Close             0
Adj Close         0
Volume            0
RSI               0
%K                0
%R                0
MA20              0
MA50              0
MA100             0
Returns           0
Close_lag_1       0
Close_lag_2       0
Close_lag_3       0
Close_lag_4       0
Close_lag_5       0
EMA_short         0
EMA_long          0
MACD              0
MACD_Signal       0
BB_Middle         0
BB_Std_Dev        0
BB_Upper          0
BB_Lower          0
tenkan_sen        0
kijun_sen         0
senkou_span_a     0
senkou_span_b     0
chikou_span      26
dtype: int64

In [5]:
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,RSI,%K,%R,...,MACD_Signal,BB_Middle,BB_Std_Dev,BB_Upper,BB_Lower,tenkan_sen,kijun_sen,senkou_span_a,senkou_span_b,chikou_span
0,1986-03-13,0.088542,0.101563,0.088542,0.097222,0.059946,1031788800,46.666922,47.05564,-52.94436,...,0.0,0.09618,0.002853,0.101886,0.090475,0.09592,0.096788,0.098307,0.105252,0.101563
1,1986-03-14,0.097222,0.102431,0.097222,0.100694,0.062087,308160000,46.666922,47.05564,-52.94436,...,5.5e-05,0.09618,0.002853,0.101886,0.090475,0.09592,0.096788,0.098307,0.105252,0.099826
2,1986-03-17,0.100694,0.103299,0.100694,0.102431,0.063158,133171200,46.666922,47.05564,-52.94436,...,0.00017,0.09618,0.002853,0.101886,0.090475,0.09592,0.096788,0.098307,0.105252,0.10026
3,1986-03-18,0.102431,0.103299,0.098958,0.099826,0.061552,67766400,46.666922,47.05564,-52.94436,...,0.000274,0.09618,0.002853,0.101886,0.090475,0.09592,0.096788,0.098307,0.105252,0.110243
4,1986-03-19,0.099826,0.100694,0.097222,0.09809,0.060482,47894400,46.666922,47.05564,-52.94436,...,0.000338,0.09618,0.002853,0.101886,0.090475,0.09592,0.096788,0.098307,0.105252,0.117188
