Here's a snippet of code to pull data from yfinance and clean it

In [1]:
import yfinance as yf
import pandas as pd

def get_and_clean_data(ticker):
    try:
        # Download stock data
        data = yf.download(ticker, start="2010-01-01", end="2023-12-31")
        
        # Check if the dataframe is empty (happens if the ticker isn't valid or no data available)
        if data.empty:
            raise ValueError(f"No data found for ticker '{ticker}'. It might not be traded anymore.")
        
        # Fill missing values
        data.fillna(method='ffill', inplace=True)  # Forward fill
        data.fillna(method='bfill', inplace=True)  # Backward fill if any NaN remain
    
        # Detect outliers using IQR and cap/floor them
        for column in data.columns:
            Q1 = data[column].quantile(0.25)
            Q3 = data[column].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            
            # Capping the upper outliers and flooring the lower outliers
            data[column] = data[column].clip(lower=lower_bound, upper=upper_bound)

        return data
    except ValueError as e:
        print(e)
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
ticker = 'AAPL'  # Apple Inc.
cleaned_data = get_and_clean_data(ticker)
if cleaned_data is not None:
    print(cleaned_data.head())


[*********************100%%**********************]  1 of 1 completed

                Open      High       Low     Close  Adj Close     Volume
Date                                                                    
2010-01-04  7.622500  7.660714  7.585000  7.643214   6.461977  493729600
2010-01-05  7.664286  7.699643  7.616071  7.656429   6.473149  601904800
2010-01-06  7.656429  7.686786  7.526786  7.534643   6.370185  552160000
2010-01-07  7.562500  7.571429  7.466071  7.520714   6.358408  477131200
2010-01-08  7.510714  7.571429  7.466429  7.570714   6.400682  447610800



  data.fillna(method='ffill', inplace=True)  # Forward fill
  data.fillna(method='bfill', inplace=True)  # Backward fill if any NaN remain


Here's the same approach utilizing the same dataset I provided in my original capstone proposal

In [None]:
import pandas as pd

def load_and_clean_data(file_path):
    # Load data from CSV file
    data = pd.read_csv(file_path, parse_dates=['Date'], index_col='Date')
    
    # Check for any missing values and fill them
    data.fillna(method='ffill', inplace=True)  # Forward fill
    data.fillna(method='bfill', inplace=True)  # Backward fill if any NaN remain
    
    # Detect outliers using IQR and cap/floor them
    for column in data.columns:
        Q1 = data[column].quantile(0.25)
        Q3 = data[column].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        # Capping the upper outliers and flooring the lower outliers
        data[column] = data[column].clip(lower=lower_bound, upper=upper_bound)
    
    return data

file_path = r'C:\Users\Hunter\Documents\ucsd\capstone\threeDatasets\stockPrediction\Stocks'  # Modify this path to where your file is stored
cleaned_data = load_and_clean_data(file_path)
print(cleaned_data.head())
