In [1]:
#Create a DataFrame
import yfinance as yf
import pandas as pd
from datetime import datetime, timedelta

# Ticker universe: ten S&P 100 constituents (edit this list to change it)
sp100_companies = [
    # Apple, Microsoft, Amazon, Alphabet (Google), Meta (Facebook)
    'AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META',
    # Johnson & Johnson, JPMorgan Chase, Visa, Procter & Gamble, Disney
    'JNJ', 'JPM', 'V', 'PG', 'DIS',
]

# Download window: the 3 * 365 days ending at the moment this cell runs
end_date = datetime.today()
start_date = end_date - 3 * timedelta(days=365)

# Function to fetch data for a single stock
def fetch_stock_data(ticker, start_date, end_date):
    """Download price history for a single ticker via yfinance.

    Args:
        ticker: Symbol handed to ``yf.Ticker``.
        start_date: Start of the window, forwarded as ``history(start=...)``.
        end_date: End of the window, forwarded as ``history(end=...)``.

    Returns:
        The DataFrame returned by ``Ticker.history`` with an extra ``Ticker``
        column set to ``ticker``, or ``None`` when the download raised or
        produced no rows. A message is printed in both failure cases.
    """
    try:
        df = yf.Ticker(ticker).history(start=start_date, end=end_date)

        # A frame with no rows carries no usable prices (yfinance may hand one
        # back instead of raising, e.g. for an unknown symbol). Report it as a
        # failed fetch so callers filtering on `is not None` skip it.
        if df is None or df.empty:
            print(f"Error fetching data for {ticker}: no data returned")
            return None

        df['Ticker'] = ticker  # tag rows so frames can be concatenated later
        return df
    except Exception as e:
        # Best-effort download: report the problem and let the caller
        # continue with the remaining tickers.
        print(f"Error fetching data for {ticker}: {e}")
        return None

# Download each ticker in turn; a None result marks a failed fetch and is skipped
all_data = []
for ticker in sp100_companies:
    print(f"Fetching data for {ticker}...")
    data = fetch_stock_data(ticker, start_date, end_date)
    if data is None:
        continue
    all_data.append(data)

if not all_data:
    # Nothing came back for any ticker
    print("No data was fetched. Please check your internet connection or ticker symbols.")
else:
    # Stack the per-ticker frames into one long DataFrame
    combined_df = pd.concat(all_data)

    # Persist to CSV (the index is written as the first column)
    filename = "top10_sp100_companies.csv"
    combined_df.to_csv(filename)
    print(f"\nData successfully saved to {filename}")

    # Show the first few rows as a sanity check
    print("\nSample of the fetched data:")
    print(combined_df.head())

Fetching data for AAPL...
Fetching data for MSFT...
Fetching data for AMZN...
Fetching data for GOOGL...
Fetching data for META...
Fetching data for JNJ...
Fetching data for JPM...
Fetching data for V...
Fetching data for PG...
Fetching data for DIS...

Data successfully saved to top10_sp100_companies.csv

Sample of the fetched data:
                                 Open        High         Low       Close  \
Date                                                                        
2022-04-18 00:00:00-04:00  161.306270  163.943545  160.961860  162.437943   
2022-04-19 00:00:00-04:00  162.388749  165.144105  161.296447  164.730789   
2022-04-20 00:00:00-04:00  166.069110  166.187207  163.451535  164.563507   
2022-04-21 00:00:00-04:00  166.216690  168.794908  163.264525  163.766388   
2022-04-22 00:00:00-04:00  163.805801  165.193307  158.924882  159.210251   

                             Volume  Dividends  Stock Splits Ticker  
Date                                                  

In [2]:
#Preprocess data to train the model
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def preprocess_data(df, ticker):
    """Build technical-indicator features for one ticker and min-max scale them.

    Args:
        df: DataFrame containing at least ``Ticker``, ``Close`` and ``Volume``
            columns. Rows for each ticker are assumed to already be in
            chronological order; nothing here sorts them.
        ticker: Value of the ``Ticker`` column whose rows are selected.

    Returns:
        A tuple ``(scaled_data, stock_df, scaler)``:

        * ``scaled_data`` -- result of ``MinMaxScaler.fit_transform`` on the
          feature columns, in the order Close, Volume, Daily_Return, MA_5,
          MA_20, Upper_Bollinger, Lower_Bollinger, RSI.
        * ``stock_df`` -- copy of the ticker's rows with the indicator columns
          added and rows lacking any feature value removed.
        * ``scaler`` -- the fitted ``MinMaxScaler``.

    Raises:
        ValueError: If ``df`` has no rows for ``ticker``, or if no row has a
            complete set of features (e.g. fewer than 20 rows of history).
    """
    # Work on a copy so the caller's frame is never modified
    stock_df = df[df['Ticker'] == ticker].copy()
    if stock_df.empty:
        raise ValueError(f"No rows found for ticker {ticker!r}")

    close = stock_df['Close']

    # Day-over-day percentage change of the close
    stock_df['Daily_Return'] = close.pct_change()

    # Simple moving averages over 5 and 20 rows
    stock_df['MA_5'] = close.rolling(window=5).mean()
    stock_df['MA_20'] = close.rolling(window=20).mean()

    # Bollinger Bands: 20-row mean +/- 2 rolling standard deviations
    # (the rolling std is computed once and reused for both bands)
    band_width = 2 * close.rolling(window=20).std()
    stock_df['Upper_Bollinger'] = stock_df['MA_20'] + band_width
    stock_df['Lower_Bollinger'] = stock_df['MA_20'] - band_width

    # RSI from 14-row simple averages of gains and losses.
    # NOTE: a window whose average loss is zero makes rs infinite (RSI 100)
    # or, if the average gain is zero too, NaN -- NaN rows are dropped below.
    delta = close.diff()
    gain = delta.where(delta > 0, 0).rolling(window=14).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
    rs = gain / loss
    stock_df['RSI'] = 100 - (100 / (1 + rs))

    features = ['Close', 'Volume', 'Daily_Return', 'MA_5', 'MA_20',
                'Upper_Bollinger', 'Lower_Bollinger', 'RSI']

    # Drop only rows missing a feature value (the indicator warm-up rows).
    # Restricting to `features` keeps NaNs in unrelated columns from
    # discarding otherwise usable rows.
    stock_df = stock_df.dropna(subset=features)
    if stock_df.empty:
        raise ValueError(
            f"Not enough data for ticker {ticker!r} to compute all features"
        )

    # Scale every feature column to the scaler's default range
    scaler = MinMaxScaler()
    scaled_data = scaler.fit_transform(stock_df[features])

    return scaled_data, stock_df, scaler

In [3]:
# Main: reload the CSV written by the download cell
df = pd.read_csv('top10_sp100_companies.csv')

# Engineer and scale the features for a single ticker
scaled_data, stock_df, scaler = preprocess_data(df=df, ticker='GOOGL')

# Preview the first 20 processed rows
stock_df.iloc[:20]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Ticker,Daily_Return,MA_5,MA_20,Upper_Bollinger,Lower_Bollinger,RSI
2272,2022-05-13 00:00:00-04:00,113.988033,117.314134,113.064454,115.498314,35038000,0.0,0.0,GOOGL,0.028415,113.338731,118.267518,128.976519,107.558517,41.63142
2273,2022-05-16 00:00:00-04:00,114.408513,115.617235,113.347591,113.900444,25990000,0.0,0.0,GOOGL,-0.013835,113.723691,117.609091,127.637237,107.580944,44.628934
2274,2022-05-17 00:00:00-04:00,116.284544,116.343763,114.323423,115.918793,23054000,0.0,0.0,GOOGL,0.01772,114.137314,116.93551,125.304136,108.566884,52.958495
2275,2022-05-18 00:00:00-04:00,114.452815,114.850913,111.0247,111.367065,35126000,0.0,0.0,GOOGL,-0.039267,113.798335,116.132325,123.249633,109.015018,41.089266
2276,2022-05-19 00:00:00-04:00,110.901289,112.472277,109.476597,109.858772,34144000,0.0,0.0,GOOGL,-0.013543,113.308678,115.414234,121.971519,108.856949,44.563777
2277,2022-05-20 00:00:00-04:00,111.418314,111.649713,105.293102,108.389793,48962000,0.0,0.0,GOOGL,-0.013372,111.886974,114.880411,121.907377,107.853445,38.464904
2278,2022-05-23 00:00:00-04:00,109.066065,111.613882,108.223589,110.95752,37184000,0.0,0.0,GOOGL,0.02369,111.298389,114.303868,120.551501,108.056234,41.671665
2279,2022-05-24 00:00:00-04:00,105.268726,105.95196,101.399719,105.465782,76780000,0.0,0.0,GOOGL,-0.049494,109.207787,113.672885,120.799633,106.546137,27.17582
2280,2022-05-25 00:00:00-04:00,104.485955,105.993252,103.362329,105.301559,40258000,0.0,0.0,GOOGL,-0.001557,107.994685,113.250429,121.299734,105.201124,32.223617
2281,2022-05-26 00:00:00-04:00,105.174167,108.132519,104.724314,107.279602,37948000,0.0,0.0,GOOGL,0.018785,107.478851,112.716482,120.866925,104.566039,37.304481
