In [2]:
pip install pykalman

Collecting pykalman
  Downloading pykalman-0.9.7-py2.py3-none-any.whl.metadata (5.5 kB)
Downloading pykalman-0.9.7-py2.py3-none-any.whl (251 kB)
Installing collected packages: pykalman
Successfully installed pykalman-0.9.7
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from pykalman import KalmanFilter

file_path = "/Users/wenjing/Downloads/Capstone/df_combined_with_return.csv"
df_combined = pd.read_csv(file_path)
len(df_combined)

9477

In [8]:
df_combined['return'] = df_combined['return']/100

In [17]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from pykalman import KalmanFilter
from sklearn.preprocessing import LabelEncoder

# List of stock symbols
stocks = ['AMD', 'NVDA', 'TSLA', 'AAPL', 'BA', 'MSFT', 'GOOG', 'INTC', 'MRK']

# Define a function to apply Kalman filtering for missing data
def apply_kalman_filter(series):
    kf = KalmanFilter(initial_state_mean=0, n_dim_obs=1)
    state_means, _ = kf.em(series, n_iter=10).filter(series)
    return pd.Series(state_means.flatten(), index=series.index)

# Function to fit ARIMA model and calculate RMSE
def fit_arima_and_calculate_rmse(data, exog_cols, target_col='return'):
    # Encode 'sentiment_label' if it is in the exog_cols
    if 'sentiment_label' in exog_cols:
        le = LabelEncoder()
        data['sentiment_label'] = le.fit_transform(data['sentiment_label'].astype(str))

    # Fill missing values using Kalman Filter for each exogenous column
    for col in exog_cols:
        if data[col].dtype in ['float64', 'int64']:  # Apply only to numeric columns
            data[col] = apply_kalman_filter(data[col])

    # Drop rows where target variable is missing
    data = data.dropna(subset=[target_col])

    # Prepare exogenous data
    exog_data = data[exog_cols]

    # Split data into training and test sets
    train_size = int(0.8 * len(data))
    train, test = data[target_col][:train_size], data[target_col][train_size:]
    exog_train, exog_test = exog_data[:train_size], exog_data[train_size:]

    # Fit ARIMA model with exogenous variables
    model = ARIMA(train, exog=exog_train, order=(2,1,2))
    model_fit = model.fit()

    # Make predictions on the test set
    predictions = model_fit.predict(start=train_size, end=len(data)-1, exog=exog_test)

    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(test, predictions))
    return rmse

# Initialize dictionaries to store RMSE values for each stock
rmse_with_sentiment = {}
rmse_without_sentiment = {}

# Loop through each stock symbol
for stock in stocks:
    print(f"Processing stock: {stock}")

    # Filter data for the current stock
    df_stock = df_combined[df_combined['Stock_symbol'] == stock].copy()
    
    # Ensure 'date' is in datetime format and sort by date
#     df_stock['date'] = pd.to_datetime(df_stock['date'])
    df_stock.set_index('date', inplace=True)
#     df_stock.index = pd.to_datetime(df_stock.index)
    df_stock.index = pd.DatetimeIndex(df_stock.index).to_period('D')

    df_stock = df_stock.sort_values('date')

    # Define columns with and without sentiment features
    cols_with_sentiment = ['open', 'volume', 'sentiment_label', 'sentiment_score']
    cols_without_sentiment = ['open', 'volume']

    # Calculate RMSE with sentiment features
    rmse_with = fit_arima_and_calculate_rmse(df_stock, exog_cols=cols_with_sentiment)
    rmse_with_sentiment[stock] = rmse_with

    # Calculate RMSE without sentiment features
    rmse_without = fit_arima_and_calculate_rmse(df_stock, exog_cols=cols_without_sentiment)
    rmse_without_sentiment[stock] = rmse_without

    # Print results
    print(f"Stock: {stock} - RMSE with sentiment: {rmse_with:.4f}, RMSE without sentiment: {rmse_without:.4f}")

# Display comparison of RMSE values
print("\nRMSE Comparison:")
for stock in stocks:
    print(f"{stock} - With Sentiment: {rmse_with_sentiment[stock]:.4f}, Without Sentiment: {rmse_without_sentiment[stock]:.4f}")


Processing stock: AMD




Stock: AMD - RMSE with sentiment: 0.2333, RMSE without sentiment: 0.2528
Processing stock: NVDA


  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


Stock: NVDA - RMSE with sentiment: 0.1461, RMSE without sentiment: 0.1303
Processing stock: TSLA




Stock: TSLA - RMSE with sentiment: 0.0681, RMSE without sentiment: 0.0542
Processing stock: AAPL




Stock: AAPL - RMSE with sentiment: 0.0400, RMSE without sentiment: 0.0277
Processing stock: BA




Stock: BA - RMSE with sentiment: 0.1023, RMSE without sentiment: 0.0809
Processing stock: MSFT




Stock: MSFT - RMSE with sentiment: 0.0550, RMSE without sentiment: 0.0276
Processing stock: GOOG




Stock: GOOG - RMSE with sentiment: 0.0167, RMSE without sentiment: 0.0169
Processing stock: INTC




Stock: INTC - RMSE with sentiment: 0.0839, RMSE without sentiment: 0.0916
Processing stock: MRK




Stock: MRK - RMSE with sentiment: 0.2507, RMSE without sentiment: 0.2707

RMSE Comparison:
AMD - With Sentiment: 0.2333, Without Sentiment: 0.2528
NVDA - With Sentiment: 0.1461, Without Sentiment: 0.1303
TSLA - With Sentiment: 0.0681, Without Sentiment: 0.0542
AAPL - With Sentiment: 0.0400, Without Sentiment: 0.0277
BA - With Sentiment: 0.1023, Without Sentiment: 0.0809
MSFT - With Sentiment: 0.0550, Without Sentiment: 0.0276
GOOG - With Sentiment: 0.0167, Without Sentiment: 0.0169
INTC - With Sentiment: 0.0839, Without Sentiment: 0.0916
MRK - With Sentiment: 0.2507, Without Sentiment: 0.2707




In [10]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA

# Assuming your DataFrame is 'df_combined' with columns as specified

# List of stock symbols
stocks = ['AMD', 'NVDA', 'TSLA', 'AAPL', 'BA', 'MSFT', 'GOOG', 'INTC', 'MRK']

# Loop through each stock symbol
for stock in stocks:
    print(f"Processing stock: {stock}")

    # Filter data for the current stock
    df_stock = df_combined[df_combined['Stock_symbol'] == stock].copy()
    
    # Ensure 'date' is in datetime format and sort by date
    df_stock['date'] = pd.to_datetime(df_stock['date'])
    df_stock = df_stock.sort_values('date')
    
    # Set the date as the index for ARIMA modeling
    df_stock.set_index('date', inplace=True)
    
    # Use 'open' and any exogenous variables to predict 'return'
    exog_cols = ['open', 'volume']  # Adjust if you include sentiment features

    # Fit ARIMA model on 'return' column with exogenous features
    model = ARIMA(df_stock['return'], exog=df_stock[exog_cols], order=(1, 1, 1))
    model_fit = model.fit()

    # Predict returns for the last 10 examples in the test set
    test_data = df_stock.iloc[-10:]
    predicted_returns = model_fit.predict(start=len(df_stock)-10, end=len(df_stock)-1, exog=test_data[exog_cols])

    # Calculate the predicted price from the predicted return and open price
    test_data['predicted_return'] = predicted_returns
    test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])

    # Show the predicted price, corresponding close price, and open price
    print(test_data[['open', 'predicted_return', 'predicted_price', 'close']])


Processing stock: AMD


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                  open  predicted_return  predicted_price       close
date                                                                 
2023-12-14  138.889999         -0.015807       136.694598  138.000000
2023-12-15  139.520004         -0.014051       137.559619  139.149994
2023-12-18  139.149994         -0.006101       138.301066  138.899994
2023-12-19  138.570007          0.003798       139.096349  140.149994
2023-12-20  139.000000          0.003599       139.500228  135.470001
2023-12-21  138.139999         -0.003529       137.652453  139.910004
2023-12-22  140.479996         -0.020295       137.628895  139.600006
2023-12-26  140.070007         -0.005304       139.327109  143.410004
2023-12-27  144.720001         -0.029419       140.462501  146.070007
2023-12-28  146.800003         -0.022704       143.466991  148.759995
Processing stock: NVDA
                  open  predicted_return  predicted_price       close
date                                                               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

                  open  predicted_return  predicted_price       close
date                                                                 
2023-12-04  235.750000          0.015883       239.494300  235.580002
2023-12-05  233.869995          0.007267       235.569511  238.720001
2023-12-06  242.919998         -0.013632       239.608559  239.369995
2023-12-07  241.550003         -0.005834       240.140792  242.639999
2023-12-08  240.270004          0.005462       241.582478  243.839996
2023-12-11  242.740005          0.006163       244.235980  239.740005
2023-12-12  238.550003          0.006991       240.217639  237.009995
2023-12-13  234.190002          0.004648       235.278424  239.289993
2023-12-14  241.220001         -0.007903       239.313650  251.050003
2023-12-15  251.210007         -0.000684       251.038055  253.500000
Processing stock: AAPL
                  open  predicted_return  predicted_price       close
date                                                               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

                  open  predicted_return  predicted_price       close
date                                                                 
2023-11-30  225.000000         -0.000804       224.818990  231.630005
2023-12-01  231.770004          0.002214       232.283186  233.869995
2023-12-04  231.300003          0.009386       233.470912  234.869995
2023-12-05  233.539993          0.007359       235.258623  234.160004
2023-12-06  234.779999          0.002660       235.404453  236.889999
2023-12-07  236.899994          0.001731       237.310138  237.330002
2023-12-11  243.500000         -0.012098       240.554214  248.080002
2023-12-12  247.949997         -0.004309       246.881503  248.630005
2023-12-13  249.100006          0.000135       249.133529  250.910004
2023-12-14  250.910004          0.001219       251.215935  256.239990
Processing stock: MSFT
                  open  predicted_return  predicted_price       close
date                                                               

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                  open  predicted_return  predicted_price       close
date                                                                 
2023-12-04  131.294006         -0.006376       130.456861  130.630005
2023-12-05  130.369995         -0.004501       129.783176  132.389999
2023-12-06  132.899994          0.001357       133.080343  131.429993
2023-12-07  136.600006         -0.005871       135.798027  138.449997
2023-12-08  135.660004          0.001974       135.927847  136.639999
2023-12-11  133.820007          0.004131       134.372881  134.699997
2023-12-12  133.270004          0.004553       133.876783  133.639999
2023-12-13  134.544998          0.004378       135.134037  133.970001
2023-12-14  134.770004          0.001068       134.913986  133.199997
2023-12-15  132.919998         -0.008853       131.743246  133.839996
Processing stock: INTC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


                 open  predicted_return  predicted_price      close
date                                                               
2023-12-04  43.299999          0.007640        43.630815  42.349998
2023-12-05  41.910000          0.007619        42.229309  41.919998
2023-12-06  42.480000         -0.006498        42.203959  41.270000
2023-12-07  41.750000         -0.009602        41.349115  42.150002
2023-12-08  41.840000         -0.003092        41.710623  42.700001
2023-12-11  43.160000         -0.004187        42.979271  44.540001
2023-12-12  44.360001          0.003949        44.535171  44.040001
2023-12-13  44.080002          0.004407        44.274262  44.570000
2023-12-14  45.009998         -0.007910        44.653960  45.180000
2023-12-15  45.939999         -0.012758        45.353908  46.160000
Processing stock: MRK
                  open  predicted_return  predicted_price       close
date                                                                 
2023-12-01  102.849998

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_return'] = predicted_returns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_price'] = test_data['open'] * (1 + test_data['predicted_return'])


In [11]:
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import accuracy_score
from pykalman import KalmanFilter
from sklearn.preprocessing import LabelEncoder

# List of stock symbols
stocks = ['AMD', 'NVDA', 'TSLA', 'AAPL', 'BA', 'MSFT', 'GOOG', 'INTC', 'MRK']

# Define a function to apply Kalman filtering for missing data
def apply_kalman_filter(series):
    kf = KalmanFilter(initial_state_mean=0, n_dim_obs=1)
    state_means, _ = kf.em(series, n_iter=10).filter(series)
    return pd.Series(state_means.flatten(), index=series.index)

# Function to fit ARIMA model and calculate movement prediction accuracy
def fit_arima_and_calculate_accuracy(data, exog_cols, target_col='return'):
    # Encode 'sentiment_label' if it is in the exog_cols
    if 'sentiment_label' in exog_cols:
        le = LabelEncoder()
        data['sentiment_label'] = le.fit_transform(data['sentiment_label'].astype(str))

    # Fill missing values using Kalman Filter for each exogenous column
    for col in exog_cols:
        if data[col].dtype in ['float64', 'int64']:  # Apply only to numeric columns
            data[col] = apply_kalman_filter(data[col])

    # Drop rows where target variable is missing
    data = data.dropna(subset=[target_col])

    # Prepare exogenous data
    exog_data = data[exog_cols]

    # Split data into training and test sets
    train_size = int(0.8 * len(data))
    train, test = data[target_col][:train_size], data[target_col][train_size:]
    exog_train, exog_test = exog_data[:train_size], exog_data[train_size:]

    # Fit ARIMA model with exogenous variables
    model = ARIMA(train, exog=exog_train, order=(1, 1, 1))
    model_fit = model.fit()

    # Make predictions on the test set
    predicted_returns = model_fit.predict(start=train_size, end=len(data)-1, exog=exog_test)

    # Convert predicted returns to "up" or "down" based on sign
    predicted_movement = np.where(predicted_returns > 0, 'up', 'down')
    
    # Compare with actual movement in the test set and calculate accuracy
    actual_movement = data['movement'][train_size:]
    accuracy = accuracy_score(actual_movement, predicted_movement)

    return accuracy

# Initialize dictionaries to store accuracy values for each stock
accuracy_with_sentiment = {}
accuracy_without_sentiment = {}

# Loop through each stock symbol
for stock in stocks:
    print(f"Processing stock: {stock}")

    # Filter data for the current stock
    df_stock = df_combined[df_combined['Stock_symbol'] == stock].copy()
    
    # Ensure 'date' is in datetime format and sort by date
    df_stock.set_index('date', inplace=True)
    df_stock.index = pd.DatetimeIndex(df_stock.index).to_period('D')

    df_stock = df_stock.sort_values('date')

    # Define columns with and without sentiment features
    cols_with_sentiment = ['open', 'volume', 'sentiment_label', 'sentiment_score']
    cols_without_sentiment = ['open', 'volume']

    # Calculate accuracy with sentiment features
    accuracy_with = fit_arima_and_calculate_accuracy(df_stock, exog_cols=cols_with_sentiment)
    accuracy_with_sentiment[stock] = accuracy_with

    # Calculate accuracy without sentiment features
    accuracy_without = fit_arima_and_calculate_accuracy(df_stock, exog_cols=cols_without_sentiment)
    accuracy_without_sentiment[stock] = accuracy_without

    # Print results
    print(f"Stock: {stock} - Accuracy with sentiment: {accuracy_with:.4f}, Accuracy without sentiment: {accuracy_without:.4f}")

# Display comparison of accuracy values
print("\nAccuracy Comparison:")
for stock in stocks:
    print(f"{stock} - With Sentiment: {accuracy_with_sentiment[stock]:.4f}, Without Sentiment: {accuracy_without_sentiment[stock]:.4f}")


Processing stock: AMD




Stock: AMD - Accuracy with sentiment: 0.4911, Accuracy without sentiment: 0.4911
Processing stock: NVDA




Stock: NVDA - Accuracy with sentiment: 0.5299, Accuracy without sentiment: 0.5299
Processing stock: TSLA




Stock: TSLA - Accuracy with sentiment: 0.4512, Accuracy without sentiment: 0.4634
Processing stock: AAPL




Stock: AAPL - Accuracy with sentiment: 0.5128, Accuracy without sentiment: 0.5000
Processing stock: BA




Stock: BA - Accuracy with sentiment: 0.4891, Accuracy without sentiment: 0.4946
Processing stock: MSFT




Stock: MSFT - Accuracy with sentiment: 0.5181, Accuracy without sentiment: 0.5181
Processing stock: GOOG




Stock: GOOG - Accuracy with sentiment: 0.4362, Accuracy without sentiment: 0.4362
Processing stock: INTC




Stock: INTC - Accuracy with sentiment: 0.4676, Accuracy without sentiment: 0.4710
Processing stock: MRK




Stock: MRK - Accuracy with sentiment: 0.4783, Accuracy without sentiment: 0.4783

Accuracy Comparison:
AMD - With Sentiment: 0.4911, Without Sentiment: 0.4911
NVDA - With Sentiment: 0.5299, Without Sentiment: 0.5299
TSLA - With Sentiment: 0.4512, Without Sentiment: 0.4634
AAPL - With Sentiment: 0.5128, Without Sentiment: 0.5000
BA - With Sentiment: 0.4891, Without Sentiment: 0.4946
MSFT - With Sentiment: 0.5181, Without Sentiment: 0.5181
GOOG - With Sentiment: 0.4362, Without Sentiment: 0.4362
INTC - With Sentiment: 0.4676, Without Sentiment: 0.4710
MRK - With Sentiment: 0.4783, Without Sentiment: 0.4783




In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# List of stock symbols
stocks = ['AMD', 'NVDA', 'TSLA', 'AAPL', 'BA', 'MSFT', 'GOOG', 'INTC', 'MRK']

# Initialize dictionaries to store accuracy scores
accuracy_with_sentiment = {}
accuracy_without_sentiment = {}

# Define a function to train and evaluate using only basic features ['open', 'volume']
def model_without_sentiment(df_stock):
    features = ['open', 'volume']
    X = df_stock[features]
    y = df_stock['movement']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Predict and calculate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred, target_names=["Down", "Up"])

# Define a function to train and evaluate using additional sentiment features
def model_with_sentiment(df_stock):
    features = ['open', 'volume', 'sentiment_label', 'sentiment_score']
    X = df_stock[features]
    y = df_stock['movement']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Train the model
    model = LogisticRegression()
    model.fit(X_train, y_train)
    
    # Predict and calculate accuracy
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return accuracy, classification_report(y_test, y_pred, target_names=["Down", "Up"])

# Loop through each stock and calculate accuracy for each feature set
for stock in stocks:
    print(f"\nProcessing stock: {stock}")
    
    # Filter data for the current stock
    df_stock = df_combined[df_combined['Stock_symbol'] == stock].copy()
    
    # Ensure 'date' is in datetime format and sort by date
    df_stock['date'] = pd.to_datetime(df_stock['date'])
    df_stock = df_stock.sort_values('date')
    
    # Create the target variable 'movement' based on 'return' (1 for "up", 0 for "down")
    df_stock['movement'] = np.where(df_stock['return'] > 0, 1, 0)
    
    # Encode the sentiment label
    le = LabelEncoder()
    df_stock['sentiment_label'] = le.fit_transform(df_stock['sentiment_label'].astype(str))

    # Calculate accuracy without sentiment features
    accuracy_wo, report_wo = model_without_sentiment(df_stock)
    accuracy_without_sentiment[stock] = accuracy_wo
    print(f"Accuracy without sentiment for {stock}: {accuracy_wo:.2%}")
    print("Classification Report without Sentiment:")
    print(report_wo)

    # Calculate accuracy with sentiment features
    accuracy_with, report_with = model_with_sentiment(df_stock)
    accuracy_with_sentiment[stock] = accuracy_with
    print(f"Accuracy with sentiment for {stock}: {accuracy_with:.2%}")
    print("Classification Report with Sentiment:")
    print(report_with)

# Display the overall accuracy comparison
print("\nOverall Accuracy Comparison:")
for stock in stocks:
    print(f"{stock} - Without Sentiment: {accuracy_without_sentiment[stock]:.2%}, With Sentiment: {accuracy_with_sentiment[stock]:.2%}")



Processing stock: AMD
Accuracy without sentiment for AMD: 47.62%
Classification Report without Sentiment:
              precision    recall  f1-score   support

        Down       0.48      1.00      0.65       160
          Up       0.00      0.00      0.00       176

    accuracy                           0.48       336
   macro avg       0.24      0.50      0.32       336
weighted avg       0.23      0.48      0.31       336

Accuracy with sentiment for AMD: 47.62%
Classification Report with Sentiment:
              precision    recall  f1-score   support

        Down       0.48      1.00      0.65       160
          Up       0.00      0.00      0.00       176

    accuracy                           0.48       336
   macro avg       0.24      0.50      0.32       336
weighted avg       0.23      0.48      0.31       336


Processing stock: NVDA
Accuracy without sentiment for NVDA: 42.74%
Classification Report without Sentiment:
              precision    recall  f1-score   suppor

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize