In [None]:
import random
import os
import numpy as np 
import pandas as pd 
import requests
import pandas_datareader as web

# Date
import datetime as dt
from datetime import date, timedelta, datetime

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error,explained_variance_score, r2_score, mean_absolute_percentage_error
import math


# Modeling and preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import GradientBoostingRegressor
from tensorflow.keras.layers import GRU, Dense


from sklearn.tree import DecisionTreeRegressor
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from sklearn.base import BaseEstimator


import warnings
warnings.filterwarnings("ignore")

Daily data forecast 

In [None]:
# Daily dataset
# NOTE(review): parse_dates=True is given without index_col, so it presumably only
# affects the index; the 'Dates' column is parsed explicitly in a later cell.
df = pd.read_csv('commodities_DAILY.csv', parse_dates=True)
display(df)

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# True when at least one cell of the frame is missing.
df.isnull().values.any()

In [None]:
# Parse the 'Dates' column once and derive the calendar features from the parsed values.
parsed_dates = pd.to_datetime(df['Dates'])
df['Dates'] = parsed_dates

# Day of the week: 0 = Monday, 1 = Tuesday, ..., 6 = Sunday
weekday = parsed_dates.dt.dayofweek
df['DayOfWeek'] = weekday

# Weekend flag: 1 for Saturday (5) or Sunday (6), 0 for every other day
df['IsWeekend'] = weekday.isin([5, 6]).astype(int)

# Preview the augmented frame
print(df.head())

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Both inputs are flattened before comparison, so a column vector of shape
    (n, 1) and a flat vector of shape (n,) are compared element by element
    instead of being broadcast into an (n, n) matrix.

    Parameters
    ----------
    y_true : array-like
        Observed values. Zero entries lead to a division by zero (inf/nan result).
    y_pred : array-like
        Predicted values; must hold the same number of elements as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    if y_true.size != y_pred.size:
        raise ValueError(
            f"y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

# Last-price columns that are each used in turn as the regression target.
columns_of_interest = [
    'CL1_PX_LAST',
    'NG1_PX_LAST',
    'HO1_PX_LAST',
    'W1_PX_LAST',
    'C1_PX_LAST',
    'S1_PX_LAST',
    'BO1_PX_LAST',
    'HG1_PX_LAST',
    'GC1_PX_LAST',
    'CT1_PX_LAST',
    'LC1_PX_LAST',
]


In [None]:
# Turn the parsed 'Dates' column into a numeric feature (Unix epoch seconds).
# The dtype guard makes the cell safe to re-run: once the column is numeric it is
# left untouched instead of being divided by 10**9 a second time.
# 'int64' is spelled out so the result does not depend on the platform's default int.
if pd.api.types.is_datetime64_any_dtype(df['Dates']):
    df['Dates'] = df['Dates'].astype('int64') / 10**9  # Convert nanoseconds to seconds

Model LR

In [None]:
def train_and_predict(df, target_column):
    """Fit a linear regression for one target column and print hold-out metrics.

    Every column of ``df`` except ``target_column`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    target_column : str
        Name of the column to predict.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    # NOTE(review): the 80/20 split is shuffled; for date-ordered rows this mixes
    # past and future observations between train and test - confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    lr_model = LinearRegression()
    lr_model.fit(X_train, y_train)
    predictions = lr_model.predict(X_test)
    mape = mean_absolute_percentage_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    # Label the metrics so the output of the loop below can be attributed to a target
    print(f"Target Column: {target_column}")
    print("Mean Squared Error:", mse)
    print("R-squared (R2):", r2)
    print("Mean Absolute Percentage Error:", mape)


for column in columns_of_interest:
    train_and_predict(df, column)

Model RF

In [None]:
def train_and_predict(df, target_column):
    """Grid-search a random forest for one target column and print hold-out metrics.

    Every column of ``df`` except ``target_column`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    target_column : str
        Name of the column to predict.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Seed the forest so the search result is reproducible across runs
    rf_model = RandomForestRegressor(random_state=42)
    # Specify hyperparameters to tune
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
    }
    # Perform grid search with 5-fold cross-validation
    grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # The search refits the best configuration on the full training set
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    # Evaluate the model
    mape = mean_absolute_percentage_error(y_test, predictions)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    # Label the metrics so the output of the loop below can be attributed to a target
    print(f"Target Column: {target_column}")
    print("Best Hyperparameters:", best_params)
    print("Mean Squared Error:", mse)
    print("R-squared (R2):", r2)
    print("Mean Absolute Percentage Error:", mape)

for column in columns_of_interest:
    train_and_predict(df, column)

Model DT

In [None]:
def train_and_predict(df, target_column):
    """Grid-search a decision tree for one target column and print hold-out metrics.

    Every column of ``df`` except ``target_column`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    target_column : str
        Name of the column to predict.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    param_grid = {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }
    # Seed the tree so repeated runs give the same result
    dt_model = DecisionTreeRegressor(random_state=42)
    # Perform GridSearchCV for hyperparameter tuning
    grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # The search already refits the best configuration on the training set,
    # so no second manual fit is needed.
    best_dt_model = grid_search.best_estimator_
    predictions = best_dt_model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)
    # Print the plain column name (the former print({target_column}) showed a set repr)
    print(f"Target Column: {target_column}")
    print(f"Best Hyperparameters: {best_params}")
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    print("Mean Absolute Percentage Error:", mape)

for column in columns_of_interest:
    train_and_predict(df, column)

Model GBR

In [None]:
def train_and_predict(df, target_column):
    """Grid-search a gradient boosting regressor for one target and print hold-out metrics.

    Every column of ``df`` except ``target_column`` is used as a feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    target_column : str
        Name of the column to predict.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Create the Gradient Boosting Regressor
    gb_regressor = GradientBoostingRegressor(random_state=42)
    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'subsample': [0.8, 0.9, 1.0],
    }
    grid_search = GridSearchCV(estimator=gb_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # best_estimator_ is the seeded regressor refitted on X_train with best_params,
    # so a second manual fit would only repeat the same training.
    final_gb_model = grid_search.best_estimator_
    # Make predictions on the test set
    predictions = final_gb_model.predict(X_test)
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)
    # Print the plain column name (the former print({target_column}) showed a set repr)
    print(f"Target Column: {target_column}")
    print("Best Hyperparameters:", best_params)
    print("Mean Squared Error:", mse)
    print("R-squared (R2):", r2)
    print("Mean Absolute Percentage Error:", mape)


for column in columns_of_interest:
    train_and_predict(df, column)


Model kNN

In [None]:
def train_and_predict(df, target_column):
    """Grid-search a k-nearest-neighbours regressor for one target and print metrics.

    Features are min-max scaled before the distance-based model sees them. The scaler
    is fitted on the training rows only and then applied to the test rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column and the feature columns.
    target_column : str
        Name of the column to predict.
    """
    X = df.drop(target_column, axis=1)
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # Normalize the data. Previously the scaled matrix was computed but the unscaled
    # X was split and used, so the normalisation never reached the model.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    param_grid = {
        'n_neighbors': np.arange(1, 21),
        'weights': ['uniform', 'distance']
    }
    knn_model = KNeighborsRegressor()
    grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_scaled, y_train)
    best_knn_model = grid_search.best_estimator_
    predictions = best_knn_model.predict(X_test_scaled)
    # Evaluate the model
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    mape = mean_absolute_percentage_error(y_test, predictions)
    # Print the plain column name (the former print({target_column}) showed a set repr)
    print(f"Target Column: {target_column}")
    print("Best Hyperparameters:", grid_search.best_params_)
    print(f"Mean Squared Error: {mse}")
    print(f"R-squared: {r2}")
    print("Mean Absolute Percentage Error:", mape)


for column in columns_of_interest:
    train_and_predict(df, column)

Models LSTM/GRU/DNN

In [None]:
class KerasLSTMRegressor(BaseEstimator):
    """Scikit-learn style wrapper around a small Keras network with one input feature.

    ``model_type`` selects the architecture: 'LSTM', 'GRU' (one recurrent layer) or
    'DNN' (two dense layers), each followed by a single-unit output layer.

    The network is built inside ``fit`` rather than ``__init__``. GridSearchCV clones
    the estimator and then applies candidate values with ``set_params``; a network
    built in ``__init__`` would keep the constructor's ``units`` and ignore the grid.

    Parameters
    ----------
    model_type : str
        'LSTM', 'GRU' or 'DNN'.
    units : int
        Width of the hidden layer(s).
    epochs : int
        Number of training epochs.
    batch_size : int
        Mini-batch size used for training.
    verbose : int
        Verbosity passed to the Keras fit/predict/evaluate calls.
    """

    def __init__(self, model_type='LSTM', units=50, epochs=50, batch_size=32, verbose=0):
        self.model_type = model_type
        self.units = units
        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        # Built lazily in fit() so that it reflects the current hyperparameters
        self.model = None

    def _build_model(self):
        """Create and compile the network for the current hyperparameters.

        Raises
        ------
        ValueError
            If ``model_type`` is not one of 'LSTM', 'GRU', 'DNN'.
        """
        model = Sequential()
        if self.model_type == 'LSTM':
            model.add(LSTM(units=self.units, input_shape=(1, 1)))
        elif self.model_type == 'GRU':
            model.add(GRU(units=self.units, input_shape=(1, 1)))
        elif self.model_type == 'DNN':
            model.add(Dense(units=self.units, input_shape=(1,)))
            model.add(Dense(units=self.units))
        else:
            raise ValueError(
                f"Unknown model_type {self.model_type!r}; expected 'LSTM', 'GRU' or 'DNN'"
            )
        model.add(Dense(units=1))
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    def _prepare_inputs(self, X):
        """Reshape X to the input shape the selected architecture was built with."""
        X = np.asarray(X)
        if self.model_type == 'DNN':
            # Dense input layer is declared with input_shape=(1,)
            return X.reshape((X.shape[0], 1))
        # Recurrent layers are declared with input_shape=(1, 1): (timesteps, features)
        return X.reshape((X.shape[0], 1, 1))

    def fit(self, X, y):
        """Build a fresh network from the current parameters and train it; returns self."""
        self.model = self._build_model()
        self.model.fit(self._prepare_inputs(X), y, epochs=self.epochs, batch_size=self.batch_size, verbose=self.verbose)
        return self

    def predict(self, X):
        """Return the network's predictions for X (requires a prior call to fit)."""
        return self.model.predict(self._prepare_inputs(X), verbose=self.verbose)

    def score(self, X, y):
        """Return the negated evaluation loss (MSE), so that higher is better."""
        return -self.model.evaluate(self._prepare_inputs(X), y, verbose=self.verbose)

def _search_and_evaluate(model_type, param_grid, X_train, y_train, X_test, y_test, scaler):
    """Grid-search one architecture, refit it with the best parameters and score it.

    Returns a tuple ``(best_params, mse, r2, mape)``. Predictions and targets are
    mapped back to the original price scale with ``scaler`` before scoring.
    """
    base_regressor = KerasLSTMRegressor(model_type=model_type, epochs=50, batch_size=32, verbose=0)
    grid_search = GridSearchCV(estimator=base_regressor, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    # Re-create the regressor from the winning parameters and train it on the full training set
    best_regressor = KerasLSTMRegressor(model_type=model_type, units=best_params['units'], epochs=best_params['epochs'], batch_size=best_params['batch_size'], verbose=0)
    best_regressor.fit(X_train, y_train)
    # Undo the min-max scaling: the scaled target contains an exact 0 (its minimum),
    # which makes a percentage error on the scaled values blow up, and the unscaled
    # MSE is comparable with the other models in this notebook.
    predictions = scaler.inverse_transform(np.asarray(best_regressor.predict(X_test)).reshape(-1, 1))
    actual = scaler.inverse_transform(np.asarray(y_test).reshape(-1, 1))
    mse = mean_squared_error(actual, predictions)
    r2 = r2_score(actual, predictions)
    mape = mean_absolute_percentage_error(actual, predictions)
    return best_params, mse, r2, mape


def train_and_predict(df, target_column):
    """Tune and evaluate LSTM, GRU and DNN regressors that predict a value from its predecessor.

    The target column is min-max scaled; each sample pairs the value in one row (input)
    with the value in the next row (output). Metrics are reported on the original scale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the target column.
    target_column : str
        Name of the column to model.
    """
    target = df[target_column].values.reshape(-1, 1)
    scaler = MinMaxScaler()
    scaled_target = scaler.fit_transform(target)
    # NOTE(review): the split is shuffled, so neighbouring rows end up on both sides
    # of the train/test boundary - confirm this is intended for ordered data.
    X_train, X_test, y_train, y_test = train_test_split(scaled_target[:-1], scaled_target[1:], test_size=0.2, random_state=42)

    param_grid = {
        'units': [50, 100, 150],  # Adjust units for LSTM, GRU, and DNN
        'epochs': [50, 100, 150],  # Adjust epochs
        'batch_size': [32, 64, 128]  # Adjust batch_size
    }

    # Same procedure for each architecture, in the order LSTM -> GRU -> DNN
    results = {}
    for model_type in ('LSTM', 'GRU', 'DNN'):
        results[model_type] = _search_and_evaluate(model_type, param_grid, X_train, y_train, X_test, y_test, scaler)

    # Print the best parameters and results for LSTM, GRU, and DNN
    print(f"Target Column: {target_column}")
    for model_type, (best_params, mse, r2, mape) in results.items():
        print(f"Best parameters for {model_type}:", best_params)
        print(f"MSE for {model_type}: {mse}, R-squared for {model_type}: {r2}, MAPE:", mape)
    print()

for column in columns_of_interest:
    train_and_predict(df, column)

Price prediction with sentiment features

In [None]:
!pip install praw
import praw
from transformers import pipeline, RobertaForSequenceClassification, RobertaTokenizer, BertForSequenceClassification, BertTokenizer, DistilBertForSequenceClassification, DistilBertTokenizer

# Reddit API credentials are read from the environment so that no secret is stored
# in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running.
# NOTE(review): the key pair that used to be hard-coded here should be treated as
# exposed and revoked.
reddit_client_id = os.environ['REDDIT_CLIENT_ID']
reddit_client_secret = os.environ['REDDIT_CLIENT_SECRET']
reddit_user_agent = os.environ.get('REDDIT_USER_AGENT', 'MytestApp/1.0 by IndependenceNew2283')

# Authenticate with Reddit API using PRAW
reddit = praw.Reddit(client_id=reddit_client_id,
                     client_secret=reddit_client_secret,
                     user_agent=reddit_user_agent)

# Define the subreddit and keywords related to commodities
subreddit_name = 'commodities'
keywords = ['gold', 'silver', 'oil', 'copper', 'soybean', 'cotton', 'cattle', 'wheat', 'corn', 'gas', 'commodity', 'commodities', 'stock market']

# Fetch submissions from the subreddit and filter by keywords
subreddit = reddit.subreddit(subreddit_name)

# Upper bound on the number of submissions to keep
total_submissions = 400000

# Extract data from submissions. The search is issued once and paged by PRAW;
# repeating the identical query in a loop only returned the same newest posts again.
# NOTE(review): Reddit listings are presumably capped well below this bound
# (around 1000 items) - confirm against the API documentation.
data = {'title': [], 'body': [], 'created_utc': []}
seen_ids = set()
for submission in subreddit.search(' OR '.join(keywords), sort='new', limit=None):
    if submission.id in seen_ids:
        continue  # never store the same submission twice
    seen_ids.add(submission.id)
    data['title'].append(submission.title)
    data['body'].append(submission.selftext)
    data['created_utc'].append(submission.created_utc)
    if len(seen_ids) >= total_submissions:
        break

# Create a DataFrame from the extracted data
reddit_data = pd.DataFrame(data)

In [None]:
# Build one sentiment-analysis pipeline per checkpoint.
# NOTE(review): 'roberta-base', 'bert-base-uncased' and 'distilbert-base-uncased' look
# like base checkpoints rather than sentiment fine-tuned ones, so the classification
# heads are presumably freshly initialised - confirm before trusting the labels.
roberta_sentiment_model = pipeline(
    'sentiment-analysis',
    model=RobertaForSequenceClassification.from_pretrained('roberta-base'),
    tokenizer=RobertaTokenizer.from_pretrained('roberta-base'),
)
bert_sentiment_model = pipeline(
    'sentiment-analysis',
    model=BertForSequenceClassification.from_pretrained('bert-base-uncased'),
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
)
distilbert_sentiment_model = pipeline(
    'sentiment-analysis',
    model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased'),
    tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased'),
)

# Keep only the first max_seq_length characters of every body text
max_seq_length = 512
reddit_data['body'] = reddit_data['body'].apply(lambda text: text[:max_seq_length])


def _top_labels(classifier, texts):
    """Return, for each text, the label of the classifier's first result."""
    return texts.apply(lambda text: classifier(text)[0]['label'])


# Label titles and bodies with RoBERTa, BERT, and DistilBERT (in that order)
for model_name, classifier in (
    ('roberta', roberta_sentiment_model),
    ('bert', bert_sentiment_model),
    ('distilbert', distilbert_sentiment_model),
):
    for field in ('title', 'body'):
        reddit_data[f'{field}_sentiment_{model_name}'] = _top_labels(classifier, reddit_data[field])

print(reddit_data)

In [None]:
# Interpret 'created_utc' as epoch seconds (UTC-aware), then express it in the GMT zone.
created_at = pd.to_datetime(reddit_data['created_utc'], unit='s', utc=True)
reddit_data['created_utc'] = created_at.dt.tz_convert('GMT')

In [None]:
# Map the raw pipeline labels to sentiment scores stored as strings:
# LABEL_0 -> '-1' (negative), LABEL_1 -> '0' (neutral), LABEL_2 -> '1' (positive).
# This is the same mapping the Google News cell uses; the previous mapping here
# swapped the negative and neutral codes.
label_mapping = {'LABEL_0': '-1', 'LABEL_1': '0', 'LABEL_2': '1'}

for model in ['roberta', 'bert', 'distilbert']:
    reddit_data[f'title_sentiment_{model}'] = reddit_data[f'title_sentiment_{model}'].map(label_mapping)
    reddit_data[f'body_sentiment_{model}'] = reddit_data[f'body_sentiment_{model}'].map(label_mapping)

# Visualize Sentiment Distribution
labels = ['-1', '0', '1']
# Numeric bar positions: both bar groups share one numeric axis and the sentiment
# labels are attached as tick labels, instead of mixing string and numeric x values.
positions = np.arange(len(labels))

# Plotting
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

bar_width = 0.35  # Adjust the width of the bars
for i, model in enumerate(['roberta', 'bert', 'distilbert']):
    # Title Sentiment
    title_sentiment_counts = reddit_data[f'title_sentiment_{model}'].value_counts().reindex(labels, fill_value=0)
    axs[i].bar(positions, title_sentiment_counts, color='skyblue', label=f'Title ({model.capitalize()})', width=bar_width)

    # Body Sentiment, drawn next to the title bars
    body_sentiment_counts = reddit_data[f'body_sentiment_{model}'].value_counts().reindex(labels, fill_value=0)
    axs[i].bar(positions + bar_width, body_sentiment_counts, color='#CBC3E3', alpha=0.7, label=f'Body ({model.capitalize()})', width=bar_width)

    # Centre each tick between the title bar and the body bar of its group
    axs[i].set_xticks(positions + bar_width / 2)
    axs[i].set_xticklabels(labels)
    axs[i].set_xlabel('Sentiment')
    axs[i].set_ylabel('Count')
    axs[i].set_title(f'{model.capitalize()} Sentiment Distribution')
    axs[i].legend()  # Add legend to each subplot

fig.suptitle('Sentiment Analysis on Reddit Data')
plt.tight_layout()
plt.show()


In [None]:
# Save the DataFrame to a CSV file (without the index column)
reddit_data.to_csv('reddit_data.csv', index=False)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

price_data = pd.read_csv('commodities_DAILY.csv')
sentiment_data = pd.read_csv('reddit_data.csv')

# Reduce both timestamp columns to plain calendar dates so they can be joined.
# Each column is parsed exactly once; the former second pass with unit='s' ran on
# values that were already datetimes.
price_data['Dates'] = pd.to_datetime(price_data['Dates']).dt.date
sentiment_data['created_utc'] = pd.to_datetime(sentiment_data['created_utc']).dt.date

# Verify the conversion
print(price_data.head())

# Merge on the date part (every post keeps its row; prices are attached when available)
merged_data = pd.merge(sentiment_data, price_data, left_on='created_utc', right_on='Dates', how='left')

# Drop every row with a missing value in any column (same result as dropping
# column by column).
# NOTE(review): this also discards posts whose 'body' is empty, although 'body'
# is not used as a feature below - confirm this is intended.
merged_data = merged_data.dropna()

# Define columns of interest
columns_of_interest = ['CL1_PX_LAST', 'NG1_PX_LAST', 'HO1_PX_LAST', 'W1_PX_LAST', 'C1_PX_LAST', 'S1_PX_LAST',
                        'BO1_PX_LAST', 'HG1_PX_LAST', 'GC1_PX_LAST', 'CT1_PX_LAST', 'LC1_PX_LAST']

def train_and_predict(df, target_column):
    """Fit a random forest and a decision tree on sentiment + volatility features.

    The model inputs are the three title-sentiment columns (missing values filled
    with 0) and the standardised 10D/20D volatility columns; all other columns of
    ``df`` are dropped by the column transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged sentiment/price frame. It is now actually used; the function
        previously read the global ``merged_data`` regardless of this argument.
    target_column : str
        Name of the price column to predict.
    """
    # Select features and target variable
    features = df.drop(['title', 'body', 'created_utc'] + [target_column], axis=1)
    target = df[target_column]

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    sentiment_columns = ['title_sentiment_roberta', 'title_sentiment_bert', 'title_sentiment_distilbert']
    tickers = ['CL1', 'NG1', 'HO1', 'W1', 'C1', 'S1', 'BO1', 'HG1', 'GC1', 'CT1', 'LC1']
    # <ticker>_VOLATILITY_10D followed by <ticker>_VOLATILITY_20D for every ticker
    volatility_columns = [f'{ticker}_VOLATILITY_{window}' for ticker in tickers for window in ('10D', '20D')]

    # Define preprocessing steps
    preprocessor = ColumnTransformer(
        transformers=[
            ('sentiment', SimpleImputer(strategy='constant', fill_value=0), sentiment_columns),
            ('numeric', StandardScaler(), volatility_columns),
        ])

    # Define the models (display name -> estimator), evaluated in this order
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
    }

    # Create, fit and evaluate one pipeline per model
    for model_name, estimator in models.items():
        model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                         ('model', estimator)])
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'{target_column} - {model_name} - Mean Squared Error: {mse}')
        print(f'{target_column} - {model_name} - R-squared (R2) Score: {r2}')

    print('-' * 50)


for column in columns_of_interest:
    train_and_predict(merged_data, column)

def train_and_predict_lstm(df, target_column):
    """Train a two-layer LSTM on the numeric features of ``df`` and print hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged sentiment/price frame. It is now actually used; the function
        previously read the global ``merged_data`` regardless of this argument.
    target_column : str
        Name of the price column to predict.
    """
    # EarlyStopping is not imported anywhere else in the notebook, so it is imported
    # here to keep this function runnable on a fresh kernel.
    from keras.callbacks import EarlyStopping

    features = df.drop(['title', 'body', 'created_utc'] + [target_column], axis=1)
    target = df[target_column]
    # Only numeric columns can be fed to the network
    features_array = features.select_dtypes(include=[np.number]).values
    target_array = target.values.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(features_array, target_array, test_size=0.2, random_state=42)

    # Fit the scaler on the training rows only, then apply it to the test rows,
    # so no test-set statistics reach the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Shape (samples, 1 timestep, features) as expected by the LSTM layers
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

    # Define LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))

    model.compile(optimizer='adam', loss='mean_squared_error')
    # Stop once the validation loss has not improved for 5 epochs and keep the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping], verbose=1)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{target_column} - LSTM - Mean Squared Error: {mse}')
    print(f'{target_column} - LSTM - R-squared (R2) Score: {r2}')
    print('-' * 50)

for column in columns_of_interest:
    train_and_predict_lstm(merged_data, column)


In [None]:
!pip install feedparser
import feedparser
import urllib.parse

# Define keywords related to commodities (each keyword is queried once)
keywords = ['gold', 'silver', 'oil', 'copper', 'soybean', 'cotton', 'cattle', 'wheat', 'corn', 'gas', 'commodity', 'commodities', 'stock market']

# Fetch news articles related to commodities from Google News.
# total_lines is an upper bound on the number of rows to keep. Every feed is read
# once: re-reading the same feeds until the bound was reached only appended
# duplicates of the same entries and never finished when the feeds were empty.
news_data = {'title': [], 'summary': [], 'published': []}
total_lines = 500000
seen_entries = set()

for keyword in keywords:
    if len(seen_entries) >= total_lines:
        break
    # Encode keyword for URL
    encoded_keyword = urllib.parse.quote_plus(keyword)
    feed_url = f'https://news.google.com/rss/search?q={encoded_keyword}&hl=en-US&gl=US&ceid=US:en'
    feed = feedparser.parse(feed_url)
    for entry in feed.entries:
        # Identify an article by its id/link (falling back to title + date) so that
        # an article returned for several keywords is stored only once
        entry_key = entry.get('id') or entry.get('link') or (entry.get('title'), entry.get('published'))
        if entry_key in seen_entries:
            continue
        seen_entries.add(entry_key)
        # .get() tolerates entries that lack one of the fields
        news_data['title'].append(entry.get('title', ''))
        news_data['summary'].append(entry.get('summary', ''))
        news_data['published'].append(entry.get('published'))
        if len(seen_entries) >= total_lines:
            break

lines_fetched = len(news_data['title'])

# Create a DataFrame from the extracted news data
news_df = pd.DataFrame(news_data)


In [None]:
# Build one sentiment-analysis pipeline per checkpoint.
# NOTE(review): 'roberta-base', 'bert-base-uncased' and 'distilbert-base-uncased' look
# like base checkpoints rather than sentiment fine-tuned ones, so the classification
# heads are presumably freshly initialised - confirm before trusting the labels.
roberta_sentiment_model = pipeline(
    'sentiment-analysis',
    model=RobertaForSequenceClassification.from_pretrained('roberta-base'),
    tokenizer=RobertaTokenizer.from_pretrained('roberta-base'),
)
bert_sentiment_model = pipeline(
    'sentiment-analysis',
    model=BertForSequenceClassification.from_pretrained('bert-base-uncased'),
    tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'),
)
distilbert_sentiment_model = pipeline(
    'sentiment-analysis',
    model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased'),
    tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased'),
)

# Keep only the first max_seq_length characters of every summary
max_seq_length = 512
news_df['summary'] = news_df['summary'].apply(lambda text: text[:max_seq_length])


def _top_labels(classifier, texts):
    """Return, for each text, the label of the classifier's first result."""
    return texts.apply(lambda text: classifier(text)[0]['label'])


# Label titles and summaries with RoBERTa, BERT, and DistilBERT (in that order)
for model_name, classifier in (
    ('roberta', roberta_sentiment_model),
    ('bert', bert_sentiment_model),
    ('distilbert', distilbert_sentiment_model),
):
    for field in ('title', 'summary'):
        news_df[f'{field}_sentiment_{model_name}'] = _top_labels(classifier, news_df[field])

# Display the DataFrame with sentiment analysis results
print(news_df)

In [None]:
# Map the raw pipeline labels to sentiment scores stored as strings
label_mapping = {'LABEL_0': '-1', 'LABEL_1': '0', 'LABEL_2': '1'}

# Map sentiment labels in the DataFrame
for model in ['roberta', 'bert', 'distilbert']:
    news_df[f'title_sentiment_{model}'] = news_df[f'title_sentiment_{model}'].map(label_mapping)
    news_df[f'summary_sentiment_{model}'] = news_df[f'summary_sentiment_{model}'].map(label_mapping)

# Visualize Sentiment Distribution
labels = ['-1', '0', '1']
# Numeric bar positions: both bar groups share one numeric axis and the sentiment
# labels are attached as tick labels, instead of mixing string and numeric x values.
positions = np.arange(len(labels))

fig, axs = plt.subplots(1, 3, figsize=(18, 6))

bar_width = 0.35
for i, model in enumerate(['roberta', 'bert', 'distilbert']):
    # Title sentiment counts, in the fixed label order
    title_sentiment_counts = news_df[f'title_sentiment_{model}'].value_counts().reindex(labels, fill_value=0)
    axs[i].bar(positions, title_sentiment_counts, color='skyblue', label=f'Title ({model.capitalize()})', width=bar_width)
    # Summary sentiment counts, drawn next to the title bars
    summary_sentiment_counts = news_df[f'summary_sentiment_{model}'].value_counts().reindex(labels, fill_value=0)
    axs[i].bar(positions + bar_width, summary_sentiment_counts, color='#CBC3E3', alpha=0.7, label=f'Summary ({model.capitalize()})', width=bar_width)

    # Centre each tick between the title bar and the summary bar of its group
    axs[i].set_xticks(positions + bar_width / 2)
    axs[i].set_xticklabels(labels)
    axs[i].set_xlabel('Sentiment')
    axs[i].set_ylabel('Count')
    axs[i].set_title(f'{model.capitalize()} Sentiment Distribution')
    axs[i].legend()

fig.suptitle('Sentiment Analysis on Google News Data')

plt.tight_layout()
plt.show()

In [None]:
# Save the labelled news frame to a CSV file (without the index column)
news_df.to_csv('news_df.csv', index=False)

In [None]:
price_data = pd.read_csv('commodities_DAILY.csv')
sentiment_data = pd.read_csv('news_df.csv')

# 'published' holds the date strings taken from the RSS entries, not epoch seconds,
# so it is parsed as text (no unit='s'). Unparseable values become NaT and are
# removed by the dropna below because they match no price row.
published_at = pd.to_datetime(sentiment_data['published'], utc=True, errors='coerce')
# Extract date part only from 'published'
sentiment_data['published'] = published_at.dt.date
# 'Dates' is read from CSV as text and must be parsed before .dt can be used;
# it is then reduced to the same plain-date format as 'published'
price_data['Dates'] = pd.to_datetime(price_data['Dates']).dt.date
# Verify the conversion
print(price_data.head())
# Merge on the date part
merged_data = pd.merge(sentiment_data, price_data, left_on='published', right_on='Dates', how='left')
# Check columns of the merged data
print("merged_data columns:", merged_data.columns)

columns_of_interest = ['CL1_PX_LAST', 'NG1_PX_LAST', 'HO1_PX_LAST', 'W1_PX_LAST', 'C1_PX_LAST', 'S1_PX_LAST',
                        'BO1_PX_LAST', 'HG1_PX_LAST', 'GC1_PX_LAST', 'CT1_PX_LAST', 'LC1_PX_LAST']

# Drop rows that have a missing value in any of the price columns of interest
# (articles published on a date without price data)
merged_data = merged_data.dropna(subset=columns_of_interest)

def train_and_predict(df, target_column):
    """Fit a random forest and a decision tree on news sentiment + volatility features.

    The model inputs are the three title-sentiment columns (missing values filled
    with 0) and the standardised 10D/20D volatility columns; all other columns of
    ``df`` are dropped by the column transformer.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged news/price frame. It is now actually used; the function
        previously read the global ``merged_data`` regardless of this argument.
    target_column : str
        Name of the price column to predict.
    """
    features = df.drop(['title', 'summary', 'published'] + [target_column], axis=1)
    target = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

    sentiment_columns = ['title_sentiment_roberta', 'title_sentiment_bert', 'title_sentiment_distilbert']
    tickers = ['CL1', 'NG1', 'HO1', 'W1', 'C1', 'S1', 'BO1', 'HG1', 'GC1', 'CT1', 'LC1']
    # <ticker>_VOLATILITY_10D followed by <ticker>_VOLATILITY_20D for every ticker
    volatility_columns = [f'{ticker}_VOLATILITY_{window}' for ticker in tickers for window in ('10D', '20D')]

    preprocessor = ColumnTransformer(
        transformers=[
            ('sentiment', SimpleImputer(strategy='constant', fill_value=0), sentiment_columns),
            ('numeric', StandardScaler(), volatility_columns),
        ])

    # Display name -> estimator, evaluated in this order
    models = {
        'Random Forest': RandomForestRegressor(n_estimators=200, random_state=42),
        'Decision Tree': DecisionTreeRegressor(random_state=42),
    }

    # Create, fit and evaluate one pipeline per model
    for model_name, estimator in models.items():
        model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                         ('model', estimator)])
        model_pipeline.fit(X_train, y_train)
        y_pred = model_pipeline.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f'{target_column} - {model_name} - Mean Squared Error: {mse}')
        print(f'{target_column} - {model_name} - R-squared (R2) Score: {r2}')

    print('-' * 50)

for column in columns_of_interest:
    train_and_predict(merged_data, column)


def train_and_predict_lstm(df, target_column):
    """Train a two-layer LSTM on the numeric features of ``df`` and print hold-out metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged news/price frame. It is now actually used; the function
        previously read the global ``merged_data`` regardless of this argument.
    target_column : str
        Name of the price column to predict.
    """
    # EarlyStopping is not imported anywhere else in the notebook, so it is imported
    # here to keep this function runnable on a fresh kernel.
    from keras.callbacks import EarlyStopping

    features = df.drop(['title', 'summary', 'published'] + [target_column], axis=1)
    target = df[target_column]

    # Convert to numpy arrays (numeric feature columns only)
    features_array = features.select_dtypes(include=[np.number]).values
    target_array = target.values.reshape(-1, 1)

    X_train, X_test, y_train, y_test = train_test_split(features_array, target_array, test_size=0.2, random_state=42)

    # Normalize features: the scaler is fitted on the training rows only and then
    # applied to the test rows, so no test-set statistics reach the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Shape (samples, 1 timestep, features) as expected by the LSTM layers
    X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
    X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

    # Define LSTM model
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Stop once the validation loss has not improved for 5 epochs and keep the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.1, callbacks=[early_stopping], verbose=1)

    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f'{target_column} - LSTM - Mean Squared Error: {mse}')
    print(f'{target_column} - LSTM - R-squared (R2) Score: {r2}')
    print('-' * 50)

for column in columns_of_interest:
    train_and_predict_lstm(merged_data, column)