In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler

# Preprocessing the dataset

This part of the code is essentially preparing the data for a machine learning model, transforming the text data into numerical form, and splitting the data into training and testing sets.

The combined dataset is loaded from a CSV file using pandas' read_csv function and all news headlines for each record (each day) are concatenated into a single string. A CountVectorizer is initialized to convert the headlines into a matrix of token counts. The maximum number of features is set to 5000, but this can be adjusted based on computational capacity. A LabelEncoder is used to prepare the output matrix (Y) by transforming the labels into normalized encoding.

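The dataset is split into training and testing sets with a random 80/20 split (`train_test_split` with a fixed `random_state`). The variables X_train, X_test, Y_train, and Y_test are defined in a later cell, which splits the X and Y matrices row-wise into training and testing sets. Note that each row is one day, so a random split mixes earlier and later days across the two sets; splitting by date range instead would keep the test period strictly after the training period.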

In [None]:
# Read the combined news/DJIA table from disk.
data = pd.read_csv('Datasets/Combined_News_DJIA.csv')

# Build one text document per row: every column from the third onward is
# joined with single spaces, and missing cells contribute an empty string.
headline_columns = data.iloc[:, 2:].fillna('')
data['All_Headlines'] = headline_columns.apply(' '.join, axis=1)

# Bag-of-words features: token counts, capped at 5000 features, with the
# built-in English stop-word list removed.
# NOTE(review): the vocabulary is fitted on every row, before any train/test
# split is made - confirm this is acceptable for the evaluation.
vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(data['All_Headlines'])

# Target vector: the 'Label' column re-encoded as integer class ids.
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(data['Label'])

# Sanity checks: matrix shapes and a preview of the joined text.
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)
preview_columns = ['Date', 'All_Headlines', 'Label']
print(data[preview_columns].head())

Shape of X: (1989, 5000)
Shape of Y: (1989,)
         Date                                      All_Headlines  Label
0  2008-08-08  b"Georgia 'downs two Russian warplanes' as cou...      0
1  2008-08-11  b'Why wont America and Nato help us? If they w...      1
2  2008-08-12  b'Remember that adorable 9-year-old who sang a...      0
3  2008-08-13  b' U.S. refuses Israel weapons to attack Iran:...      0
4  2008-08-14  b'All the experts admit that we should legalis...      1


In [1]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable. The proportions can be adjusted via test_size.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Feature scaling: X is the sparse matrix produced by CountVectorizer, so
# MaxAbsScaler is used rather than StandardScaler, which is generally avoided
# on sparse data because it can break the sparsity structure.
# The scaler is fitted on the training rows only and then applied to both sets.
scaler = MaxAbsScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the resulting shapes as a quick check of the split and scaling.
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

NameError: name 'train_test_split' is not defined

# Possible Baseline Model
Below is an example extension adding a simple logistic regression model as a baseline.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Baseline classifier: logistic regression (solver allowed up to 1000
# iterations), fitted directly on the scaled training features.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, Y_train)

# Class predictions for the held-out rows.
Y_pred = lr_model.predict(X_test_scaled)

# Overall accuracy followed by the per-class precision/recall breakdown.
test_accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy on Test Set:", test_accuracy)

report = classification_report(Y_test, Y_pred)
print("\nClassification Report:\n", report)

Accuracy on Test Set: 0.46733668341708545

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.43      0.41       171
           1       0.54      0.50      0.52       227

    accuracy                           0.47       398
   macro avg       0.46      0.46      0.46       398
weighted avg       0.47      0.47      0.47       398



# GPT-2 Based Model

We will fine-tune a pre-trained GPT-2 model to predict stock market trends from news headlines, using PyTorch and the Hugging Face Transformers library. The pre-trained `gpt2` checkpoint is loaded with a sequence-classification head and then fine-tuned on the dataset.


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

## Load and Preprocess the Dataset

We need to preprocess the dataset into a format suitable for a GPT model. This involves tokenizing and encoding the headlines, as well as preparing the labels.

In [None]:
# Re-read the CSV and rebuild the joined-headlines column: every column from
# the third onward, missing cells as empty strings, joined with single spaces.
data = pd.read_csv('Datasets/Combined_News_DJIA.csv')
headline_columns = data.iloc[:, 2:].fillna('')
data['All_Headlines'] = headline_columns.apply(' '.join, axis=1)

# Random 80/20 split of whole rows into training and validation frames;
# the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(data, random_state=42, test_size=0.2)

# Define the dataset class
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one headline string per item.

    Args:
        headlines: Indexable collection of texts (each is passed through ``str``).
        labels: Indexable collection of class labels, aligned with ``headlines``.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, headlines, labels, tokenizer, max_len):
        self.headlines, self.labels = headlines, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of items, taken from the headlines collection."""
        return len(self.headlines)

    def __getitem__(self, item):
        """Return the raw text, its flattened encoding and the label tensor."""
        text = str(self.headlines[item])
        target = self.labels[item]

        # Fixed-length encoding: truncate and pad to max_len, return PyTorch tensors.
        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'truncation': True,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'headline_text': text}
        # The encoder output is flattened to 1-D before being returned.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

## Create Data Loaders

This step prepares the data for training and validation by creating a DataLoader instance for each of the two sets.

In [4]:
# Function to create data loaders
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a headlines DataFrame in a ``NewsDataset`` and return a DataLoader.

    Args:
        df: DataFrame with an ``'All_Headlines'`` text column and a ``'Label'``
            column; both are converted to NumPy arrays.
        tokenizer: Tokenizer handed to ``NewsDataset``.
        max_len: Sequence length handed to ``NewsDataset``.
        batch_size: Number of items per batch.
        shuffle: When True the loader reshuffles the items on every pass
            (useful for training). Defaults to False, which keeps the
            original fixed row order.

    Returns:
        A ``DataLoader`` over the dataset.
    """
    ds = NewsDataset(
        headlines=df['All_Headlines'].to_numpy(),
        labels=df['Label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    # num_workers=0 is kept for compatibility.
    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=0)

# Seed PyTorch first: the classification head ('score.weight') is newly
# initialised rather than loaded from the checkpoint, so without a seed every
# run starts from different weights.
torch.manual_seed(42)

# Initialize the tokenizer. It is given the end-of-sequence token as its pad
# token so that padding='max_length' can be used when encoding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

MAX_LEN = 128
BATCH_SIZE = 16

# GPT-2 with a two-class sequence-classification head, moved to the GPU when
# one is available.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = model.config.eos_token_id  # Ensure the model accepts pad_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the values the transformers class used by default
# so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Data loaders
train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Model Training

Now we define the training step. The GPT-2 sequence-classification model and the optimizer were already created in the previous cell; the `train_epoch` function below runs one pass over a data loader, updates the model after every batch, and returns the epoch's accuracy and mean loss.

In [5]:
# Training function
def train_epoch(model, data_loader, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader`` and report epoch metrics.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments; its output must expose ``loss`` and ``logits``.
        data_loader: Iterable of batches, each a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal
        to the number of correct predictions divided by ``n_examples``.
        ``mean_loss`` is the mean of the per-batch losses, or NaN when the
        loader yields no batches.
    """
    model.train()
    losses = []
    # Tensor accumulator (instead of the int 0) so `.double()` below is valid
    # even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in tqdm(data_loader, desc="Training"):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        # Clear gradients *before* the forward/backward pass so that anything
        # left over from an interrupted earlier step cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # Predicted class = index of the largest logit; detached because it is
        # only used for bookkeeping.
        preds = outputs.logits.detach().argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

## Training Process

In [None]:
# Fine-tuning driver: a fixed number of passes over the training loader.
# NOTE(review): only training metrics are reported here; val_data_loader is
# not consulted inside this loop.
EPOCHS = 3

n_train_examples = len(train_df)
for epoch in range(EPOCHS):
    # Epoch banner followed by a ten-dash rule.
    print(f'Epoch {epoch + 1}/{EPOCHS}', '-' * 10, sep='\n')

    epoch_metrics = train_epoch(model, train_data_loader, optimizer, device, n_train_examples)
    train_acc, train_loss = epoch_metrics

    print(f'Train loss {train_loss} accuracy {train_acc}')


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


Training: 100%|██████████| 100/100 [04:11<00:00,  2.52s/it]


Train loss 0.8798575592041016 accuracy 0.5084852294154619
Epoch 2/3
----------


Training: 100%|██████████| 100/100 [04:01<00:00,  2.42s/it]


Train loss 0.6907531183958053 accuracy 0.5474544311753614
Epoch 3/3
----------


Training: 100%|██████████| 100/100 [03:36<00:00,  2.17s/it]

Train loss 0.6525920072197914 accuracy 0.6096794468887492





In [None]:
# Second fine-tuning driver: six passes over the training loader.
# It reuses the existing `model` and `optimizer` objects rather than creating
# new ones, so when run after the previous training cell it continues from the
# weights that cell produced.
EPOCHS = 6

n_train_examples = len(train_df)
for epoch in range(EPOCHS):
    # Epoch banner followed by a ten-dash rule.
    print(f'Epoch {epoch + 1}/{EPOCHS}', '-' * 10, sep='\n')

    epoch_metrics = train_epoch(model, train_data_loader, optimizer, device, n_train_examples)
    train_acc, train_loss = epoch_metrics

    print(f'Train loss {train_loss} accuracy {train_acc}')


Epoch 1/6
----------


Training: 100%|██████████| 100/100 [04:08<00:00,  2.48s/it]


Train loss 0.5994312298297882 accuracy 0.6744186046511628
Epoch 2/6
----------


Training: 100%|██████████| 100/100 [03:43<00:00,  2.24s/it]


Train loss 0.5150315748900175 accuracy 0.7573852922690132
Epoch 3/6
----------


Training: 100%|██████████| 100/100 [03:57<00:00,  2.38s/it]


Train loss 0.37972548872232437 accuracy 0.8372093023255814
Epoch 4/6
----------


Training: 100%|██████████| 100/100 [03:58<00:00,  2.38s/it]


Train loss 0.26156691592186687 accuracy 0.8950345694531741
Epoch 5/6
----------


Training: 100%|██████████| 100/100 [03:43<00:00,  2.24s/it]


Train loss 0.19294259852729737 accuracy 0.9340037712130735
Epoch 6/6
----------


Training: 100%|██████████| 100/100 [03:42<00:00,  2.23s/it]

Train loss 0.1268861471489072 accuracy 0.9566310496543055





In [10]:
# Spot-check of the fine-tuned model on a few rows of the validation frame.
# The sample and its batch-size-1 DataLoader are created once, further down in
# this cell, after `evaluate` has been defined. The duplicate draw that used to
# sit here was overwritten before it was ever used, so it has been removed.

def evaluate(model, data_loader, device):
    """Predict a class for every item in ``data_loader`` without updating the model.

    Args:
        model: Model with an ``eval()`` method; called with ``input_ids`` and
            ``attention_mask`` keyword arguments, and its output must expose
            ``logits``.
        data_loader: Iterable of batches, each a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(predictions, real_values)``: two 1-D CPU tensors, in loader order,
        holding the predicted class index and the true label for each item.
        Both are empty when the loader yields no batches.
    """
    model.eval()  # Put the model in evaluation mode

    batch_predictions = []
    batch_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest logit. Results are kept
            # one tensor per batch and concatenated once at the end.
            batch_predictions.append(outputs.logits.argmax(dim=1).cpu())
            batch_labels.append(batch["labels"].cpu())

    # torch.cat/torch.stack reject an empty list, so handle "no batches" explicitly.
    if not batch_predictions:
        empty = torch.empty(0, dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(batch_predictions), torch.cat(batch_labels)

# Take a small sample of validation rows for a qualitative check. The fixed
# random_state makes the same five rows come up on every run.
test_sample = val_df.sample(n=5, random_state=42).reset_index(drop=True)

# Batch size 1 so that every sampled row is scored on its own.
test_data_loader = create_data_loader(test_sample, tokenizer, MAX_LEN, 1)

# Evaluate the model on the test sample
predictions, real_values = evaluate(model, test_data_loader, device)

# Display each headline text next to the predicted and the actual label
# (label 1 is printed as 'Increase', anything else as 'Decrease').
sample_rows = zip(
    test_sample['All_Headlines'],
    predictions.tolist(),
    real_values.tolist(),
)
for headline, predicted, actual in sample_rows:
    print(f"Headline: {headline}")
    print(f"Predicted sentiment: {'Increase' if predicted == 1 else 'Decrease'}")
    print(f"Actual movement: {'Increase' if actual == 1 else 'Decrease'}")
    print("---")

# Save the model weights.
# NOTE(review): 'path_to_save_model' looks like a placeholder - confirm the
# intended output directory.
model.save_pretrained('path_to_save_model')


Headline: Japan struck by 6.7 magnitude earthquake. After shocks expected. President Barack Obama:''Palestinians deserve an end to occupation'' Vatican officially recognizes 'state of Palestine' in new treaty "Time has come to reexamine cannabis prohibition, Israel's police chief says" Developing reports claim that Pakistan's army not only knew where Osama bin Laden was hiding, but complicit in protecting him since 2006 Tory officials threatened BBC during election - senior BBC executives faced repeated threats of far-reaching reforms if they didnt change election campaign coverage Pope: God will judge you on whether you cared for Earth BBC reporting a coup in Burundi has effectively removed President Nkurunziza. 40 shia muslims gunned down by gunmen in a bus in Pakistan. Chancellor Angela Merkel is set to face further political embarrassment over Berlins spy scandal following new revelations that US intelligence planned to obtain unlimited access to Germanys main internet cable networ

# Revised Final:


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

# Re-read the CSV and rebuild the joined-headlines column: every column from
# the third onward, missing cells as empty strings, joined with single spaces.
data = pd.read_csv('Datasets/Combined_News_DJIA.csv')
headline_columns = data.iloc[:, 2:].fillna('')
data['All_Headlines'] = headline_columns.apply(' '.join, axis=1)

# Random 80/20 split of whole rows into training and validation frames;
# the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(data, random_state=42, test_size=0.2)

# Define a custom Dataset class for tokenization and batching
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one headline string per item.

    Args:
        headlines: Indexable collection of texts (each is passed through ``str``).
        labels: Indexable collection of class labels, aligned with ``headlines``.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is truncated/padded to.
    """

    def __init__(self, headlines, labels, tokenizer, max_len):
        self.headlines, self.labels = headlines, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of items, taken from the headlines collection."""
        return len(self.headlines)

    def __getitem__(self, item):
        """Return the raw text, its flattened encoding and the label tensor."""
        text = str(self.headlines[item])
        target = self.labels[item]

        # Fixed-length encoding: truncate and pad to max_len, return PyTorch tensors.
        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_len,
            'truncation': True,
            'padding': 'max_length',
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(text, **encode_options)

        sample = {'headline_text': text}
        # The encoder output is flattened to 1-D before being returned.
        for key in ('input_ids', 'attention_mask'):
            sample[key] = encoded[key].flatten()
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

# Create data loaders for batch processing
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
    """Wrap a headlines DataFrame in a ``NewsDataset`` and return a DataLoader.

    Args:
        df: DataFrame with an ``'All_Headlines'`` text column and a ``'Label'``
            column; both are converted to NumPy arrays.
        tokenizer: Tokenizer handed to ``NewsDataset``.
        max_len: Sequence length handed to ``NewsDataset``.
        batch_size: Number of items per batch.
        shuffle: When True the loader reshuffles the items on every pass
            (useful for training). Defaults to False, which keeps the
            original fixed row order.

    Returns:
        A ``DataLoader`` over the dataset, loading in the main process
        (``num_workers=0``).
    """
    ds = NewsDataset(
        headlines=df['All_Headlines'].to_numpy(),
        labels=df['Label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(ds, batch_size=batch_size, shuffle=shuffle, num_workers=0)

# Seed PyTorch first: the classification head ('score.weight') is newly
# initialised rather than loaded from the checkpoint, so without a seed every
# run starts from different weights.
torch.manual_seed(42)

# The tokenizer is given the end-of-sequence token as its pad token so that
# padding='max_length' can be used when encoding.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as pad token

MAX_LEN = 128
BATCH_SIZE = 16

# GPT-2 with a two-class sequence-classification head, moved to the GPU when
# one is available.
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)
model.config.pad_token_id = model.config.eos_token_id  # Configuring pad token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are pinned to the values the transformers class used by default
# so the update rule is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

train_data_loader = create_data_loader(train_df, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_df, tokenizer, MAX_LEN, BATCH_SIZE)

# Training function
def train_epoch(model, data_loader, optimizer, device, n_examples):
    """Run one optimisation pass over ``data_loader`` and report epoch metrics.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword arguments; its output must expose ``loss`` and ``logits``.
        data_loader: Iterable of batches, each a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        optimizer: Optimizer that updates the model's parameters.
        device: Device the batch tensors are moved to.
        n_examples: Denominator used for the accuracy.

    Returns:
        ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim float64 tensor equal
        to the number of correct predictions divided by ``n_examples``.
        ``mean_loss`` is the mean of the per-batch losses, or NaN when the
        loader yields no batches.
    """
    model.train()
    losses = []
    # Tensor accumulator (instead of the int 0) so `.double()` below is valid
    # even when the loader yields no batches.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in tqdm(data_loader, desc="Training"):
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        labels = d['labels'].to(device)

        # Clear gradients *before* the forward/backward pass so that anything
        # left over from an interrupted earlier step cannot leak into this update.
        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        # Predicted class = index of the largest logit; detached because it is
        # only used for bookkeeping.
        preds = outputs.logits.detach().argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        losses.append(loss.item())

        loss.backward()
        optimizer.step()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

# Fine-tuning driver: a fixed number of passes over the training loader.
# NOTE(review): only training metrics are reported here; val_data_loader is
# not consulted inside this loop.
EPOCHS = 3
n_train_examples = len(train_df)
for epoch in range(EPOCHS):
    # Epoch banner followed by a ten-dash rule.
    print(f'Epoch {epoch + 1}/{EPOCHS}', '-' * 10, sep='\n')

    epoch_metrics = train_epoch(model, train_data_loader, optimizer, device, n_train_examples)
    train_acc, train_loss = epoch_metrics

    print(f'Train loss {train_loss} accuracy {train_acc}')

# Function to evaluate the model on test data
def evaluate(model, data_loader, device):
    """Predict a class for every item in ``data_loader`` without updating the model.

    Args:
        model: Model with an ``eval()`` method; called with ``input_ids`` and
            ``attention_mask`` keyword arguments, and its output must expose
            ``logits``.
        data_loader: Iterable of batches, each a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the model inputs are moved to.

    Returns:
        ``(predictions, real_values)``: two 1-D CPU tensors, in loader order,
        holding the predicted class index and the true label for each item.
        Both are empty when the loader yields no batches.
    """
    model.eval()

    batch_predictions = []
    batch_labels = []

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            # Predicted class = index of the largest logit. Results are kept
            # one tensor per batch and concatenated once at the end.
            batch_predictions.append(outputs.logits.argmax(dim=1).cpu())
            batch_labels.append(batch["labels"].cpu())

    # torch.cat/torch.stack reject an empty list, so handle "no batches" explicitly.
    if not batch_predictions:
        empty = torch.empty(0, dtype=torch.long)
        return empty, empty.clone()

    return torch.cat(batch_predictions), torch.cat(batch_labels)

# Test the model on a small sample from the validation set. The fixed
# random_state makes the same five rows come up on every run.
test_sample = val_df.sample(n=5, random_state=42).reset_index(drop=True)
test_data_loader = create_data_loader(test_sample, tokenizer, MAX_LEN, 1)  # Batch size 1: each row is scored on its own

predictions, real_values = evaluate(model, test_data_loader, device)

# Print each headline text next to the predicted and the actual label
# (label 1 is printed as 'Increase', anything else as 'Decrease').
sample_rows = zip(
    test_sample['All_Headlines'],
    predictions.tolist(),
    real_values.tolist(),
)
for headline, predicted, actual in sample_rows:
    print(f"Headline: {headline}")
    print(f"Predicted sentiment: {'Increase' if predicted == 1 else 'Decrease'}")
    print(f"Actual movement: {'Increase' if actual == 1 else 'Decrease'}")
    print("---")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
----------


Training:  26%|██▌       | 26/100 [01:04<03:02,  2.46s/it]


KeyboardInterrupt: 