In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler

# Preprocessing the dataset

This part of the code is essentially preparing the data for a machine learning model, transforming the text data into numerical form, and splitting the data into training and testing sets.

The combined dataset is loaded from a CSV file using pandas' read_csv function and all news headlines for each record (each day) are concatenated into a single string. A CountVectorizer is initialized to convert the headlines into a matrix of token counts. The maximum number of features is set to 5000, but this can be adjusted based on computational capacity. A LabelEncoder is used to prepare the output matrix (Y) by transforming the labels into normalized encoding.

The dataset is split into training and testing sets based on specific date ranges. The variables X_train, X_test, Y_train, and Y_test are defined in a later cell, which split the X and Y matrices into training and testing sets based on the indices of the original train and test dataframes.

In [2]:
# Load the combined dataset
data = pd.read_csv('Datasets/Combined_News_DJIA.csv')

# Concatenate all the news headlines into a single string for each record
# Optimized by directly using pandas functionality
data['All_Headlines'] = data.iloc[:, 2:].fillna('').apply(lambda x: ' '.join(x), axis=1)

# Initialize a CountVectorizer to convert the headlines to a matrix of token counts
# Consider limiting max_features and experimenting with ngram_range for better performance
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(data['All_Headlines'])

# Prepare the output matrix
Y = LabelEncoder().fit_transform(data['Label'])

# Verify the shapes of the matrices and the first few rows to ensure the preprocessing is as expected.
print("Shape of X:", X.shape)
print("Shape of Y:", Y.shape)
print(data[['Date', 'All_Headlines', 'Label']].head())

Shape of X: (1989, 5000)
Shape of Y: (1989,)
         Date                                      All_Headlines  Label
0  2008-08-08  b"Georgia 'downs two Russian warplanes' as cou...      0
1  2008-08-11  b'Why wont America and Nato help us? If they w...      1
2  2008-08-12  b'Remember that adorable 9-year-old who sang a...      0
3  2008-08-13  b' U.S. refuses Israel weapons to attack Iran:...      0
4  2008-08-14  b'All the experts admit that we should legalis...      1


In [3]:
# Splitting the dataset into training and testing sets
# Typically, you might want to use 80% of the data for training and 20% for testing, but these proportions can be adjusted.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Standardizing the features: since X is a sparse matrix returned by CountVectorizer, 
# we use MaxAbsScaler which is more appropriate for sparse data. StandardScaler is generally not used for sparse data
# because it can break the sparsity structure.

scaler = MaxAbsScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Verify the standardization and splitting by printing shapes
print("X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)

X_train_scaled shape: (1591, 5000)
X_test_scaled shape: (398, 5000)
Y_train shape: (1591,)
Y_test shape: (398,)


# Possible Baseline Model
Below is an example extension adding a simple logistic regression model as a baseline.

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Fit the model on the scaled training data
lr_model.fit(X_train_scaled, Y_train)

# Predict on the scaled testing data
Y_pred = lr_model.predict(X_test_scaled)

# Evaluation
print("Accuracy on Test Set:", accuracy_score(Y_test, Y_pred))
print("\nClassification Report:\n", classification_report(Y_test, Y_pred))

Accuracy on Test Set: 0.46733668341708545

Classification Report:
               precision    recall  f1-score   support

           0       0.39      0.43      0.41       171
           1       0.54      0.50      0.52       227

    accuracy                           0.47       398
   macro avg       0.46      0.46      0.46       398
weighted avg       0.47      0.47      0.47       398

