In [33]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Preprocessing the dataset

This part of the code is essentially preparing the data for a machine learning model, transforming the text data into numerical form, and splitting the data into training and testing sets.

The combined dataset is loaded from a CSV file using pandas' read_csv function and all news headlines for each record (each day) are concatenated into a single string. A CountVectorizer is initialized to convert the headlines into a matrix of token counts. The maximum number of features is set to 5000, but this can be adjusted based on computational capacity. A LabelEncoder is used to prepare the output matrix (Y) by transforming the labels into normalized encoding.

The dataset is split into training and testing sets based on specific date ranges. The variables X_train, X_test, Y_train, and Y_test are defined in a later cell, which split the X and Y matrices into training and testing sets based on the indices of the original train and test dataframes.

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

# Load the dataset
data = pd.read_csv('Datasets/Combined_News_DJIA.csv')

data.head()

# Concatenate all news headlines into a single string for each record more efficiently
data['All_Headlines'] = data.iloc[:, 2:].fillna('').agg(' '.join, axis=1)

# Convert the 'All_Headlines' column into a series of tokens
tokens = vectorizer.transform(data['All_Headlines'])

# Initialize a CountVectorizer with efficient memory usage
vectorizer = CountVectorizer(max_features=5000, dtype='uint8')  # Using uint8 for memory efficiency
X = vectorizer.fit_transform(data['All_Headlines'])

# Display the shape of X
print("Shape of X:", X.shape)

# Prepare the output matrix with LabelEncoder
Y = LabelEncoder().fit_transform(data['Label'])

# Display the first few processed records to check everything went as expected
print(data[['Date', 'All_Headlines', 'Label']].head())

# Example headline
headline = ["This is an example headline"]

# Transform the headline into a matrix of token counts
headline_vector = vectorizer.transform(headline)

# Convert the sparse matrix to a dense matrix and print it
print(headline_vector.toarray())

Shape of X: (1989, 5000)
         Date                                      All_Headlines  Label
0  2008-08-08  b"Georgia 'downs two Russian warplanes' as cou...      0
1  2008-08-11  b'Why wont America and Nato help us? If they w...      1
2  2008-08-12  b'Remember that adorable 9-year-old who sang a...      0
3  2008-08-13  b' U.S. refuses Israel weapons to attack Iran:...      0
4  2008-08-14  b'All the experts admit that we should legalis...      1
[[0 0 0 ... 0 0 0]]


In [35]:
# Split the dataset into training and testing sets based on the provided date ranges
train = data[(data['Date'] >= '2008-08-08') & (data['Date'] <= '2014-12-31')]
test = data[(data['Date'] >= '2015-01-02') & (data['Date'] <= '2016-07-01')]

# Print the shape of train and test data
print("Shape of train data:", train.shape)
print("Shape of test data:", test.shape)

X_train, X_test = X[data.index.isin(train.index)], X[data.index.isin(test.index)]
Y_train, Y_test = Y[data.index.isin(train.index)], Y[data.index.isin(test.index)]

Shape of train data: (1611, 28)
Shape of test data: (378, 28)


In [36]:
# Calculate the mean along the 0th axis
mean = np.mean(X_train, axis=0)

# Calculate the standard deviation
std_dev = np.std(X_train, axis=0)

# Standardize the training, validation, and testing sets
X_train = (X_train - mean) / std_dev
# X_valid = (X_valid - mean) / std_dev  # Uncomment this line if you have a validation set
X_test = (X_test - mean) / std_dev


AxisError: axis 0 is out of bounds for array of dimension 0