In [1]:
!pip install scikit-learn
!pip install xgboost



In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score,classification_report
import xgboost as xgb


In [4]:
# Replace the file path with the correct path to your CSV file
file_path = "../data/final_dataset_with_stock_price.csv"

# Read the CSV file into a pandas DataFrame
data = pd.read_csv(file_path)

In [5]:
data.head()

Unnamed: 0,date,headline,Open,Close,Difference (%)
0,3/20/2018,Jim Cramer : A better way to invest in the Cov...,244.465456,244.474487,0.003694
1,3/20/2018,Cramer's lightning round : I would own Teradyne,244.465456,244.474487,0.003694
2,3/20/2018,"Cramer's week ahead : Big week for earnings , ...",244.465456,244.474487,0.003694
3,3/20/2018,IQ Capital CEO Keith Bliss says tech and healt...,244.465456,244.474487,0.003694
4,3/20/2018,Wall Street delivered the 'kind of pullback I'...,244.465456,244.474487,0.003694


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus=[]
# Get the number of rows in the dataset
num_rows = len(data)
for i in range(0,num_rows):
    headline=re.sub('[^a-zA-Z]',' ', data['headline'][i]) #every punctuation by everything we want (we will replace everything that is not letter to space)
    headline=headline.lower()
    headline=headline.split()
    ps=PorterStemmer()
    all_stopwords=stopwords.words('english')
    all_stopwords.remove('not')
    headline=[ps.stem((word)) for word in headline if not word in set(all_stopwords)]
    headline=' '.join(headline)
    corpus.append(headline)
    
    
# Load a pre-trained sentiment analysis model
sentiment_analyzer = pipeline("sentiment-analysis")

# Define batch size
batch_size = 10000
num_batches = len(corpus) // batch_size + (1 if len(corpus) % batch_size != 0 else 0)

# Process each batch and save the results incrementally
output_file_path = "../data/final_dataset_with_cleaned_and_sentiment.xlsx"
for batch_num in range(num_batches):
    start_idx = batch_num * batch_size
    end_idx = min(start_idx + batch_size, len(corpus))
    batch_corpus = corpus[start_idx:end_idx]

    sentiment_labels = []
    for headline in batch_corpus:
        result = sentiment_analyzer(headline)[0]
        sentiment_labels.append(0 if result['label'] == 'NEGATIVE' else 1)

    # Update the DataFrame with the sentiment labels for the current batch
    data.loc[start_idx:end_idx-1, 'sentiment'] = sentiment_labels

    # Save the DataFrame to an Excel file after each batch
    data.to_excel(output_file_path, index=False)

    print(f"Processed batch {batch_num+1} of {num_batches} and saved to {output_file_path}")

print("Batch processing complete. Final results saved.")

In [6]:
# Replace the file path with the correct path to your CSV file
file_path = "../data/final_dataset_with_cleaned_and_sentiment.xlsx"

# Read the CSV file into a pandas DataFrame
data = pd.read_excel(file_path)

In [7]:
data.head()

Unnamed: 0,date,headline,Open,Close,Difference (%),cleaned_headline,sentiment
0,3/20/2018,Jim Cramer : A better way to invest in the Cov...,244.465456,244.474487,0.003694,jim cramer better way invest covid vaccin gold...,0.0
1,3/20/2018,Cramer's lightning round : I would own Teradyne,244.465456,244.474487,0.003694,cramer lightn round would teradyn,0.0
2,3/20/2018,"Cramer's week ahead : Big week for earnings , ...",244.465456,244.474487,0.003694,cramer week ahead big week earn even bigger we...,1.0
3,3/20/2018,IQ Capital CEO Keith Bliss says tech and healt...,244.465456,244.474487,0.003694,iq capit ceo keith bliss say tech healthcar ralli,0.0
4,3/20/2018,Wall Street delivered the 'kind of pullback I'...,244.465456,244.474487,0.003694,wall street deliv kind pullback wait jim crame...,0.0


In [8]:
clean_data=data[['cleaned_headline','Open','Close']].copy()
print(clean_data)

                                          cleaned_headline        Open  \
0        jim cramer better way invest covid vaccin gold...  244.465456   
1                        cramer lightn round would teradyn  244.465456   
2        cramer week ahead big week earn even bigger we...  244.465456   
3        iq capit ceo keith bliss say tech healthcar ralli  244.465456   
4        wall street deliv kind pullback wait jim crame...  244.465456   
...                                                    ...         ...   
1048570           ba suspend gatwick flight due coronaviru  240.161607   
1048571  councillor oven readi scheme forc privat schoo...  240.161622   
1048572  councillor oven readi scheme forc privat schoo...  240.161607   
1048573  councillor oven readi scheme forc privat schoo...  240.161638   
1048574  councillor oven readi scheme forc privat schoo...  240.161591   

              Close  
0        244.474487  
1        244.474487  
2        244.474487  
3        244.474487  
4

In [9]:
# Calculate stock movement
clean_data['Movement'] = (clean_data['Close'] > clean_data['Open']).astype(int)  # 1 if price increased, 0 if decreased


In [10]:
print(clean_data)

                                          cleaned_headline        Open  \
0        jim cramer better way invest covid vaccin gold...  244.465456   
1                        cramer lightn round would teradyn  244.465456   
2        cramer week ahead big week earn even bigger we...  244.465456   
3        iq capit ceo keith bliss say tech healthcar ralli  244.465456   
4        wall street deliv kind pullback wait jim crame...  244.465456   
...                                                    ...         ...   
1048570           ba suspend gatwick flight due coronaviru  240.161607   
1048571  councillor oven readi scheme forc privat schoo...  240.161622   
1048572  councillor oven readi scheme forc privat schoo...  240.161607   
1048573  councillor oven readi scheme forc privat schoo...  240.161638   
1048574  councillor oven readi scheme forc privat schoo...  240.161591   

              Close  Movement  
0        244.474487         1  
1        244.474487         1  
2        244.47

In [11]:
# create object
vectorizer = TfidfVectorizer(max_features=1000)

 
# get tf-df values
X_text = vectorizer .fit_transform(clean_data['cleaned_headline'])

# Convert the matrix to a dense array and print
print(X_text.toarray())

# Get feature names
print(vectorizer.get_feature_names_out())


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['account' 'accus' 'act' 'action' 'activ' 'activist' 'ad' 'add'
 'administr' 'advis' 'affect' 'age' 'agenc' 'agenda' 'agre' 'agreement'
 'ahead' 'aid' 'aim' 'air' 'airbu' 'aircraft' 'airlin' 'airport' 'airway'
 'alibaba' 'alleg' 'allianc' 'allow' 'almost' 'alphabet' 'amazon'
 'america' 'american' 'amid' 'among' 'analyst' 'announc' 'annual' 'anoth'
 'anti' 'antitrust' 'app' 'appeal' 'appl' 'appoint' 'approv' 'april'
 'arabia' 'aramco' 'around' 'arrest' 'asda' 'asia' 'ask' 'asset' 'attack'
 'australia' 'australian' 'auto' 'automak' 'aviat' 'avoid' 'away' 'back'
 'bad' 'bailout' 'ban' 'bank' 'banker' 'bankruptci' 'barclay' 'base'
 'batteri' 'battl' 'bayer' 'bear' 'beat' 'becom' 'begin' 'behind'
 'benefit' 'best' 'bet' 'better' 'beyond' 'bid' 'big' 'biggest' 'bill'
 'billion' 'billionair' 'bitcoin' 'bite' 'black' 'blackrock' 'blame'
 'b

In [12]:
X = np.hstack((X_text.toarray(), clean_data['Open'].values.reshape(-1, 1)))
y=clean_data['Movement']
print(X)
print(y)

[[  0.          0.          0.        ...   0.          0.
  244.4654556]
 [  0.          0.          0.        ...   0.          0.
  244.4654556]
 [  0.          0.          0.        ...   0.          0.
  244.4654556]
 ...
 [  0.          0.          0.        ...   0.          0.
  240.1616066]
 [  0.          0.          0.        ...   0.          0.
  240.1616378]
 [  0.          0.          0.        ...   0.          0.
  240.161591 ]]
0          1
1          1
2          1
3          1
4          1
          ..
1048570    0
1048571    0
1048572    0
1048573    0
1048574    0
Name: Movement, Length: 1048575, dtype: int32


In [13]:
X_clean = X[~np.isnan(X).any(axis=1)]
y_clean = y[~np.isnan(X).any(axis=1)]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)


In [15]:
# Initialize and train the model
model = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=0.1)
model.fit(X_train, y_train)

In [16]:
# Predict on the training set
y_train_pred = model.predict(X_train)

# Display the first few predictions and corresponding actual values
print("First few predictions on training set:")
print(pd.DataFrame({'Actual': y_train, 'Predicted': y_train_pred}).head())

# Evaluate the performance on training set
print("Accuracy on training set:")
print(accuracy_score(y_train, y_train_pred))

print("Classification report on training set:")
print(classification_report(y_train, y_train_pred))

First few predictions on training set:
        Actual  Predicted
521592       0          0
347384       0          0
524411       0          0
149966       0          0
802906       1          1
Accuracy on training set:
1.0
Classification report on training set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    412359
           1       1.00      1.00      1.00    341173

    accuracy                           1.00    753532
   macro avg       1.00      1.00      1.00    753532
weighted avg       1.00      1.00      1.00    753532

