In [1]:
import pandas as pd


In [2]:
file_path = "C:\\Users\\levi_\\Desktop\\combined_headlines_new.csv"
data = pd.read_csv(file_path)
print(data.head())

         date                                          headlines
0  2020-07-17  Jim Cramer : A better way to invest in the Cov...
1  2020-07-17    Cramer's lightning round : I would own Teradyne
2  2020-07-17  Cramer's week ahead : Big week for earnings , ...
3  2020-07-17  IQ Capital CEO Keith Bliss says tech and healt...
4  2020-07-16  Wall Street delivered the 'kind of pullback I'...


In [3]:
def clean_text(text):
  text = text.lower()
  text = text.replace('\n', ' ')
  text = text.replace('[^a-zA-Z]', ' ')
  return text

In [4]:
data['cleaned_headlines'] = data['headlines'].apply(clean_text)
print(data.head())

         date                                          headlines  \
0  2020-07-17  Jim Cramer : A better way to invest in the Cov...   
1  2020-07-17    Cramer's lightning round : I would own Teradyne   
2  2020-07-17  Cramer's week ahead : Big week for earnings , ...   
3  2020-07-17  IQ Capital CEO Keith Bliss says tech and healt...   
4  2020-07-16  Wall Street delivered the 'kind of pullback I'...   

                                   cleaned_headlines  
0  jim cramer : a better way to invest in the cov...  
1    cramer's lightning round : i would own teradyne  
2  cramer's week ahead : big week for earnings , ...  
3  iq capital ceo keith bliss says tech and healt...  
4  wall street delivered the 'kind of pullback i'...  


In [5]:
import yfinance as yf

In [6]:
spy_data = yf.download('SPY', start= '2020-01-01', end='2023-01-01')
spy_data.reset_index(inplace=True)

[*********************100%%**********************]  1 of 1 completed


In [7]:
spy_data = spy_data[['Date', 'Adj Close']]
spy_data.rename(columns= {'Adj Close': 'spy_price'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  spy_data.rename(columns= {'Adj Close': 'spy_price'}, inplace=True)


In [8]:
data['Date'] = pd.to_datetime(data['date'])
merged_data = pd.merge(data, spy_data, how='left', left_on= 'Date', right_on='Date')

In [9]:
print(merged_data.head())

         date                                          headlines  \
0  2020-07-17  Jim Cramer : A better way to invest in the Cov...   
1  2020-07-17    Cramer's lightning round : I would own Teradyne   
2  2020-07-17  Cramer's week ahead : Big week for earnings , ...   
3  2020-07-17  IQ Capital CEO Keith Bliss says tech and healt...   
4  2020-07-16  Wall Street delivered the 'kind of pullback I'...   

                                   cleaned_headlines       Date   spy_price  
0  jim cramer : a better way to invest in the cov... 2020-07-17  303.290344  
1    cramer's lightning round : i would own teradyne 2020-07-17  303.290344  
2  cramer's week ahead : big week for earnings , ... 2020-07-17  303.290344  
3  iq capital ceo keith bliss says tech and healt... 2020-07-17  303.290344  
4  wall street delivered the 'kind of pullback i'... 2020-07-16  302.413605  


In [10]:
merged_data['price_change'] = merged_data['spy_price'].pct_change()

def classify_change(change):
    if change > 0:
        return 1
    elif change < 0:
        return -1
    else:
        return 0

merged_data['target'] = (merged_data['price_change'] > 0).astype(int)
merged_data.dropna(subset=['price_change'], inplace=True)

  merged_data['price_change'] = merged_data['spy_price'].pct_change()


In [11]:
print(merged_data[['Date', 'spy_price', 'price_change', 'target']].head())

        Date   spy_price  price_change  target
1 2020-07-17  303.290344      0.000000       0
2 2020-07-17  303.290344      0.000000       0
3 2020-07-17  303.290344      0.000000       0
4 2020-07-16  302.413605     -0.002891       0
5 2020-07-16  302.413605      0.000000       0


In [12]:
#Finalize the Dataset

final_data = merged_data[['cleaned_headlines', 'target']]

In [13]:
print(final_data['target'].value_counts())

target
0    53141
1      188
Name: count, dtype: int64


In [14]:
print(final_data.head())

                                   cleaned_headlines  target
1    cramer's lightning round : i would own teradyne       0
2  cramer's week ahead : big week for earnings , ...       0
3  iq capital ceo keith bliss says tech and healt...       0
4  wall street delivered the 'kind of pullback i'...       0
5  cramer's lightning round : i would just stay l...       0


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
vectorizer = TfidfVectorizer(max_features=5000) 

In [17]:
tfidf_matrix = vectorizer.fit_transform(merged_data['cleaned_headlines'])
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out())
final_data = pd.concat([merged_data.reset_index(drop=True), tfidf_df], axis=1)


In [18]:
print(final_data.head())

         date                                          headlines  \
0  2020-07-17    Cramer's lightning round : I would own Teradyne   
1  2020-07-17  Cramer's week ahead : Big week for earnings , ...   
2  2020-07-17  IQ Capital CEO Keith Bliss says tech and healt...   
3  2020-07-16  Wall Street delivered the 'kind of pullback I'...   
4  2020-07-16  Cramer's lightning round : I would just stay l...   

                                   cleaned_headlines       Date   spy_price  \
0    cramer's lightning round : i would own teradyne 2020-07-17  303.290344   
1  cramer's week ahead : big week for earnings , ... 2020-07-17  303.290344   
2  iq capital ceo keith bliss says tech and healt... 2020-07-17  303.290344   
3  wall street delivered the 'kind of pullback i'... 2020-07-16  302.413605   
4  cramer's lightning round : i would just stay l... 2020-07-16  302.413605   

   price_change  target  000   10  100  ...  your  youtube  yuan  yum  \
0      0.000000       0  0.0  0.0  0.0  ...

In [19]:
from sklearn.model_selection import train_test_split

X = final_data[vectorizer.get_feature_names_out()] 
y = final_data['target']  

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Test set size:", X_test.shape)


Training set size: (42663, 5002)
Test set size: (10666, 5002)


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=['Decrease (-1)', 'No Change (0)', 'Increase (1)']))


ValueError: could not convert string to float: '2019-01-07'