In [None]:
import pandas as pd
import numpy as np
import os
import json

from tqdm import tqdm

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive so the Drive-based project paths used
# below resolve. This relies on the google.colab package being available.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Name of the sub-folder (under the datasets directory) that holds consolidated CSVs.
aggregated_folder = 'AGGREGATED_DATA'

# Project root on the mounted Drive; every dataset path below is derived from it.
PATH_PROJECT = '/content/drive/MyDrive/UCSD/02_Courses_UCSD/04_Fall_Term_2023/01_Advanced_Data_Mining/DSC_250_PROJECT'

# <project>/datasets and <project>/datasets/AGGREGATED_DATA respectively.
PATH_DATA = os.path.join(PATH_PROJECT, 'datasets')
PATH_AGG_DATA = os.path.join(PATH_DATA, aggregated_folder)

## Consolidating Financial Data

In [None]:
# Directory with the raw price CSVs; the loader below reads every .csv in it
# and takes the ticker from each file name.
PATH_PRICE_DATA = os.path.join(PATH_DATA, 'price/raw')

In [None]:
# Keep only files whose name ends in '.csv'. A suffix test avoids picking up
# names that merely contain '.csv' (e.g. 'AAPL.csv.bak'). os.listdir returns
# entries in arbitrary order, so the list is sorted to make the load order --
# and the row order of the consolidated file -- reproducible.
stock_data_files = sorted(
    file for file in os.listdir(PATH_PRICE_DATA) if file.endswith('.csv')
)

In [None]:
# Load every price file, tag its rows with the ticker taken from the file name,
# and stack everything into one long frame.
# The frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all accumulated rows on every iteration.
price_frames = []
for file_name in tqdm(stock_data_files, position=0):
    # Ticker = part of the file name before the first dot ('AAPL.csv' -> 'AAPL').
    ticker = file_name.split('.')[0]
    file_data = pd.read_csv(os.path.join(PATH_PRICE_DATA, file_name))
    file_data['ticker'] = ticker
    price_frames.append(file_data)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# price files were found (this matches the result of the old loop in that case).
stock_data_all = pd.concat(price_frames, axis=0) if price_frames else pd.DataFrame()

In [None]:
# Normalise the header by lower-casing every column name.
stock_data_all.columns = list(map(str.lower, stock_data_all.columns))

In [None]:
# Persist the consolidated price table (without the row index) in the aggregated folder.
financial_data_out = os.path.join(PATH_AGG_DATA, 'financial_data.csv')
stock_data_all.to_csv(financial_data_out, index=False)

In [None]:
# Reload the consolidated price table from the aggregated folder.
financial_data_in = os.path.join(PATH_AGG_DATA, 'financial_data.csv')
stock_data_all = pd.read_csv(financial_data_in)

## Adding Tweet Vectors

In [None]:
# Load the tweet feature table from the aggregated folder.
tweet_embeddings_csv = os.path.join(PATH_AGG_DATA, 'tweet_embeddings_all_agg.csv')
tweets = pd.read_csv(tweet_embeddings_csv)

In [None]:
tweets.head()

Unnamed: 0,ticker,date,tp_0,tp_1,tp_2,tp_3,tp_4,tp_5,tp_6,tp_7,...,t2ve_503,t2ve_504,t2ve_505,t2ve_506,t2ve_507,t2ve_508,t2ve_509,t2ve_510,t2ve_511,t2ve_512
0,AAPL,2013-12-31,1,0,0,0,0,0,0,0,...,0.024533,0.045217,0.082425,-0.009038,0.024683,0.01423,-0.005057,0.036838,-0.03425,0.042868
1,AAPL,2014-01-01,8,0,0,0,0,0,0,0,...,0.021426,0.034109,0.057758,-0.006272,0.038048,0.028415,-0.003747,0.028702,-0.046322,0.019313
2,AAPL,2014-01-02,31,0,0,0,0,0,0,0,...,-0.007182,0.042413,0.04577,0.003726,0.038268,0.026364,-0.009559,0.031142,-0.054542,-0.013422
3,AAPL,2014-01-03,21,0,0,0,0,0,0,0,...,0.006636,0.038112,0.044563,0.007831,0.034383,0.044858,-0.010264,0.029918,-0.045999,0.008392
4,AAPL,2014-01-04,5,0,0,0,0,0,0,0,...,-0.002843,0.01698,0.058355,-0.014244,0.015553,0.025035,-0.018254,0.04457,-0.054826,0.017356


### Merging Financial and Tweets Vectors

In [None]:
# Coverage check: list the tickers that have price data but never appear in the tweet table.
fin_tickers = stock_data_all['ticker'].drop_duplicates().to_list()
twe_tickers = tweets['ticker'].drop_duplicates().to_list()
list(filter(lambda fin_ticker: fin_ticker not in twe_tickers, fin_tickers))

['GMRE']

['GMRE']

In [None]:
# Left join on (date, ticker): every price row is kept, and rows without a
# matching tweet row get NaN in the tweet columns.
stock_data_merged = pd.merge(
    stock_data_all,
    tweets,
    how='left',
    on=['date', 'ticker'],
)

In [None]:
# Persist the merged price + tweet table (without the row index) in the aggregated folder.
financial_tweets_out = os.path.join(PATH_AGG_DATA, 'financial_tweets_data.csv')
stock_data_merged.to_csv(financial_tweets_out, index=False)

In [None]:
# Reload the merged price + tweet table from the aggregated folder.
financial_tweets_in = os.path.join(PATH_AGG_DATA, 'financial_tweets_data.csv')
stock_data_merged = pd.read_csv(financial_tweets_in)

In [None]:
# Fraction of missing values per column of the merged table.
stock_data_merged.isnull().mean()

date        0.000000
open        0.000028
high        0.000028
low         0.000028
close       0.000028
              ...   
t2ve_508    0.798024
t2ve_509    0.798024
t2ve_510    0.798024
t2ve_511    0.798024
t2ve_512    0.798024
Length: 575, dtype: float64

## Adding News Vectors

In [None]:
# Display the full path of the merged price + tweet CSV.
os.path.join(PATH_AGG_DATA, 'financial_tweets_data.csv')

'/content/drive/MyDrive/UCSD/02_Courses_UCSD/04_Fall_Term_2023/01_Advanced_Data_Mining/DSC_250_PROJECT/datasets/AGGREGATED_DATA/financial_tweets_data.csv'

In [None]:
ls /content/drive/MyDrive/DSC250_project/DSC_250_PROJECT/datasets/AGGREGATED_DATA/

financial_data.csv         news.csv     news_upd.csv     tweet_embeddings_all_agg.csv
financial_tweets_data.csv  news.gsheet  news_upd.gsheet


In [None]:
# Absolute path of the merged price + tweet CSV.
# NOTE(review): this hard-coded folder is not the same Drive location as
# PATH_AGG_DATA defined earlier -- confirm which layout is intended.
fintweet = os.path.join(
    '/content/drive/MyDrive/DSC250_project/DSC_250_PROJECT/datasets/AGGREGATED_DATA/',
    'financial_tweets_data.csv',
)

In [None]:
# Load the merged price + tweet table from the path built above.
fintweetdf = pd.read_csv(fintweet)

In [None]:
# fintweetdf[fintweetdf['t2ve_503'].notna()]

In [None]:
# Absolute path of the news table (news_upd.csv) in the aggregated folder.
newspath = os.path.join(
    '/content/drive/MyDrive/DSC250_project/DSC_250_PROJECT/datasets/AGGREGATED_DATA/',
    'news_upd.csv',
)

In [None]:
# Read the news table: latin-1 decoding, the column at position 1 parsed as
# dates, and '\n' as the only record terminator.
newsdf = pd.read_csv(
    newspath,
    encoding='latin-1',
    parse_dates=[1],
    lineterminator='\n',
)

In [None]:
# Force 'Date' to datetime; values that cannot be parsed become NaT instead of raising.
coerced_news_dates = pd.to_datetime(newsdf['Date'], errors='coerce')
newsdf['Date'] = coerced_news_dates

In [None]:
newsdf

Unnamed: 0,Date,Title,Body,ticker,tokenized_words,cleaned_text,Title sentiment_label,Title sentiment_score,newsVector
0,2011-07-07,SPECIAL REPORT-Malaysia's dilemma: Can it refo...,"putrajaya, malaysia, july 7 dr. mahathir moha...",T,"['putrajaya', ',', 'malaysia', ',', 'july', '7...","['putrajaya', 'malaysia', 'july', '7', 'mahath...",NEGATIVE,0.993214,[-4.31280017e-01 -4.49921563e-02 3.54213178e-...
1,2011-07-08,Special Report: Can Malaysia reform and discri...,"putrajaya, malaysia dr. mahathir mohamad sits ...",T,"['putrajaya', ',', 'malaysia', 'dr.', 'mahathi...","['putrajaya', 'malaysia', 'mahathir', 'mohamad...",NEGATIVE,0.987828,[-0.19804451 0.0515668 0.3471907 0.451883...
2,2011-07-13,Danielle Chiesi settles SEC insider trading ca...,"new york danielle chiesi, a former trader who ...",T,"['new', 'york', 'danielle', 'chiesi', ',', 'a'...","['new', 'york', 'danielle', 'chiesi', 'former'...",POSITIVE,0.923182,[-0.46932358 1.1376562 -0.36117563 1.298507...
3,2011-07-18,"UPDATE 5-IBM's services signings surge, stock ...","* sets positive tone for tech earns, corporat...",T,"['*', 'sets', 'positive', 'tone', 'for', 'tech...","['sets', 'positive', 'tone', 'tech', 'earns', ...",NEGATIVE,0.997758,[ 0.26701182 -0.8121374 -0.35769838 0.968121...
4,2011-07-19,"Wall Street up on profits, Apple surges late,U...",new york stocks recorded their best day since ...,T,"['new', 'york', 'stocks', 'recorded', 'their',...","['new', 'york', 'stocks', 'recorded', 'best', ...",NEGATIVE,0.980940,[ 1.6810181 -0.8761942 -1.6215013 1.167018...
...,...,...,...,...,...,...,...,...,...
17675,2017-07-19,Activist Jana cashes out of Whole Foods follow...,activist investor jana partners llc cashed out...,AMZN,"['activist', 'investor', 'jana', 'partners', '...","['activist', 'investor', 'jana', 'partners', '...",NEGATIVE,0.976309,[ 2.8787196 0.47986117 -0.47300547 -1.054337...
17676,2017-07-14,U.S. lawmaker calls for hearing on Amazon's Wh...,washington/new york the top democrat on the u....,AMZN,"['washington/new', 'york', 'the', 'top', 'demo...","['york', 'top', 'democrat', 'house', 'represen...",NEGATIVE,0.991219,[ 0.26377246 -1.0956086 -1.2374927 -1.706804...
17677,2017-06-24,"Even with Whole Foods, Amazon would need many ...",if amazon.com inc hopes to revolutionize groc...,AMZN,"['if', 'amazon.com', 'inc', 'hopes', 'to', 're...","['inc', 'hopes', 'revolutionize', 'grocery', '...",NEGATIVE,0.999077,[ 1.1441845e+00 4.1975534e-01 -1.4262140e-01 ...
17678,2017-06-29,UPDATE 2-Nike to launch pilot program with Ama...,* shares up 8.2 pct at $57.54 in after-market ...,AMZN,"['*', 'shares', 'up', '8.2', 'pct', 'at', '$',...","['shares', 'pct', 'trade', 'adds', 'details', ...",NEGATIVE,0.998893,[-1.46899009e+00 -1.12892054e-01 2.65835994e-...


In [None]:
# np.unique(newsdf['Title sentiment_label'])

In [None]:
# One-hot encode the title sentiment label into
# 'Title sentiment_label_<LABEL>' indicator columns.
sentiment_label_col = 'Title sentiment_label'
df_encoded = pd.get_dummies(data=newsdf, columns=[sentiment_label_col])

In [None]:
# df_encoded['Title sentiment_score'].apply(lambda x: -x if df_encoded['Title sentiment_label_NEGATIVE'] == 1 else x)

In [None]:
# Sign the sentiment score: rows labelled NEGATIVE get a negative score, all
# other rows are left untouched.
# NOTE(review): the raw score appears to be an unsigned confidence -- confirm.
# Vectorised instead of a row-wise apply, and idempotent: the previous version
# negated the score on every execution, so re-running the cell flipped the
# negative rows back to positive.
is_negative_title = df_encoded['Title sentiment_label_NEGATIVE'] == 1
df_encoded['Title sentiment_score'] = df_encoded['Title sentiment_score'].where(
    ~is_negative_title, -df_encoded['Title sentiment_score'].abs()
)

In [None]:
# newsdf_agg = newsdf.groupby('Date', 'ticker').agg({'Title sentiment_label_NEGATIVE':'sum', 'Title sentiment_label_POSITIVE': 'sum','Title sentiment_score':mean,'newsVector':'mean'})

In [None]:
# Discard news rows that have no embedding vector.
df_encoded = df_encoded.dropna(subset=['newsVector'])

In [None]:
# df_encoded = pd.concat([df_encoded, df_encoded['newsVector'].apply(pd.Series).add_prefix('newsVector_')], axis=1)
# df_encoded = df_encoded.drop(columns=['newsVector'])

In [None]:
# df_encoded.explode('newsVector')

In [None]:
df_encoded['newsVector']

0        [-4.31280017e-01 -4.49921563e-02  3.54213178e-...
1        [-0.19804451  0.0515668   0.3471907   0.451883...
2        [-0.46932358  1.1376562  -0.36117563  1.298507...
3        [ 0.26701182 -0.8121374  -0.35769838  0.968121...
4        [ 1.6810181  -0.8761942  -1.6215013   1.167018...
                               ...                        
17675    [ 2.8787196   0.47986117 -0.47300547 -1.054337...
17676    [ 0.26377246 -1.0956086  -1.2374927  -1.706804...
17677    [ 1.1441845e+00  4.1975534e-01 -1.4262140e-01 ...
17678    [-1.46899009e+00 -1.12892054e-01  2.65835994e-...
17679    [ 0.56067514  0.70193255  0.20231985  0.174875...
Name: newsVector, Length: 17680, dtype: object

In [None]:
# Rewrite the whitespace-separated array text ('[ 1.2 -3.4\n  5.6]') as a
# Python list literal ('[1.2,-3.4,5.6]') that ast.literal_eval can parse.
# regex=True is passed explicitly: these are regular-expression patterns, and
# since pandas 2.0 Series.str.replace treats the pattern as a literal string
# by default, in which case none of them would match.
df_encoded['newsVector'] = (
    df_encoded['newsVector']
    .str.strip()
    # Drop any whitespace right after '[' and right before ']' so no empty
    # element ('[,' or ',]') is produced, however many spaces there are.
    .str.replace(r'\[\s+', '[', regex=True)
    .str.replace(r'\s+\]', ']', regex=True)
    # Every remaining whitespace run (spaces or newlines) separates two numbers.
    .str.replace(r'\s+', ',', regex=True)
)
df_encoded['newsVector']

  df_encoded['newsVector'] = df_encoded['newsVector'].str.replace(r'\[ ', '[').str.replace('\n', ' ').str.replace(r'\s+', ',')


0        [-4.31280017e-01,-4.49921563e-02,3.54213178e-0...
1        [-0.19804451,0.0515668,0.3471907,0.45188385,1....
2        [-0.46932358,1.1376562,-0.36117563,1.2985078,1...
3        [0.26701182,-0.8121374,-0.35769838,0.9681215,1...
4        [1.6810181,-0.8761942,-1.6215013,1.1670184,1.0...
                               ...                        
17675    [2.8787196,0.47986117,-0.47300547,-1.054337,0....
17676    [0.26377246,-1.0956086,-1.2374927,-1.7068045,1...
17677    [1.1441845e+00,4.1975534e-01,-1.4262140e-01,-1...
17678    [-1.46899009e+00,-1.12892054e-01,2.65835994e-0...
17679    [0.56067514,0.70193255,0.20231985,0.17487547,1...
Name: newsVector, Length: 17680, dtype: object

In [None]:
import ast
# Preview only: parse each list-literal string into a Python list. The result
# is displayed, not stored; the column is actually converted in a later cell.
df_encoded['newsVector'].apply(ast.literal_eval)

0        [-0.431280017, -0.0449921563, 0.354213178, 0.6...
1        [-0.19804451, 0.0515668, 0.3471907, 0.45188385...
2        [-0.46932358, 1.1376562, -0.36117563, 1.298507...
3        [0.26701182, -0.8121374, -0.35769838, 0.968121...
4        [1.6810181, -0.8761942, -1.6215013, 1.1670184,...
                               ...                        
17675    [2.8787196, 0.47986117, -0.47300547, -1.054337...
17676    [0.26377246, -1.0956086, -1.2374927, -1.706804...
17677    [1.1441845, 0.41975534, -0.1426214, -0.1963934...
17678    [-1.46899009, -0.112892054, 0.00265835994, -1....
17679    [0.56067514, 0.70193255, 0.20231985, 0.1748754...
Name: newsVector, Length: 17680, dtype: object

In [None]:

# df_encoded['newsVector'] = df_encoded['newsVector'].apply(ast.literal_eval)

# # Create separate columns for each element in the 'newsVector' arrays
# embedding = pd.DataFrame(df_encoded['newsVector'].to_list(), columns=[f'news2ve_{i+1:03d}' for i in range(128)])

In [None]:
# Parse the list-literal strings into Python lists of floats.
df_encoded['newsVector'] = df_encoded['newsVector'].apply(ast.literal_eval)

# Create separate columns for each element in the 'newsVector' arrays.
# The frame is built on df_encoded's own index: df_encoded was filtered earlier
# without resetting its index, and pd.concat(axis=1) aligns on index labels, so
# a fresh 0..n-1 index would attach vectors to the wrong rows (and add all-NaN
# rows) whenever the filter removed anything.
embedding = pd.DataFrame(
    df_encoded['newsVector'].to_list(),
    index=df_encoded.index,
    columns=[f'news2ve_{i+1:03d}' for i in range(128)],
)

# Concatenate the DataFrames along columns
df_encoded = pd.concat([df_encoded, embedding], axis=1)

In [None]:
# The raw vector column is redundant once it has been expanded into news2ve_* columns.
df_encoded = df_encoded.drop(columns='newsVector')

In [None]:
# Remove the free-text columns, which cannot be aggregated numerically, then preview.
text_columns = ['Body', 'Title', 'tokenized_words', 'cleaned_text']
df_encoded = df_encoded.drop(columns=text_columns)
df_encoded.head(3)

Unnamed: 0,Date,ticker,Title sentiment_score,Title sentiment_label_NEGATIVE,Title sentiment_label_POSITIVE,news2ve_001,news2ve_002,news2ve_003,news2ve_004,news2ve_005,...,news2ve_119,news2ve_120,news2ve_121,news2ve_122,news2ve_123,news2ve_124,news2ve_125,news2ve_126,news2ve_127,news2ve_128
0,2011-07-07,T,-0.993214,1,0,-0.43128,-0.044992,0.354213,0.697911,2.04545,...,0.652926,0.672569,1.553473,0.262625,-0.799215,1.546489,0.2539,0.593565,-1.572047,0.571509
1,2011-07-08,T,-0.987828,1,0,-0.198045,0.051567,0.347191,0.451884,1.218303,...,0.282431,0.532512,0.984212,0.093855,-0.571536,1.049526,0.365417,0.412591,-0.873022,0.345252
2,2011-07-13,T,0.923182,0,1,-0.469324,1.137656,-0.361176,1.298508,1.164419,...,-0.490781,-1.114714,0.677273,1.133146,-0.884837,1.651788,1.189902,0.965121,-0.288995,0.767504


In [None]:
# Collapse to one row per (Date, ticker): the sentiment indicator columns are
# summed (headline counts); every other column is averaged.
sum_cols = ['Title sentiment_label_NEGATIVE', 'Title sentiment_label_POSITIVE']
group_keys = ['Date', 'ticker']
value_cols = [name for name in df_encoded.columns if name not in group_keys]
agg_spec = {name: ('sum' if name in sum_cols else 'mean') for name in value_cols}

result_df = (
    df_encoded
    .groupby(group_keys)
    .agg(agg_spec)
    .reset_index()
)


In [None]:
# Show the first (Date, ticker) groups whose negative-headline count is neither 0 nor 1.
result_df[~result_df['Title sentiment_label_NEGATIVE'].isin([0, 1])][:3]

Unnamed: 0,Date,ticker,Title sentiment_score,Title sentiment_label_NEGATIVE,Title sentiment_label_POSITIVE,news2ve_001,news2ve_002,news2ve_003,news2ve_004,news2ve_005,...,news2ve_119,news2ve_120,news2ve_121,news2ve_122,news2ve_123,news2ve_124,news2ve_125,news2ve_126,news2ve_127,news2ve_128
10,2011-07-07,CVX,-0.99563,2,0,-0.449709,-0.058782,-0.639056,0.511857,1.384502,...,0.156071,0.173405,1.604592,-0.256845,0.126767,1.364651,0.185753,0.334969,-0.906285,1.272592
18,2011-07-07,T,-0.99563,2,0,-0.450152,-0.147789,-0.623698,0.500802,1.374063,...,0.251578,0.293472,1.779962,-0.188921,0.002315,1.179925,0.213279,0.740445,-1.119182,1.100583
20,2011-07-07,WFC,-0.997364,2,0,-0.459645,-1.063087,-0.789878,0.79472,1.180768,...,0.051219,0.199608,1.569285,-0.317709,0.399325,1.846466,-0.183971,0.052971,-1.141585,0.925457


In [None]:
# Give the aggregated sentiment columns short, news-specific names.
news_column_names = {
    "Title sentiment_score": "News_SScore",
    "Title sentiment_label_NEGATIVE": "NegSenCount_news",
    "Title sentiment_label_POSITIVE": "PosSenCount_news",
}
result_df = result_df.rename(columns=news_column_names)

In [None]:
import datetime as dt
# Day of week of each news date, via the pandas `.dt` accessor (Monday=0 ... Sunday=6).
# NOTE(review): the `dt` module alias imported above is not used by this line;
# `.dt` here is the Series datetime accessor.
result_df['Day'] = result_df['Date'].dt.dayofweek

In [None]:
pwd

'/content'

In [None]:
# Save the per-(Date, ticker) news aggregate, including the temporary 'Day' column.
# NOTE(review): unlike the other to_csv calls in this notebook, index=False is
# not passed, so the row index is written as an unnamed first column -- confirm
# that readers of this file expect it.
# NOTE(review): the destination is a hard-coded path rather than PATH_AGG_DATA.
result_df.to_csv('/content/drive/MyDrive/DSC250_project/DSC_250_PROJECT/datasets/AGGREGATED_DATA/news_final_agg.csv')

In [None]:
# 'Day' was only needed for the saved CSV; remove it before further processing.
result_df = result_df.drop(columns='Day')

In [None]:
# Roll weekend news back to the preceding Friday (presumably so it lines up
# with a trading day in the price data -- confirm).
# dayofweek is Monday=0 ... Sunday=6, so Saturday (5) moves back 1 day and
# Sunday (6) moves back 2 days, i.e. the offset is dayofweek - 4.
# The previous offset, (dayofweek + 2) % 7, evaluated to 0 for Saturday and 1
# for Sunday, which left every weekend row dated on the Saturday.
news_dayofweek = result_df['Date'].dt.dayofweek
weekend_mask = news_dayofweek >= 5
result_df.loc[weekend_mask, 'Date'] = result_df.loc[weekend_mask, 'Date'] - pd.to_timedelta(
    news_dayofweek[weekend_mask] - 4, unit='d'
)

In [None]:
result_df[:3]

Unnamed: 0,Date,ticker,News_SScore,NegSenCount_news,PosSenCount_news,news2ve_001,news2ve_002,news2ve_003,news2ve_004,news2ve_005,...,news2ve_119,news2ve_120,news2ve_121,news2ve_122,news2ve_123,news2ve_124,news2ve_125,news2ve_126,news2ve_127,news2ve_128
0,2011-07-06,005930.KS,-0.998924,1,0,-1.010626,-1.703238,-0.413511,0.389713,2.163167,...,0.207646,1.655146,0.889079,1.48676,-1.184951,-0.201042,0.47257,-0.201521,1.017336,-0.113666
1,2011-07-06,AAPL,-0.999012,1,0,-0.157108,-0.571859,0.032514,0.166861,0.729311,...,-0.760558,0.543086,0.346933,0.05183,-0.204596,0.859705,0.747626,-0.776559,0.479887,-0.620001
2,2011-07-06,GOOG.O,-0.880484,1,0,-0.856936,0.664619,-1.106251,-0.652251,-0.023104,...,0.391603,-0.588558,-1.007424,-0.152823,-0.645491,0.437398,0.711479,1.316568,-0.006403,2.237492


In [None]:
# Re-aggregate after the date shift: rows moved onto an already-present
# (Date, ticker) pair are merged, summing the headline counts and averaging
# every other column.
sum_cols = ['NegSenCount_news', 'PosSenCount_news']
group_keys = ['Date', 'ticker']
value_cols = [name for name in result_df.columns if name not in group_keys]
agg_spec = {name: ('sum' if name in sum_cols else 'mean') for name in value_cols}

result_df = (
    result_df
    .groupby(group_keys)
    .agg(agg_spec)
    .reset_index()
)

In [None]:
# Scratch check: subtracting a 2-day Timedelta from a Timestamp.
pd.to_datetime('2011-07-12') - pd.to_timedelta(2, unit='d')


Timestamp('2011-07-10 00:00:00')

In [None]:
# agg_spec
result_df[:3]

Unnamed: 0,Date,ticker,News_SScore,NegSenCount_news,PosSenCount_news,news2ve_001,news2ve_002,news2ve_003,news2ve_004,news2ve_005,...,news2ve_119,news2ve_120,news2ve_121,news2ve_122,news2ve_123,news2ve_124,news2ve_125,news2ve_126,news2ve_127,news2ve_128
0,2011-07-06,005930.KS,-0.998924,1,0,-1.010626,-1.703238,-0.413511,0.389713,2.163167,...,0.207646,1.655146,0.889079,1.48676,-1.184951,-0.201042,0.47257,-0.201521,1.017336,-0.113666
1,2011-07-06,AAPL,-0.999012,1,0,-0.157108,-0.571859,0.032514,0.166861,0.729311,...,-0.760558,0.543086,0.346933,0.05183,-0.204596,0.859705,0.747626,-0.776559,0.479887,-0.620001
2,2011-07-06,GOOG.O,-0.880484,1,0,-0.856936,0.664619,-1.106251,-0.652251,-0.023104,...,0.391603,-0.588558,-1.007424,-0.152823,-0.645491,0.437398,0.711479,1.316568,-0.006403,2.237492


In [None]:
# Inspect the rows dated 2011-07-09 (a Saturday) to check the effect of the weekend shift.
result_df[result_df['Date']=='2011-07-09']

Unnamed: 0,Date,ticker,News_SScore,NegSenCount_news,PosSenCount_news,news2ve_001,news2ve_002,news2ve_003,news2ve_004,news2ve_005,...,news2ve_119,news2ve_120,news2ve_121,news2ve_122,news2ve_123,news2ve_124,news2ve_125,news2ve_126,news2ve_127,news2ve_128
35,2011-07-09,GOOG.O,-0.99375,1,0,0.074114,-1.43425,0.393718,-0.382882,-0.34456,...,-0.145133,1.002738,1.032894,-0.351467,-0.227782,0.358869,0.731445,0.239168,1.762773,1.043558
36,2011-07-09,MSFT,-0.975574,1,0,-0.94403,-1.163441,0.403251,0.927118,-0.199482,...,1.039433,0.062991,0.650166,1.076942,-0.760517,1.099971,-0.254094,-1.026519,-0.569805,-0.970371
37,2011-07-09,WFC,-0.997016,1,0,-0.513324,0.488187,-0.444872,0.149936,1.965795,...,0.816621,0.424943,0.096163,-0.573076,-0.873368,0.165905,-0.352631,-0.015,1.161511,0.817149
38,2011-07-09,WMT,-0.997016,1,0,-0.299365,0.607652,-0.516626,0.366252,1.787261,...,0.73739,0.097007,0.074738,-0.380683,-0.778668,0.485035,-0.3909,-0.039776,1.104684,0.823243
39,2011-07-09,XOM,-0.986899,1,0,2.125535,-1.308712,-0.182609,-0.073314,3.261914,...,-0.233849,1.485784,-0.685938,0.849161,-3.926746,-0.280761,-2.077832,-1.530151,0.228261,1.020479
40,2011-07-09,GOOG.O,0.997549,0,1,0.121043,-1.272155,1.259052,-1.098615,-0.087348,...,-0.430775,0.353442,0.752615,-0.186525,-1.012268,0.498293,0.530837,0.806912,2.979025,2.057071
41,2011-07-09,SNE,0.998013,0,1,1.349513,-0.647289,-1.199408,1.655642,2.480747,...,0.663139,1.054932,2.737459,0.263174,-1.671819,0.166997,-1.15791,-1.553723,-0.262906,0.461717
42,2011-07-09,WFC,-0.999193,1,0,0.527875,-1.076852,-1.091059,-0.344906,0.599533,...,-0.162097,1.178586,0.661078,-0.468704,-0.740535,0.163233,-0.560348,-0.219111,-0.113235,0.100567
43,2011-07-09,WMT,-0.999193,1,0,0.486509,-1.044712,-0.951296,-0.553122,0.45718,...,-0.202617,0.944761,0.619674,-0.670443,-0.553915,0.204401,-0.455772,-0.318683,-0.213919,0.110243
44,2011-07-09,XOM,0.743077,0,1,1.307279,-0.900849,-0.177552,0.28642,1.010895,...,0.482052,0.93821,0.514401,-0.747438,-1.997041,2.121648,-0.46775,-1.02777,-1.017986,0.292294


In [None]:
# df_encoded['newsVector'] = df_encoded['newsVector'].apply(lambda x: pd.to_numeric(x, errors='coerce'))

# columns_to_aggregate = ['Title sentiment_label_NEGATIVE', 'Title sentiment_label_POSITIVE', 'Title sentiment_score', 'newsVector']
# grouped_df = df_encoded.groupby(['Date', 'ticker'])[columns_to_aggregate].agg({
#     'Title sentiment_label_NEGATIVE': 'sum',
#     'Title sentiment_label_POSITIVE': 'sum',
#     'Title sentiment_score': np.mean,
#     'newsVector': np.mean
# }).reset_index()

In [None]:
# NOTE(review): `grouped_df` is only assigned in the commented-out cell above,
# so on a fresh kernel this line raises NameError -- confirm whether this cell
# is still needed.
grouped_df[grouped_df['newsVector'].notna()]

Unnamed: 0,Date,ticker,Title sentiment_label_NEGATIVE,Title sentiment_label_POSITIVE,Title sentiment_score,newsVector


In [None]:
df_encoded

Unnamed: 0,Date,Title,Body,ticker,tokenized_words,cleaned_text,Title sentiment_score,newsVector,Title sentiment_label_NEGATIVE,Title sentiment_label_POSITIVE
0,2011-07-07,SPECIAL REPORT-Malaysia's dilemma: Can it refo...,"putrajaya, malaysia, july 7 dr. mahathir moha...",T,"['putrajaya', ',', 'malaysia', ',', 'july', '7...","['putrajaya', 'malaysia', 'july', '7', 'mahath...",-0.993214,[-4.31280017e-01 -4.49921563e-02 3.54213178e-...,1,0
1,2011-07-08,Special Report: Can Malaysia reform and discri...,"putrajaya, malaysia dr. mahathir mohamad sits ...",T,"['putrajaya', ',', 'malaysia', 'dr.', 'mahathi...","['putrajaya', 'malaysia', 'mahathir', 'mohamad...",-0.987828,[-0.19804451 0.0515668 0.3471907 0.451883...,1,0
2,2011-07-13,Danielle Chiesi settles SEC insider trading ca...,"new york danielle chiesi, a former trader who ...",T,"['new', 'york', 'danielle', 'chiesi', ',', 'a'...","['new', 'york', 'danielle', 'chiesi', 'former'...",0.923182,[-0.46932358 1.1376562 -0.36117563 1.298507...,0,1
3,2011-07-18,"UPDATE 5-IBM's services signings surge, stock ...","* sets positive tone for tech earns, corporat...",T,"['*', 'sets', 'positive', 'tone', 'for', 'tech...","['sets', 'positive', 'tone', 'tech', 'earns', ...",-0.997758,[ 0.26701182 -0.8121374 -0.35769838 0.968121...,1,0
4,2011-07-19,"Wall Street up on profits, Apple surges late,U...",new york stocks recorded their best day since ...,T,"['new', 'york', 'stocks', 'recorded', 'their',...","['new', 'york', 'stocks', 'recorded', 'best', ...",-0.980940,[ 1.6810181 -0.8761942 -1.6215013 1.167018...,1,0
...,...,...,...,...,...,...,...,...,...,...
17675,2017-07-19,Activist Jana cashes out of Whole Foods follow...,activist investor jana partners llc cashed out...,AMZN,"['activist', 'investor', 'jana', 'partners', '...","['activist', 'investor', 'jana', 'partners', '...",-0.976309,[ 2.8787196 0.47986117 -0.47300547 -1.054337...,1,0
17676,2017-07-14,U.S. lawmaker calls for hearing on Amazon's Wh...,washington/new york the top democrat on the u....,AMZN,"['washington/new', 'york', 'the', 'top', 'demo...","['york', 'top', 'democrat', 'house', 'represen...",-0.991219,[ 0.26377246 -1.0956086 -1.2374927 -1.706804...,1,0
17677,2017-06-24,"Even with Whole Foods, Amazon would need many ...",if amazon.com inc hopes to revolutionize groc...,AMZN,"['if', 'amazon.com', 'inc', 'hopes', 'to', 're...","['inc', 'hopes', 'revolutionize', 'grocery', '...",-0.999077,[ 1.1441845e+00 4.1975534e-01 -1.4262140e-01 ...,1,0
17678,2017-06-29,UPDATE 2-Nike to launch pilot program with Ama...,* shares up 8.2 pct at $57.54 in after-market ...,AMZN,"['*', 'shares', 'up', '8.2', 'pct', 'at', '$',...","['shares', 'pct', 'trade', 'adds', 'details', ...",-0.998893,[-1.46899009e+00 -1.12892054e-01 2.65835994e-...,1,0


In [None]:
# NOTE(review): this uses the float 'Title sentiment_score' column itself as the
# indexer rather than a boolean condition -- a comparison (e.g. `> 0`) was
# probably intended; confirm.
df_encoded[df_encoded['Title sentiment_score']]

Unnamed: 0,Date,Title,Body,ticker,tokenized_words,cleaned_text,Title sentiment_score,newsVector,Title sentiment_label_NEGATIVE,Title sentiment_label_POSITIVE
0,2011-07-07,SPECIAL REPORT-Malaysia's dilemma: Can it refo...,"putrajaya, malaysia, july 7 dr. mahathir moha...",T,"['putrajaya', ',', 'malaysia', ',', 'july', '7...","['putrajaya', 'malaysia', 'july', '7', 'mahath...",0.993214,[-4.31280017e-01 -4.49921563e-02 3.54213178e-...,1,0
1,2011-07-08,Special Report: Can Malaysia reform and discri...,"putrajaya, malaysia dr. mahathir mohamad sits ...",T,"['putrajaya', ',', 'malaysia', 'dr.', 'mahathi...","['putrajaya', 'malaysia', 'mahathir', 'mohamad...",0.987828,[-0.19804451 0.0515668 0.3471907 0.451883...,1,0
2,2011-07-13,Danielle Chiesi settles SEC insider trading ca...,"new york danielle chiesi, a former trader who ...",T,"['new', 'york', 'danielle', 'chiesi', ',', 'a'...","['new', 'york', 'danielle', 'chiesi', 'former'...",0.923182,[-0.46932358 1.1376562 -0.36117563 1.298507...,0,1
3,2011-07-18,"UPDATE 5-IBM's services signings surge, stock ...","* sets positive tone for tech earns, corporat...",T,"['*', 'sets', 'positive', 'tone', 'for', 'tech...","['sets', 'positive', 'tone', 'tech', 'earns', ...",0.997758,[ 0.26701182 -0.8121374 -0.35769838 0.968121...,1,0
4,2011-07-19,"Wall Street up on profits, Apple surges late,U...",new york stocks recorded their best day since ...,T,"['new', 'york', 'stocks', 'recorded', 'their',...","['new', 'york', 'stocks', 'recorded', 'best', ...",0.980940,[ 1.6810181 -0.8761942 -1.6215013 1.167018...,1,0
...,...,...,...,...,...,...,...,...,...,...
17675,2017-07-19,Activist Jana cashes out of Whole Foods follow...,activist investor jana partners llc cashed out...,AMZN,"['activist', 'investor', 'jana', 'partners', '...","['activist', 'investor', 'jana', 'partners', '...",0.976309,[ 2.8787196 0.47986117 -0.47300547 -1.054337...,1,0
17676,2017-07-14,U.S. lawmaker calls for hearing on Amazon's Wh...,washington/new york the top democrat on the u....,AMZN,"['washington/new', 'york', 'the', 'top', 'demo...","['york', 'top', 'democrat', 'house', 'represen...",0.991219,[ 0.26377246 -1.0956086 -1.2374927 -1.706804...,1,0
17677,2017-06-24,"Even with Whole Foods, Amazon would need many ...",if amazon.com inc hopes to revolutionize groc...,AMZN,"['if', 'amazon.com', 'inc', 'hopes', 'to', 're...","['inc', 'hopes', 'revolutionize', 'grocery', '...",0.999077,[ 1.1441845e+00 4.1975534e-01 -1.4262140e-01 ...,1,0
17678,2017-06-29,UPDATE 2-Nike to launch pilot program with Ama...,* shares up 8.2 pct at $57.54 in after-market ...,AMZN,"['*', 'shares', 'up', '8.2', 'pct', 'at', '$',...","['shares', 'pct', 'trade', 'adds', 'details', ...",0.998893,[-1.46899009e+00 -1.12892054e-01 2.65835994e-...,1,0


In [None]:
# Inspect the raw news rows dated 2011-07-12.
newsdf[newsdf['Date']=='2011-07-12']

Unnamed: 0,Date,Title,Body,ticker,tokenized_words,cleaned_text,Title sentiment_label,Title sentiment_score,newsVector
18,2011-07-12,Obama holding lunch mtg with business execs on...,"washington, july 12 president barack obama wa...",T,"['washington', ',', 'july', '12', 'president',...","['washington', 'july', '12', 'president', 'bar...",NEGATIVE,0.972524,[-5.4592901e-01 -5.7942414e-01 2.1099211e-01 ...
468,2011-07-12,Regulators combat unauthorized phone bill fees...,washington telephone companies would have to m...,T,"['washington', 'telephone', 'companies', 'woul...","['washington', 'telephone', 'companies', 'woul...",NEGATIVE,0.983715,[ 2.1390417 1.1236602 -2.027831 1.788465...
1124,2011-07-12,Electronic Arts buying PopCap Games for up to ...,"new york electronic arts inc , the video game ...",GOOG.O,"['new', 'york', 'electronic', 'arts', 'inc', '...","['new', 'york', 'electronic', 'arts', 'inc', '...",NEGATIVE,0.999164,[-0.37570712 -3.498245 -0.39497977 0.146711...
2026,2011-07-12,UPDATE 1-Firestone wins Liberian child labor c...,* companies can be held liable for human-righ...,XOM,"['*', 'companies', 'can', 'be', 'held', 'liabl...","['companies', 'held', 'liable', 'abuses', 'abr...",POSITIVE,0.999353,[-2.1302390e+00 5.4395765e-02 -2.9384639e+00 ...
2893,2011-07-12,Regulators combat unauthorized phone bill fees...,washington telephone companies would have to m...,VZ,"['washington', 'telephone', 'companies', 'woul...","['washington', 'telephone', 'companies', 'woul...",NEGATIVE,0.976724,[-6.2695724e-01 2.8825265e-01 -1.5188003e+00 ...
3395,2011-07-12,"Apple chief patent lawyer leaving: sources,Tos...",san francisco apple inc's chief patent counsel...,005930.KS,"['san', 'francisco', 'apple', 'inc', ""'s"", 'ch...","['san', 'francisco', 'apple', 'inc', 'chief', ...",NEGATIVE,0.993491,[-1.9358366 -0.64709556 -0.0476862 1.592252...
4948,2011-07-12,Electronic Arts buying PopCap Games for up to ...,"new york electronic arts inc , the video game ...",AAPL,"['new', 'york', 'electronic', 'arts', 'inc', '...","['new', 'york', 'electronic', 'arts', 'inc', '...",NEGATIVE,0.998376,[-1.8747267 -2.0403597 -1.3048285 0.518847...
6608,2011-07-12,Regulators combat unauthorized phone bill fees...,washington telephone companies would have to m...,WFC,"['washington', 'telephone', 'companies', 'woul...","['washington', 'telephone', 'companies', 'woul...",NEGATIVE,0.976724,[-0.63758945 0.5392483 -1.5211304 -1.144361...
7111,2011-07-12,Analysis: Wal-Mart ruling no knock-out blow fo...,new york the end of the road for a class-actio...,WFC,"['new', 'york', 'the', 'end', 'of', 'the', 'ro...","['new', 'york', 'end', 'road', 'discrimination...",NEGATIVE,0.987345,[-1.3457037e+00 2.1614358e+00 -5.1890165e-01 ...
7920,2011-07-12,BRIEF-Moody's Downgrades Wells Fargo's Service...,"july 13, 2011 (reuters)--moody's downgrades we...",WFC,"['july', '13', ',', '2011', '(', 'reuters', ')...","['july', '13', '2011', 'reuters', 'moody', 'do...",NEGATIVE,0.9455,[-1.9909439 -0.46401983 -1.6327851 0.249815...
