# Classifying Stocks Using Machine Learning
## Cleaning and Analysis
<hr>

The purpose of this notebook is to evaluate and clean the recently collected data.

### Objectives
 - Delete any dates that fall on the weekend
   - because they may add noise/bias
 - Review null values ("-1" in dataframe)
 - Find approach for nulls
   - nulls in social sentiment should probably be an average because social seems to skew negative
   - could also try whatever the actual middle would be ("0" for range of -1 to 1 for example)
 - Shrink and change news sentiment to -1 to 1
 - Get binary values for beating open and tomorrow's close

In [1]:
import math
import os
import requests
from datetime import datetime, timedelta
import time
import warnings
import pandas as pd
import numpy as np
from collections import Counter 

In [2]:
# load data
df = pd.read_hdf('stocks.h5')

In [3]:
# view original dataset
df

Unnamed: 0,ticker,date,headline,news_sentiment_score,source,url,amount_of_articles,open,close,volume,social_sentiments,mentions
0,FB,2016-08-09,[Onetime Home of Warner Bros.’ Harry Warner As...,3.00,[DowJones],[https://finnhub.io/api/news?id=7dbe5db9757dda...,1,125.340,125.06,19620967,-1.000000,-1
1,FB,2021-03-15,[Rupert Murdoch's News Corp strikes deal as Fa...,2.00,"[The Guardian, https://nypost.com, https://www...",[https://finnhub.io/api/news?id=61c0d589cb8bf9...,70,269.080,273.75,16856746,0.066288,45
2,FB,2021-03-16,[NetApp reformula a organização de vendas glob...,1.91,"[businesswire, benzinga, businesswire, busines...",[https://finnhub.io/api/news?id=6479351ac59fa2...,89,276.085,279.28,22437665,-0.339269,85
3,FB,2021-03-17,[Facebook Promises More Support For Human Righ...,1.89,"[https://www.forbes.com, businesswire, busines...",[https://finnhub.io/api/news?id=ad0559e9f8ae60...,58,275.705,284.01,21315044,-0.589213,135
4,FB,2021-03-18,[Take A Sneak Peek At The Weirdly-Shaped New P...,1.85,"[benzinga, benzinga, benzinga, businesswire, b...",[https://finnhub.io/api/news?id=e851ef47ee28e6...,77,279.870,278.62,18754853,-0.361794,534
...,...,...,...,...,...,...,...,...,...,...,...,...
521,UBER,2021-07-29,[Replay: Joby Aviation Executive Chairman and ...,1.90,"[Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Uni...",[https://finnhub.io/api/news?id=62bfd8bf18171d...,30,44.120,44.69,51033697,-1.000000,-1
522,UBER,2021-07-30,[Uber looks beyond ride-hailing as rebound and...,2.06,"[Yahoo, Yahoo, DowJones, CNBC, Yahoo, Yahoo, Y...",[https://finnhub.io/api/news?id=e725733cc1743a...,18,44.380,43.46,22194938,0.195272,40
523,UBER,2021-07-31,[Why No One Should Invest in Food Delivery Sto...,2.00,"[Yahoo, MarketWatch, SeekingAlpha, SeekingAlpha]",[https://finnhub.io/api/news?id=1185c1d8cba9af...,4,-1.000,-1.00,-1,-1.000000,-1
524,UBER,2021-08-01,[Videogames entered the mainstream in the pand...,2.08,"[Yahoo, Yahoo, MarketWatch, MarketWatch, Marke...",[https://finnhub.io/api/news?id=90034f7a2d7274...,13,44.100,43.49,26579889,0.148094,33


In [4]:
# find weekday values of rows
weekdates = []
for i in df["date"]:
    weekdates.append(datetime.strptime(i, "%Y-%m-%d").weekday())

In [5]:
# apply to dataframe
df["weekdates"] = weekdates

In [6]:
# remove weekends
df = df[df.weekdates != 5]
df = df[df.weekdates != 6]

In [7]:
del df["weekdates"]

In [8]:
# get list of stock tickers
tickers = list(df['ticker'].unique())

In [9]:
# applies average of the ticker to null vals
for i in tickers:
    stockDF = df[df.ticker == i]
    socAvg = stockDF[stockDF.mentions != -1]['social_sentiments'].mean()
    df.loc[(df['mentions'] == -1) & (df['ticker'] == i),'social_sentiments'] = socAvg

In [10]:
# change mentions from -1 to 0
df.loc[df['mentions'] == -1,'mentions'] = 0

In [11]:
# resize range of news score from 1 to 4 to -1 to 1 
inLow = 0
inHigh = 4
outLow = -1
outHigh = 1

def resizeRange(inputNum):
    result = ((inputNum - inLow) / (inHigh - inLow)) * (outHigh - outLow) + outLow
    return result

news_sentiment = []
for i in df["news_sentiment_score"]:
    if math.isnan(i):
        news_sentiment.append(0.0)
    else:
        news_sentiment.append(resizeRange(i))

In [12]:
# add resized sentiment to dataframe
df['news_sentiment'] = news_sentiment

In [13]:
# remove days where no stock prices are recorded
df = df[df.open != -1.0]

In [14]:
# get binary values for beating open and stock doing better tomorrow
openVsClose= []
for i in tickers:
    stockDF = df[df.ticker == i]
    for idx,row in stockDF.iterrows():
        if row['open'] < row['close']:
            openVsClose.append(1)
        else:
            openVsClose.append(0)

In [19]:
tomorrowBetter = []
for i in tickers:
    stockDF = df[df.ticker == i]
    ind = -1
    for idx,row in stockDF.iterrows():
        ind = ind + 1
        try:
            if row['close'] < stockDF.iloc[ind+1]['close']:
                tomorrowBetter.append(1)
            else:
                tomorrowBetter.append(0)
        except:
            tomorrowBetter.append(-1)

In [20]:
df['close_better'] = openVsClose
df['tomorrow_better'] = tomorrowBetter

In [21]:
# the last dates will not have proper "tomorrow_better" data
# so if its small set, manual adjust should be good
# big set would be easier just to drop data

# if large set, just do the following
# df = df[df.date != "LatestDate"]

df = df.reset_index(drop=True)
df.sort_values(by="date").tail()

Unnamed: 0,ticker,date,headline,news_sentiment_score,source,url,amount_of_articles,open,close,volume,social_sentiments,mentions,news_sentiment,close_better,tomorrow_better
99,FB,2021-07-30,[Stocks Are Sturdier Than Big Tech’s Tumble Su...,1.87,"[MarketWatch, Yahoo, CNBC, Yahoo, MarketWatch,...",[https://finnhub.io/api/news?id=cc2d5aa5b45bc9...,43,354.0,356.3,15976179,-0.490594,113,-0.065,1,0
199,ORCL,2021-08-02,[Oracle Is Betting Its Cloud Services Will Con...,2.0,"[Yahoo, SeekingAlpha, Thefly.com]",[https://finnhub.io/api/news?id=49487e9d83bd41...,3,87.73,87.6,5579320,0.036111,0,0.0,0,-1
100,FB,2021-08-02,[My $376k 33 Stock Retirement Portfolio Is Out...,1.99,"[SeekingAlpha, Yahoo, DowJones, DowJones, Yaho...",[https://finnhub.io/api/news?id=741f2114bf7596...,44,358.1,351.95,13180439,-0.346166,110,-0.005,0,-1
298,PYPL,2021-08-02,"[Bronstein, Gewirtz & Grossman, LLC Notifies S...",1.97,"[Yahoo, CNBC, Yahoo, Yahoo, Yahoo, Yahoo, Yaho...",[https://finnhub.io/api/news?id=d237362241a537...,34,276.885,270.99,10082821,-0.303386,60,-0.015,0,-1
398,UBER,2021-08-02,"[Disney, Google and other U.S. companies requi...",1.88,"[Yahoo, Yahoo, Yahoo, MarketWatch, Yahoo, Yaho...",[https://finnhub.io/api/news?id=4d0fe9d5700b42...,28,44.1,43.49,26579889,-0.072947,39,-0.06,0,-1


In [22]:
# looking up the prices to make sure the last values are good
df.tomorrow_better.iloc[[398]] = 0
df.tomorrow_better.iloc[[298]] = 1
df.tomorrow_better.iloc[[100]] = 0
df.tomorrow_better.iloc[[199]] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [23]:
# save the cleaned frame to file
df.to_hdf('stocksCleaned.h5', 'data')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->Index(['ticker', 'date', 'headline', 'source', 'url'], dtype='object')]

  pytables.to_hdf(


In [24]:
reread = pd.read_hdf('stocksCleaned.h5')

In [25]:
reread

Unnamed: 0,ticker,date,headline,news_sentiment_score,source,url,amount_of_articles,open,close,volume,social_sentiments,mentions,news_sentiment,close_better,tomorrow_better
0,FB,2016-08-09,[Onetime Home of Warner Bros.’ Harry Warner As...,3.00,[DowJones],[https://finnhub.io/api/news?id=7dbe5db9757dda...,1,125.340,125.06,19620967,-0.199921,0,0.500,0,1
1,FB,2021-03-15,[Rupert Murdoch's News Corp strikes deal as Fa...,2.00,"[The Guardian, https://nypost.com, https://www...",[https://finnhub.io/api/news?id=61c0d589cb8bf9...,70,269.080,273.75,16856746,0.066288,45,0.000,1,1
2,FB,2021-03-16,[NetApp reformula a organização de vendas glob...,1.91,"[businesswire, benzinga, businesswire, busines...",[https://finnhub.io/api/news?id=6479351ac59fa2...,89,276.085,279.28,22437665,-0.339269,85,-0.045,1,1
3,FB,2021-03-17,[Facebook Promises More Support For Human Righ...,1.89,"[https://www.forbes.com, businesswire, busines...",[https://finnhub.io/api/news?id=ad0559e9f8ae60...,58,275.705,284.01,21315044,-0.589213,135,-0.055,1,0
4,FB,2021-03-18,[Take A Sneak Peek At The Weirdly-Shaped New P...,1.85,"[benzinga, benzinga, benzinga, businesswire, b...",[https://finnhub.io/api/news?id=e851ef47ee28e6...,77,279.870,278.62,18754853,-0.361794,534,-0.075,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
394,UBER,2021-07-27,[WSJ News Exclusive | Ride-Sharing Startup Swv...,1.25,"[DowJones, Yahoo, Associated Press, The, GuruF...",[https://finnhub.io/api/news?id=1fbbc9dbf5f83e...,8,46.540,45.82,16874229,0.000000,1,-0.375,0,1
395,UBER,2021-07-28,[SoftBank Said to Offer $2.1 Billion Uber Stak...,1.83,"[Yahoo, Yahoo, Yahoo, Yahoo, CNBC, CNBC, Yahoo...",[https://finnhub.io/api/news?id=0592272765241b...,24,45.880,46.14,19510262,-0.472390,4,-0.085,1,0
396,UBER,2021-07-29,[Replay: Joby Aviation Executive Chairman and ...,1.90,"[Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Yahoo, Uni...",[https://finnhub.io/api/news?id=62bfd8bf18171d...,30,44.120,44.69,51033697,-0.155166,0,-0.050,1,0
397,UBER,2021-07-30,[Uber looks beyond ride-hailing as rebound and...,2.06,"[Yahoo, Yahoo, DowJones, CNBC, Yahoo, Yahoo, Y...",[https://finnhub.io/api/news?id=e725733cc1743a...,18,44.380,43.46,22194938,0.195272,40,0.030,0,1
