#### STOCK PRICE PREDICTION: DATA WRANGLING

In [80]:
import os
from bs4 import BeautifulSoup, SoupStrainer
import requests
import pandas as pd
import httplib2
from selenium import webdriver      
import time
import yfinance as yf
from datetime import datetime
from datetime import timedelta

In [81]:
#TICKERS IN TECH INDUSTRY
tickers = ['AMD', 'NVDA', 'MSFT','ADBE','AAPL','TSLA'] #TECH
ticker_names = ['Advanced Micro Devices', 'Nvidia', 'Microsoft','Adobe','Apple','Tesla'] #TECH

In [82]:
#FINDING ARTICLES LINKS RELATED TO TICKER

#initializing variables and driver
article_links = []
driver = webdriver.Chrome('./chromedriver')

#loop through ticker_names
for i in range(len(ticker_names)): 
    site = 'https://in.reuters.com/search/news?blob=' + ticker_names[i]
    driver.get(site)
    
    #click 'load more' button 9x 
    for _ in range(9): 
        try:
            loadMoreButton = driver.find_element_by_xpath('//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]')
            time.sleep(2) 
            loadMoreButton.click()
            time.sleep(4)
        except Exception as e:
            print( e)
            break
    #find elements 
    elems = [elem.get_attribute("href") for elem in driver.find_elements_by_xpath("//a[@href]")]
    
    #append article_links if article in link 
    articles = [[tickers[i],x] for x in elems if 'article' in x ]
    article_links += articles
    
driver.quit()

In [83]:
len(article_links)

879

In [84]:
#GETTING CONTENT OF ARTICLE LINK AND CREATING DF WITH TICKER, DATE OF POST, CONTENT

df = pd.DataFrame(columns = ['ticker','date','content'])
for ticker , link in article_links: 
    req = requests.get(link)
    souper = BeautifulSoup(req.text,'html.parser')

    #post date
    mydivs = souper.find_all('div', class_="ArticleHeader_date")
    DOP = str(mydivs[0]).split('>')[1].split('/')[0][:-1]

    #getting content 
    content = souper.get_text().replace('\n', '')

    #adding to df
    data = [[ticker, DOP, content]]

    df2 = pd.DataFrame(data ,columns = ['ticker','date','content'])

    df = df.append(df2)


In [85]:
df.head()

Unnamed: 0,ticker,date,content
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...
0,AMD,"August 28, 2017","BRIEF-Advanced Micro Devices, ..."
0,AMD,"February 3, 2020","COLUMN-With virus outbreak, Ch..."
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices a...


In [87]:
#yf.download prints completion data that I'd like to not see.

from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

In [88]:
'''
get highest price delta within a 30 day period
'''

def loss_gain(ticker, date, end):
    
    
    day = yf.download(ticker , date , date + timedelta(end))
    df_close = pd.DataFrame(day.Close)
    
    cmax = df_close.iloc[1:].max().values
    cmax = df_close[df_close['Close']==cmax[0]] #max value 
    cmax_date = cmax.index #date of max value 
    first_price = df_close.iloc[0].values
    pct_change = 100*((cmax.values - first_price)/first_price)
    
    tdelta = (cmax_date - date).days.values[0]
    
    return [[pct_change[0][0], tdelta]]

change = []
dates = df.date.values

for i in range(len(dates)):  #change [:2]
    with suppress_stdout(): 
        try: 
            ticker = df.iloc[i].ticker
            #getting date and reformatting to fit into yfinance download function. 

            date = datetime.strptime(dates[i], '%B %d, %Y').strftime('%Y-%m-%d')
            date = datetime.strptime(date, '%Y-%m-%d' )


            #finding the time delta from post date to now.
            end = abs(int(str(date - datetime.now()).split(',')[0].replace('days','')))

            #getting highest percent change within a 30 day forecast. If post does not have 30 days future it will get the max future data
            if end > 30: 
                end = 30
                print('ticker: %s' %ticker,'date: %s' %date, 'timedelta: %s' %end)
                change += loss_gain(ticker, date, end)
            else: 
                print(ticker,date,end)
                change += loss_gain(ticker, date, end)

        except:
        
            print('error')
            change += [[0,0]] #substituting for error raised, row will be dropped at the end. 


In [89]:
df_delta = pd.DataFrame(change, columns=['pct_change', 'time_delta'])
df_delta.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,869,870,871,872,873,874,875,876,877,878
pct_change,19.936845,12.346691,22.657228,-1.460482,21.481487,4.081629,10.741301,11.574074,4.606666,38.105044,...,17.617947,61.075214,61.075214,16.908131,16.908131,16.908131,16.908131,16.908131,55.883539,27.616588
time_delta,25.0,23.0,16.0,3.0,23.0,1.0,26.0,6.0,1.0,29.0,...,16.0,28.0,28.0,27.0,27.0,27.0,27.0,27.0,25.0,14.0


In [90]:
#adding time_delta and pct_change to df
df['time_delta'] = df_delta['time_delta'].tolist()
df['pct_change'] = df_delta['pct_change'].tolist()


In [91]:
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845
0,AMD,"August 28, 2017","BRIEF-Advanced Micro Devices, ...",23,12.346691
0,AMD,"February 3, 2020","COLUMN-With virus outbreak, Ch...",16,22.657228
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...,3,-1.460482
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices a...,23,21.481487


#### Cleaning data

In [92]:
#taking out rows where pct_change value is 0
df = df[df['pct_change']!=0]
#taking out rows where time_delta is neg and >30 
df = df[(df['time_delta']>0) & (df['time_delta']<30)]


In [93]:
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845
0,AMD,"August 28, 2017","BRIEF-Advanced Micro Devices, ...",23,12.346691
0,AMD,"February 3, 2020","COLUMN-With virus outbreak, Ch...",16,22.657228
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...,3,-1.460482
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices a...,23,21.481487


In [94]:
#getting a quick sentiment analysis from vader to compare to pct_change

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
sentiment = df.content.apply(analyzer.polarity_scores).tolist()

In [95]:
#getting pos/neg values of content sentiment
sent_parsed = []
for i in sentiment: 
    if i['pos'] > i['neg']: 
        sent_parsed += [i['pos']]
    else: 
        sent_parsed += [-i['neg']]


In [96]:
df['sentiment'] = sent_parsed 
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change,sentiment
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845,0.103
0,AMD,"August 28, 2017","BRIEF-Advanced Micro Devices, ...",23,12.346691,0.11
0,AMD,"February 3, 2020","COLUMN-With virus outbreak, Ch...",16,22.657228,0.112
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...,3,-1.460482,0.123
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices a...,23,21.481487,0.052


In [97]:
#selecting rows where pct_change and sentiment both have + or - values
df = df[(df['sentiment']>0) & (df['pct_change']>0) | (df['sentiment']<0) & (df['pct_change']<0)]
print(df.describe())
print(df.head().T)

       time_delta  pct_change   sentiment
count  687.000000  687.000000  687.000000
mean    17.362445   11.960637    0.072076
std      9.735028   12.870122    0.037975
min      1.000000   -7.459492   -0.136000
25%      7.000000    4.390251    0.056000
50%     20.000000    9.076299    0.072000
75%     27.000000   15.568348    0.095000
max     29.000000   80.245453    0.199000
                                                            0  \
ticker                                                    AMD   
date                                           March 26, 2020   
content                     BRIEF-Advanced Micro Devices -...   
time_delta                                                 25   
pct_change                                            19.9368   
sentiment                                               0.103   

                                                            0  \
ticker                                                    AMD   
date                                

In [98]:
df.to_csv('./data/tech_stock.csv')

In [100]:
print('useable data size: %s' %df.shape[0])

useable data size: 687


POTENTIAL ISSUES THAT MIGHT COME UP WITH DATASET
- UNEVEN TRAINING DATA: NOT ENOUGH NEGATIVE ARTICLES TO BALANCE THE POSITIVE
      

more tickers here http://eoddata.com/stocklist/NYSE/B.htm
