#### STOCK PRICE PREDICTION: DATA WRANGLING

In [22]:
import os
from bs4 import BeautifulSoup, SoupStrainer
import requests
import re
import pandas as pd
import httplib2
import numpy as np
from selenium import webdriver      
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import yfinance as yf
from datetime import datetime
from datetime import timedelta

In [5]:
#TICKERS IN TECH INDUSTRY
tickers = ['AMD', 'NVDA', 'MSFT','ADBE','AAPL','TSLA'] #TECH
ticker_names = ['Advanced Micro Devices', 'Nvidia', 'Microsoft','Adobe','Apple','Tesla'] #TECH

In [7]:
#FINDING ARTICLES RELATED TO TICKER

#initializing variables and driver
article_links = []
driver = webdriver.Chrome('./chromedriver')

#loop through ticker_names
for i in range(len(ticker_names)): 
    site = 'https://in.reuters.com/search/news?blob=' + ticker_names[i]
    driver.get(site)
    
    #click 'load more' button 9x 
    for _ in range(9): 
        try:
            loadMoreButton = driver.find_element_by_xpath('//*[@id="content"]/section[2]/div/div[1]/div[4]/div/div[4]/div[1]')
            time.sleep(2) 
            loadMoreButton.click()
            time.sleep(4)
        except Exception as e:
            print( e)
            break
    #find elements 
    elems = [elem.get_attribute("href") for elem in driver.find_elements_by_xpath("//a[@href]")]
    
    #append article_links if article in link 
    articles = [[tickers[i],x] for x in elems if 'article' in x ]
    article_links += articles
    
driver.quit()

In [8]:
len(article_links)

872

In [None]:
'''
CREATING DF WITH TICKER, DATE OF POST, CONTENT
'''
df = pd.DataFrame(columns = ['ticker','date','content'])
for ticker , link in article_links: 
    req = requests.get(link)
    souper = BeautifulSoup(req.text,'html.parser')

    #post date
    mydivs = souper.find_all('div', class_="ArticleHeader_date")
    DOP = str(mydivs[0]).split('>')[1].split('/')[0][:-1]

    #getting content 
    content = souper.get_text().replace('\n', '')

    #adding to df
    data = [[ticker, DOP, content]]

    df2 = pd.DataFrame(data ,columns = ['ticker','date','content'])

    df = df.append(df2)


In [11]:
# df = df.set_index('date').drop_duplicates()
df.head()
# df.describe()

Unnamed: 0,ticker,date,content
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...
0,AMD,"August 28, 2017","BRIEF-Advanced Micro Devices, ..."
0,AMD,"February 3, 2020","COLUMN-With virus outbreak, Ch..."
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices a...


In [19]:
df.date

0      March 26, 2020
0     August 28, 2017
0    February 3, 2020
0      March 13, 2018
0        July 3, 2017
           ...       
0    January 13, 2020
0       March 5, 2020
0       March 5, 2020
0    January 14, 2020
0    January 29, 2020
Name: date, Length: 385, dtype: object

In [78]:
from contextlib import contextmanager
import sys, os

@contextmanager
def suppress_stdout():
    with open(os.devnull, "w") as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:  
            yield
        finally:
            sys.stdout = old_stdout

In [26]:
'''
get highest price delta within a 30 day period
'''

def loss_gain(ticker, date, end):
    
    
    day = yf.download(ticker , date , date + timedelta(end))
    df_close = pd.DataFrame(day.Close)
    
    cmax = df_close.iloc[1:].max().values
    cmax = df_close[df_close['Close']==cmax[0]] #max value 
    cmax_date = cmax.index #date of max value 
    first_price = df_close.iloc[0].values
    pct_change = 100*((cmax.values - first_price)/first_price)
    
    tdelta = (cmax_date - date).days.values[0]
    
    return [[pct_change[0][0], tdelta]]

change = []
dates = df.date.values

for i in range(len(dates)):  #change [:2]
    with suppress_stdout(): 
        try: 
            ticker = df.iloc[i].ticker
            #getting date and reformatting to fit into yfinance download function. 

            date = datetime.strptime(dates[i], '%B %d, %Y').strftime('%Y-%m-%d')
            date = datetime.strptime(date, '%Y-%m-%d' )


            #finding the time delta from post date to now.
            end = abs(int(str(date - datetime.now()).split(',')[0].replace('days','')))

            #getting highest percent change within a 30 day forecast. If post does not have 30 days future it will get the max future data
            if end > 30: 
                end = 30
                print('ticker: %s' %ticker,'date: %s' %date, 'timedelta: %s' %end)
                change += loss_gain(ticker, date, end)
            else: 
                print(ticker,date,end)
                change += loss_gain(ticker, date, end)

        except:
            print('error')
            change += [[0,0]] #substituting for error raised, row will be dropped at the end. 


ticker: AMD date: 2020-03-26 00:00:00 timedelta: 30
[*********************100%***********************]  1 of 1 completed
ticker: AMD date: 2017-08-28 00:00:00 timedelta: 30
error
ticker: AMD date: 2020-02-03 00:00:00 timedelta: 30
error
ticker: AMD date: 2018-03-13 00:00:00 timedelta: 30
[*********************100%***********************]  1 of 1 completed
ticker: AMD date: 2017-07-03 00:00:00 timedelta: 30
error
ticker: AMD date: 2018-03-06 00:00:00 timedelta: 30
[*********************100%***********************]  1 of 1 completed
ticker: AMD date: 2017-03-08 00:00:00 timedelta: 30
error
ticker: AMD date: 2017-11-01 00:00:00 timedelta: 30
[*********************100%***********************]  1 of 1 completed
ticker: AMD date: 2017-07-25 00:00:00 timedelta: 30
error
ticker: AMD date: 2018-04-25 00:00:00 timedelta: 30
[*********************100%***********************]  1 of 1 completed
ticker: AMD date: 2017-03-21 00:00:00 timedelta: 30
[*********************100%***********************]  1

In [27]:
df_delta = pd.DataFrame(change, columns=['pct_change', 'time_delta'])
df_delta.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,375,376,377,378,379,380,381,382,383,384
pct_change,19.936845,0.0,0.0,12.346691,0.0,-1.460482,0.0,21.481487,0.0,10.741301,...,-3.161263,22.042539,9.163146,2.914769,2.914769,15.568348,-2.826726,-2.826726,16.388078,12.294694
time_delta,25.0,0.0,0.0,-174.0,0.0,10.0,0.0,-98.0,0.0,-387.0,...,1.0,23.0,13.0,2.0,2.0,28.0,1.0,1.0,27.0,12.0


In [34]:
#adding time_delta and pct_change to df
df['time_delta'] = df_delta['time_delta'].tolist()
df['pct_change'] = df_delta['pct_change'].tolist()


ValueError: Length of values does not match length of index

In [35]:
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845
0,AMD,"March 13, 2018",BRIEF-Advanced Micro Devices C...,-174,12.346691
0,AMD,"March 6, 2018",BRIEF-Advanced Micro Devices S...,10,-1.460482
0,AMD,"November 1, 2017",BRIEF-Advanced Micro Devices I...,-98,21.481487
0,AMD,"April 25, 2018",BRIEF-Advanced Micro Devices R...,-387,10.741301


#### Cleaning data

In [48]:
#taking out rows where pct_change value is 0
df = df[df['pct_change']!=0]
#taking out rows where time_delta is neg and >30 
df = df[(df['time_delta']>0) & (df['time_delta']<30)]


In [49]:
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845
0,AMD,"March 6, 2018",BRIEF-Advanced Micro Devices S...,10,-1.460482
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices s...,23,4.606666
0,AMD,"April 23, 2020",Special Report: As virus advan...,4,1.055456
0,AMD,"April 23, 2020",Special Report: As virus advan...,4,1.055456


In [50]:
#getting a quick sentiment analysis from vader to compare to pct_change

from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
sentiment = df.content.apply(analyzer.polarity_scores).tolist()

In [59]:
#getting pos/neg values of content sentiment
sent_parsed = []
for i in sentiment: 
    if i['pos'] > i['neg']: 
        sent_parsed += [i['pos']]
    else: 
        sent_parsed += [-i['neg']]


In [60]:
df['sentiment'] = sent_parsed 
df.head()

Unnamed: 0,ticker,date,content,time_delta,pct_change,sentiment
0,AMD,"March 26, 2020",BRIEF-Advanced Micro Devices -...,25,19.936845,0.103
0,AMD,"March 6, 2018",BRIEF-Advanced Micro Devices S...,10,-1.460482,0.065
0,AMD,"July 3, 2017",BRIEF-Advanced Micro Devices s...,23,4.606666,0.097
0,AMD,"April 23, 2020",Special Report: As virus advan...,4,1.055456,-0.1
0,AMD,"April 23, 2020",Special Report: As virus advan...,4,1.055456,-0.1


In [75]:
#selecting rows where pct_change and sentiment both have + or - values
df = df[(df['sentiment']>0) & (df['pct_change']>0) | (df['sentiment']<0) & (df['pct_change']<0)]
print(df.describe())
print(df.head().T)

       time_delta  pct_change   sentiment
count  238.000000  238.000000  238.000000
mean    17.231092   10.195527    0.074782
std      9.593046    7.330073    0.034268
min      1.000000   -3.183001   -0.086000
25%      7.000000    4.390251    0.057250
50%     20.000000    9.076299    0.076000
75%     27.000000   15.469290    0.096250
max     29.000000   45.977015    0.170000
                                                            0  \
ticker                                                    AMD   
date                                           March 26, 2020   
content                     BRIEF-Advanced Micro Devices -...   
time_delta                                                 25   
pct_change                                            19.9368   
sentiment                                               0.103   

                                                            0  \
ticker                                                    AMD   
date                                

In [76]:
df.to_csv('./data/tech_stock.csv')

In [77]:
df.shape

(238, 6)

more tickers here http://eoddata.com/stocklist/NYSE/B.htm
