In [None]:
# To check the current working directory
import os
os.getcwd()

In [None]:
# To change and check the current working directory
# NOTE(review): this is a machine-specific absolute path. Every relative path used
# by the cells below (sp500tickers.pickle, stock_dfs/, sp500_joined_closes.csv)
# resolves against it, so the notebook only runs unchanged on a machine that has
# this exact directory.
os.chdir('/Users/wimal/Documents/DataScience/UNC_Bootcamp/Project/')
os.getcwd()

In [None]:
# Import our dependencies
import pandas as pd
import numpy as np
import csv
import re
import pandas_datareader.data as pdr
import datetime as dt
from sqlalchemy import create_engine
import psycopg2

from config import db_password
import requests
import pickle
from bs4 import BeautifulSoup as bs

import matplotlib.pyplot as plt
%matplotlib inline
import time
from collections import Counter

In [None]:
import bs4 as bs
import datetime as dt
import os
from pandas_datareader import data as pdr
import pickle
import requests
import fix_yahoo_finance as yf
from collections import Counter


# Patch pandas_datareader so that pdr.get_data_yahoo is served by fix_yahoo_finance.
# pdr_override() accepts no arguments; passing one raises TypeError.
yf.pdr_override()

def save_sp500_tickers():
    """Scrape the S&P 500 ticker symbols from Wikipedia and cache them on disk.

    The symbol is taken from the first cell of every data row of the first
    ``wikitable sortable`` table on the page. Surrounding whitespace is
    stripped and dots are replaced with dashes (e.g. ``BRK.B`` -> ``BRK-B``).
    The resulting list is pickled to ``sp500tickers.pickle`` in the current
    working directory.

    Returns:
        list[str]: the ticker symbols in page order.

    Raises:
        requests.HTTPError: if Wikipedia answers with an error status.
        ValueError: if the constituents table cannot be found in the page.
    """
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies',
                        timeout=30)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    if table is None:
        raise ValueError('S&P 500 constituents table not found on the Wikipedia page')

    tickers = []
    # The first <tr> is the header row, so it is skipped.
    for row in table.find_all('tr')[1:]:
        cells = row.find_all('td')
        if not cells:
            # Rows made only of <th> cells carry no ticker.
            continue
        # strip() removes the trailing newline without chopping a real
        # character when the cell text has no trailing newline.
        tickers.append(cells[0].text.strip().replace('.', '-'))

    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    return tickers


# save_sp500_tickers()
def get_data_from_yahoo(reload_sp500=False):
    """Download daily price history for every S&P 500 ticker into ``stock_dfs/``.

    One CSV per ticker is written as ``stock_dfs/<ticker>.csv``, indexed by
    ``Date`` and covering 2020-01-01 up to the moment of the call. Tickers
    that already have a CSV are not downloaded again.

    Args:
        reload_sp500: when True, re-scrape the ticker list with
            ``save_sp500_tickers()``; otherwise load the cached
            ``sp500tickers.pickle``.

    Returns:
        None. Progress, and any ticker that could not be downloaded, is
        reported on stdout.
    """
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        # NOTE: pickle.load can execute arbitrary code; only load a pickle
        # that this notebook wrote itself.
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)

    os.makedirs('stock_dfs', exist_ok=True)
    start = dt.datetime(2020, 1, 1)
    end = dt.datetime.now()
    skipped = []

    for ticker in tickers:
        print(ticker)
        csv_path = 'stock_dfs/{}.csv'.format(ticker)
        if os.path.exists(csv_path):
            print('Already have {}'.format(ticker))
            continue
        try:
            df = pdr.get_data_yahoo(ticker, start, end)
            # Round-trip through reset_index so the frame is indexed by its
            # 'Date' column; a frame without one raises KeyError below.
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv(csv_path)
        except KeyError as err:
            # Still best-effort (one bad ticker must not stop the run), but
            # the failure is reported instead of being silently dropped.
            skipped.append(ticker)
            print('Skipping {}: {!r}'.format(ticker, err))

    if skipped:
        print('No data saved for {} ticker(s): {}'.format(len(skipped), ', '.join(skipped)))

# Refresh the cached ticker list from Wikipedia, then download price history for
# every ticker that does not yet have a CSV under stock_dfs/.
save_sp500_tickers()
get_data_from_yahoo()

In [None]:
def compile_data():
    """Join every ticker's adjusted close into one wide table and save it.

    Reads the ticker list from ``sp500tickers.pickle`` and, for each ticker,
    ``stock_dfs/<ticker>.csv``. Only the ``Adj Close`` column is kept, renamed
    to the ticker symbol, and the per-ticker frames are outer-joined on
    ``Date``. The result is written to ``sp500_joined_closes.csv``.

    Tickers without a CSV (for example ones whose download was skipped) are
    reported and left out instead of aborting the whole run.

    Returns:
        None. The first rows of the joined table are printed.
    """
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        csv_path = 'stock_dfs/{}.csv'.format(ticker)
        if os.path.exists(csv_path):
            df = pd.read_csv(csv_path, index_col='Date')
            df = df.rename(columns={'Adj Close': ticker})
            # Keyword form: the positional axis argument was removed in pandas 2.0.
            df = df.drop(columns=['Open', 'High', 'Low', 'Close', 'Volume'])

            if main_df.empty:
                main_df = df
            else:
                main_df = main_df.join(df, how='outer')
        else:
            print('No CSV for {}, skipping'.format(ticker))

        # Progress marker every tenth ticker.
        if count % 10 == 0:
            print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')


compile_data()

In [None]:
def visualize_data():
    """Plot a heatmap of the pairwise correlations between ticker columns.

    Loads ``sp500_joined_closes.csv``, computes the correlation matrix of its
    columns, saves that matrix to ``sp500corr.csv`` and draws it as a
    red-yellow-green heatmap with the colour scale fixed to [-1, 1].

    Returns:
        None. The first rows of the matrix are printed and the figure is shown.
    """
    # The first column holds the dates; load it as the index so that only the
    # numeric price columns reach corr(). pandas >= 2.0 raises on string columns.
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    df_corr = df.corr()
    print(df_corr.head())
    df_corr.to_csv('sp500corr.csv')
    data1 = df_corr.values
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)

    heatmap1 = ax1.pcolor(data1, cmap=plt.cm.RdYlGn)
    fig1.colorbar(heatmap1)

    # Put one tick at the centre of every cell.
    ax1.set_xticks(np.arange(data1.shape[1]) + 0.5, minor=False)
    ax1.set_yticks(np.arange(data1.shape[0]) + 0.5, minor=False)
    # Lay the matrix out like a table: first row on top, column labels above.
    ax1.invert_yaxis()
    ax1.xaxis.tick_top()
    column_labels = df_corr.columns
    row_labels = df_corr.index
    ax1.set_xticklabels(column_labels)
    ax1.set_yticklabels(row_labels)
    plt.xticks(rotation=90)
    heatmap1.set_clim(-1, 1)
    plt.tight_layout()
    plt.show()


visualize_data()


In [None]:
def process_data_for_labels(ticker, hm_days=7):
    """Add forward percentage-change columns for one ticker.

    Loads ``sp500_joined_closes.csv`` and, for every horizon ``i`` from 1 to
    ``hm_days``, adds a column ``'<ticker>_<i>d'`` holding
    ``(price[t + i] - price[t]) / price[t]``.

    Missing prices are filled with 0 before the computation, so a row with a
    zero price produces ``inf``/``nan`` changes. The ``nan`` values (including
    the last ``i`` rows of each horizon, which have no future price) are
    filled with 0 afterwards; ``inf`` values are left in place for the caller.

    Args:
        ticker: column name of the ticker to build the horizons for.
        hm_days: number of forward-looking days to generate (default 7).

    Returns:
        tuple: ``(tickers, df)`` where ``tickers`` is the list of original
        column names and ``df`` is the frame with the added horizon columns.

    Raises:
        KeyError: if ``ticker`` is not a column of the CSV.
    """
    df = pd.read_csv('sp500_joined_closes.csv', index_col=0)
    # Capture the column list before the horizon columns are appended.
    tickers = df.columns.values.tolist()
    df = df.fillna(0)

    for i in range(1, hm_days + 1):
        # shift(-i) lines the price i rows ahead up with the current row.
        df['{}_{}d'.format(ticker, i)] = (df[ticker].shift(-i) - df[ticker]) / df[ticker]

    df = df.fillna(0)
    return tickers, df

In [None]:
def buy_sell_hold(*args):
    """Turn a sequence of percentage changes into a buy/sell/hold signal.

    The values are scanned in the order given. The first one that is more
    than 2% away from zero decides the outcome: 1 (buy) if it is above +0.02,
    -1 (sell) if it is below -0.02. If no value crosses either bound,
    including when no values are given, the result is 0 (hold).
    """
    threshold = 0.02
    # Lazily yields a signal for each value outside the +/- threshold band;
    # only the first one is ever consumed.
    decisive = (
        1 if move > threshold else -1
        for move in args
        if move > threshold or move < -threshold
    )
    return next(decisive, 0)

In [None]:
def extract_featuresets(ticker):
    """Build the feature matrix and target labels for one ticker.

    The label of each row comes from ``buy_sell_hold`` applied to that row's
    1- to 7-day forward changes of ``ticker``. The features are the
    row-over-row percentage changes of every ticker column, with infinities
    and gaps replaced by 0.

    Returns:
        tuple: ``(X, y, df)``, the feature array, the label array and the
        labelled frame (rows containing an infinite value are dropped).
    """
    tickers, df = process_data_for_labels(ticker)

    target_col = '{}_target'.format(ticker)
    horizon_cols = ['{}_{}d'.format(ticker, day) for day in range(1, 8)]

    # One label per row, computed from that row's seven horizon values.
    df[target_col] = [
        buy_sell_hold(*moves)
        for moves in zip(*(df[col] for col in horizon_cols))
    ]

    label_counts = Counter(str(label) for label in df[target_col].values.tolist())
    print('Data spread:', label_counts)

    # Zero-fill gaps, then discard any row that still holds an infinite value.
    df = df.fillna(0).replace([np.inf, -np.inf], np.nan).dropna()

    feature_changes = (
        df[list(tickers)]
        .pct_change()
        .replace([np.inf, -np.inf], 0)
        .fillna(0)
    )

    X = feature_changes.values
    y = df[target_col].values
    return X, y, df


## Machine learning for "BUYING/ SELLING"

def do_ml(ticker, random_state=None):
    """Train and score a voting ensemble that predicts buy/sell/hold for a ticker.

    The features and labels come from ``extract_featuresets(ticker)``. 25% of
    the rows are held out for testing, and a ``VotingClassifier`` combining a
    linear SVC, k-nearest-neighbours and a random forest is fitted on the rest.

    Args:
        ticker: ticker symbol to model.
        random_state: optional seed forwarded to the train/test split and to
            the estimators that accept one. The default ``None`` keeps the
            original unseeded behaviour; pass an int for a reproducible run.

    Returns:
        float: accuracy of the ensemble on the held-out rows.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import LinearSVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.ensemble import RandomForestClassifier, VotingClassifier
    X, y, df = extract_featuresets(ticker)

    # NOTE(review): train_test_split shuffles rows by default, so the test set
    # is interleaved in time with the training set.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=random_state)

    clf = VotingClassifier([('lsvc', LinearSVC(random_state=random_state)),
                            ('knn', KNeighborsClassifier()),
                            ('rfor', RandomForestClassifier(random_state=random_state))])
    clf.fit(X_train, y_train)
    confidence = clf.score(X_test, y_test)
    print('accuracy:', confidence)
    predictions = clf.predict(X_test)
    print('predicted class counts:', Counter(predictions))
    print()
    print()
    return confidence


# examples of running:
do_ml('XOM')
do_ml('AAPL')
do_ml('ABT')

#We can also run this against all tickers:

from statistics import mean

# Reload the ticker list cached in sp500tickers.pickle.
with open("sp500tickers.pickle","rb") as f:
    tickers = pickle.load(f)

# Fit and score one model per ticker, keeping every accuracy so the running
# mean can be reported after each ticker.
accuracies = []
for count,ticker in enumerate(tickers):

    # Progress marker every tenth ticker.
    if count%10==0:
        print(count)

    accuracy = do_ml(ticker)
    accuracies.append(accuracy)
    print("{} accuracy: {}. Average accuracy:{}".format(ticker,accuracy,mean(accuracies)))

In [None]:
# Pfizer
do_ml('PFE')

In [None]:
# Gilead
do_ml('GILD')

In [None]:
do_ml("LH")

In [None]:
do_ml("CI")

In [None]:
do_ml("BIIB")

In [None]:
do_ml("ABT")

In [None]:
do_ml("UNH")

In [None]:
do_ml("JNJ")

In [None]:
do_ml("AMGN")

In [None]:
do_ml("BMY")

In [None]:
df = pd.read_csv('sp500_joined_closes.csv')

In [None]:
def process_file(f):
    """Join the module-level ``df`` with a per-ticker CSV and save the result.

    NOTE(review): this looks like an unfinished variant of ``compile_data``;
    the points flagged below should be confirmed before it is used.

    Args:
        f: never read. The ``with`` statement below rebinds the name to the
            opened pickle file.
    """
    # Rebinds `f`, discarding whatever the caller passed in.
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    for count, ticker in enumerate(tickers):
        df_temp = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        # `df` is not defined in this function, so it is looked up at module
        # scope. `main_df` is reassigned on every pass from that same `df`,
        # so only the join with the last ticker's file survives the loop.
        # NOTE(review): if `df` and `df_temp` share a column name, join() needs
        # lsuffix/rsuffix; verify against the actual CSV schemas.
        main_df = df.join(df_temp, how = "outer")
    # `main_df` is unbound here when `tickers` is empty.
    print(main_df.head())
    # Writes to the same file name that compile_data() writes.
    main_df.to_csv('sp500_joined_closes.csv')


compile_data()

In [None]:
# NOTE(review): in the cells visible here `main_df` is only ever assigned inside
# functions (as a local), so this lookup raises NameError on a fresh kernel
# unless it is defined elsewhere. TODO confirm.
main_df

In [None]:
# NOTE(review): neither `process_your_file` nor `files` is defined in any cell
# visible here (the function defined earlier is named `process_file`), so this
# cell raises NameError on a fresh kernel unless they exist elsewhere. TODO confirm.
frames = [ process_your_file(f) for f in files ]
result = pd.concat(frames)

In [None]:
# NOTE(review): `df_AAPL` is not assigned in any cell visible here; this raises
# NameError on a fresh kernel unless it is defined elsewhere. TODO confirm.
df_AAPL

In [None]:
import glob

In [None]:
# Collect every CSV file directly under the data directory.
# (The closing quote was missing from the path literal, which is a SyntaxError.)
path = '/apps/data_csv_files'
csv_files = glob.glob(path + "/*.csv")

In [None]:
# Read the files into a list rather than a generator: a generator is exhausted
# by the first pd.concat, so re-running the concat cell would fail with
# "No objects to concatenate".
df_list = [pd.read_csv(file) for file in csv_files]

In [None]:
big_df   = pd.concat(df_list, ignore_index=True)

In [None]:
# Stack every per-ticker CSV into one long frame.
# The per-ticker files are named <ticker>.csv and the code that consumes them
# earlier in this notebook only relies on Date/Open/High/Low/Close/Adj Close/Volume
# columns, so after concatenation a row could no longer be traced back to its
# ticker. Tag each frame with a 'Ticker' column taken from the file name (unless
# the file already has one); the cells that build the "Code" key rely on it.
# NOTE(review): machine-specific absolute path.
path = '/Users/wimal/Documents/DataScience/UNC_Bootcamp/Project/stock_dfs'
stock_frames = []
for stock_csv in glob.glob(path + "/*.csv"):
    stock_frame = pd.read_csv(stock_csv)
    if "Ticker" not in stock_frame.columns:
        stock_frame["Ticker"] = os.path.splitext(os.path.basename(stock_csv))[0]
    stock_frames.append(stock_frame)
df = pd.concat(stock_frames)

In [None]:
df

In [None]:
# Build the "<Ticker>_<Date>" key first, while 'Date' is still a regular column.
# Selecting df[["Ticker", "Date"]] after set_index('Date') raises KeyError,
# because the date values have moved into the index by then.
df["Code"] = df[["Ticker", "Date"]].apply(lambda x: "_".join(x), axis = 1)
df.set_index('Date', inplace=True)
df.rename(columns={'Adj Close': 'AdjClose'}, inplace=True)

In [None]:
df

In [None]:
# 'Date' is the index by this point (set_index('Date') ran in an earlier cell),
# so it cannot be selected as a column; build the key from the index instead.
df["Code"] = ["_".join(pair) for pair in zip(df["Ticker"], df.index)]