S&P500 stock data:
http://pages.swcp.com/stocks/


n = pd.read_csv('http://chart.finance.yahoo.com/table.csv?s=ACIU&a=7&b=9&c=1996&d=10&e=19&f=2016&g=d&ignore=.csv')

http://www.eoddata.com/download.aspx

<p>SEC new https url format:</p>
https://www.sec.gov/Archives/edgar/data/320193/0000000000-11-005950.txt

In [1]:
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table
from sqlalchemy import create_engine, inspect
from sqlalchemy import MetaData
from edgerdb import helper_functions as hlp
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import requests
from yahoo_finance import Share
from pprint import pprint


meta = MetaData()

engine = create_engine('postgresql://analyst:@localhost:5432/edgar')

messages = Table('filings', meta, autoload=True, autoload_with=engine)

def sql_to_df(query, connection, *args, **kwargs):
    df = pd.read_sql(query, connection, *args, **kwargs)
    connection.close()
    return df

In [2]:
%matplotlib inline

<h2>Load table containing cik_symbol matches from database</h2>

In [3]:
cik_and_tickers = sql_to_df('select * from temp_cik_to_ticker;', engine.connect()).set_index(keys='index')

In [4]:
cik_and_tickers.head()

Unnamed: 0_level_0,cik,company_name,exchange,isactive,symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1591890,"1347 PROPERTY INSURANCE HOLDINGS, INC.",NASDAQ,1,PIH
1,1508475,"21VIANET GROUP, INC.",NASDAQ,1,VNET
2,1459417,"2U, INC.",NASDAQ,1,TWOU
3,1295484,"51JOB, INC.",NASDAQ,1,JOBS
4,1295484,"51JOB, INC.",NASDAQ,1,JOBS


<h2>The below cell is for storing functions</h2>

In [5]:
def in_both(first_li, second_li):
    both = []
    for x in second_li:
        if x in first_li:
            both.append(x)
    return both


def find_csv_files_in_dir(directory):
    paths = []
    for subdir, dirs, files in os.walk(directory):
        for file in files:
            filepath = subdir + os.sep + file
            if filepath.endswith(".csv"):
                paths.append(filepath)
    return paths


def concat_eod_data(list_of_paths):
    '''
        This functions takes as input a list of paths to csv files containg eod stock data and join the files
        into a single dataframe.
    '''
    df = pd.DataFrame(columns=['Symbol', 'Open', 'High', 'Low', 'Close', 'Volume'])
    for path in list_of_paths:
        temp_df = pd.read_csv(path, index_col='Date',
                             parse_dates=True)
        df = pd.concat([df, temp_df])
    return df


def insert(df, entries):
    """
        Takes a set of values and inserts them into a dataframe sequentially where the index is
        sequential and continuous.
    """
    try:
        df.loc[max(df.index) + 1] = entries
    except ValueError:
        df.loc[0] = entries
        
        
def create_spy_df(start_date, end_date, file=None):
    if file == None:
        spy = Share('SPY')
        spy_5_year = spy.get_historical(start_date, end_date)        
        spy_df = pd.DataFrame(columns=['Low', 'Date', 'Adj_Close', 'Volume', 'Close', 'Open', 'Symbol', 'High'])

        for val in range(len(spy_5_year)):
            values = []
            for x in spy_5_year[val]:
                values.append(spy_5_year[val][x])
            insert(spy_df, values)

        spy_df = spy_df.set_index(spy_df['Date'])
        spy_df = spy_df[['Symbol', 'Open', 'High', 'Low', 'Adj_Close', 'Volume']]
        spy_df = spy_df.rename(columns={'Adj_Close': 'Close'})
        return spy_df
    else:
        df = pd.read_csv(file)

<h2> Create spy_df from yahoo data </h2>

In [6]:
try:
    spy_df = pd.read_csv('data/spy2011-2016.csv', index_col='Date', parse_dates=True)
except:
    spy_df = create_spy_df('2011-01-01', '2016-11-03')
    spy_df.to_csv('data/spy2011-2016.csv')

<h2>Create five_year_nasdaq_df from directory of csv's</h2>

In [7]:
five_year_nasdaq_files = find_csv_files_in_dir('data/5_YEAR_NASDAQ')
five_year_nasdaq_df = concat_eod_data(five_year_nasdaq_files)
#five_year_nasdaq_df['Date'] = five_year_nasdaq_df['Date'].apply(lambda x: x.replace(x[3:6], months[x[3:6]]))
#five_year_nasdaq_df['Date'] = five_year_nasdaq_df['Date'].apply(lambda x: x.split('-')[2] + x.split('-')[1] + x.split('-')[0])


<h2>Add SPY data to five_year_nasdaq_df as a baseline</h2>

In [8]:
five_year_nasdaq_df = pd.concat([five_year_nasdaq_df, spy_df])
five_year_nasdaq_df.columns = map(str.lower, five_year_nasdaq_df.columns)
five_year_nasdaq_df[five_year_nasdaq_df['symbol'] == 'SPY'].head()


Unnamed: 0,symbol,open,high,low,close,volume
2016-11-03,SPY,209.990005,210.240005,208.460007,208.779999,88939300
2016-11-02,SPY,210.649994,211.100006,209.229996,209.740005,103330800
2016-11-01,SPY,212.929993,212.990005,209.600006,211.009995,122781800
2016-10-31,SPY,212.929993,213.190002,212.360001,212.550003,61272500
2016-10-28,SPY,213.139999,213.929993,211.710007,212.539993,140623200


<h2>Look for tickers in both cik_symbol list and nasdaq_df</h2>

<p>This will help to narrow down data to use when running analysis on the filings.</p>

In [10]:
tickers = cik_and_tickers['symbol'].tolist()
tickers_in_df = five_year_nasdaq_df['symbol'].unique().tolist()
tickers_in_both = in_both(tickers_in_df, tickers)
tickers_cik_in_both = cik_and_tickers[cik_and_tickers['symbol'].isin(tickers_in_both)]

<h2>Load filings table to a pandas dataframe</h2>

In [11]:
filings = sql_to_df('select * from filings where cik in {};'.format(tuple(tickers_cik_in_both['cik'])), engine.connect(), index_col='date')
#filings.index = filings.index.map(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8])

In [12]:
len(filings)

1307357

<h2>Create dataframe of both companies in both sets of data</h2>

<p>This will need to be corrected because the data is useless without a date.</p>

In [13]:
tickers_cik = pd.DataFrame(tickers_cik_in_both, columns=['cik', 'symbol'])
symbol_to_filings = pd.merge(left=filings, right=tickers_cik[tickers_cik['cik'].isin(
            tickers_cik['cik'].unique())], left_on='cik', right_on='cik')
#symbol_to_filings = pd.DataFrame(symbol_to_filings, columns=['cik', 'symbol', 'company_name', 'type', 'date', 'path'])

In [14]:
symbol_to_filings.head()

Unnamed: 0,cik,company_name,type,path,symbol
0,2034,ACETO CORP,10-K,edgar/data/2034/0000002034-95-000005.txt,ACET
1,2034,ACETO CORP,10-K,edgar/data/2034/0000002034-96-000004.txt,ACET
2,2034,ACETO CORP,10-K,edgar/data/2034/0000002034-97-000005.txt,ACET
3,2034,ACETO CORP,10-K,edgar/data/2034/0000002034-98-000003.txt,ACET
4,2034,ACETO CORP,10-K,edgar/data/2034/0000002034-99-000008.txt,ACET


<h2>Example of date splicing</h2>

In [15]:
dates = pd.date_range('1994-01-22', '1994-04-26')
df = pd.DataFrame(index=dates)
temp_df = pd.DataFrame(filings)
df = df.join(temp_df)
df = df.dropna()
df.head()

Unnamed: 0,cik,company_name,type,path
1994-01-24,813828,VIACOM INC,SC 14D1/A,edgar/data/813828/0000950112-94-000152.txt
1994-01-24,813828,VIACOM INC,SC 14D1/A,edgar/data/813828/0000950112-94-000159.txt
1994-01-25,106040,WESTERN DIGITAL CORP,10-Q,edgar/data/106040/0000892569-94-000025.txt
1994-01-25,750577,HANCOCK HOLDING CO,PRE 14A,edgar/data/750577/0000950129-94-000042.txt
1994-01-25,813828,VIACOM INC,SC 14D1/A,edgar/data/813828/0000950112-94-000163.txt


In [20]:
five_year_nasdaq_df[five_year_nasdaq_df['symbol'] == 'AAPL']

Unnamed: 0,symbol,open,high,low,close,volume
2011-01-03,AAPL,46.5200,47.1800,46.4057,47.0814,15897200
2011-01-04,AAPL,47.4914,47.5000,46.8786,47.3271,11048100
2011-01-05,AAPL,47.0786,47.7629,47.0714,47.7143,9125500
2011-01-06,AAPL,47.8157,47.8929,47.5571,47.6757,10729500
2011-01-07,AAPL,47.7129,48.0500,47.4143,48.0171,11140300
2011-01-10,AAPL,48.3971,49.0329,48.1671,48.9214,16019900
2011-01-11,AAPL,49.2686,49.2800,48.4957,48.8057,15859900
2011-01-12,AAPL,49.0357,49.2043,48.8571,49.2029,10806300
2011-01-13,AAPL,49.3086,49.5200,49.1214,49.3829,10648000
2011-01-14,AAPL,49.4129,49.7829,49.2057,49.7829,11029900


<h2>Example analysis with AAPL</h2>

In [21]:
dates = pd.date_range('2011-01-02', '2011-01-26')
aapl_stock_price = pd.DataFrame(index=dates)
temp_df = pd.DataFrame(five_year_nasdaq_df[five_year_nasdaq_df['symbol'] =="AAPL"], columns=['close'])
aapl_stock_price = aapl_stock_price.join(temp_df)
aapl_stock_price = aapl_stock_price.dropna()
aapl_stock_price.head()

Unnamed: 0,close
2011-01-03,47.0814
2011-01-04,47.3271
2011-01-05,47.7143
2011-01-06,47.6757
2011-01-07,48.0171


In [22]:
aapl_cik = cik_and_tickers[cik_and_tickers['symbol'] == 'AAPL']
print(aapl_cik)

          cik company_name exchange  isactive symbol
index                                               
6580   320193    APPLE INC   NASDAQ         1   AAPL


In [23]:
dates = pd.date_range('2011-01-02', '2016-11-03')
aapl_filings = pd.DataFrame(index=dates)
temp_df = pd.DataFrame(filings[filings['cik'] =="320193"])
aapl_filings = aapl_filings.join(temp_df)
aapl_filings = aapl_filings.dropna()
aapl_filings.head()

Unnamed: 0,cik,company_name,type,path
2011-01-07,320193,APPLE INC,DEF 14A,edgar/data/320193/0001193125-11-003231.txt
2011-01-07,320193,APPLE INC,DEFA14A,edgar/data/320193/0001193125-11-003232.txt
2011-01-12,320193,APPLE INC,4,edgar/data/320193/0001181431-11-003352.txt
2011-01-18,320193,APPLE INC,8-K,edgar/data/320193/0001181431-11-003847.txt
2011-01-18,320193,APPLE INC,8-K,edgar/data/320193/0001193125-11-009392.txt


In [None]:
#dates = pd.date_range('2011-01-02', '2011-01-26')
#aapl_stock_price = pd.DataFrame(index=dates)
#temp_df = pd.DataFrame(five_year_nasdaq_df[five_year_nasdaq_df['Symbol'] =="AAPL"], columns=['Close'])
#aapl_stock_price = aapl_stock_price.join(temp_df)
#aapl_stock_price = aapl_stock_price.dropna()

def stock_data_filings(startdate, enddate, symbol, cik_symbol_match_df, stock_data_df):
    """
    Returns two dataframes. The first contains the eod stock price and the other
    the filings for the given date period.
    """
    dates = pd.date_range(startdate, enddate)
    try:
        cik = cik_symbol_match_df[cik_symbol_match_df['symbol'] == symbol]['cik'].values[0]
        stock_price = pd.DataFrame(index=dates)
        temp_df = pd.DataFrame(stock_data_df[stock_data_df['symbol'] == symbol], columns=['close'])
        stock_price = stock_price.join(temp_df)
        stock_price = stock_price.dropna()
        this_filings = pd.DataFrame(index=dates)
        temp_df2 = pd.DataFrame(filings[filings['cik'] == cik])
        this_filings = this_filings.join(temp_df2)
        this_filings = this_filings.dropna()
        return stock_price, this_filings
    except IndexError:
        return 'Symbol not located in dataset.'
    


In [None]:
aapl_price, aapl_filings = stock_data_filings('2011-01-02', '2016-11-03', 'AAPL', cik_and_tickers, five_year_nasdaq_df)

In [None]:
stock_price_on_day_of_filing = aapl_filings.join(aapl_price)


In [None]:
# Add previous close and next day close to the dataframe
stock_price_on_day_of_filing.head()

<h2>AAPL Rolling Mean</h2>

In [None]:
def get_data(symbols, dates, eod_df):
    """Read stock data (adjusted close) for given symbols from CSV files."""
    df = pd.DataFrame(index=dates)
    count = 0
    for symbol in symbols:
        # TODO: Read and join data for each symbol
        temp_df = pd.DataFrame(eod_df[eod_df['Symbol'] == symbol], columns=['Close'])
        temp_df = temp_df.rename(columns={'Close': symbol})
        df = df.join(temp_df)
        df = df.dropna()
    return df


def plot_data(df, title='Stock Prices', ylabel="Price", xlabel="Date"):
    """Plot stock prices with a custom title and meaningful axis labels."""
    ax = df.plot(title=title, fontsize=12)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    plt.show()
    
    
def test_run():
    # Read data
    dates = pd.date_range('2011-01-01', '2016-12-31')
    symbols = ['AAPL']
    df = get_data(symbols, dates, five_year_nasdaq_df)
    
    # Plot SPY data, retain matplotlib axis object
    ax = df['AAPL'].plot(title='2012 Apple rolling mean', label='AAPL')
    
    # Compute the rolling mean using a 20 day window
    rm_AAPL = pd.rolling_mean(df['AAPL'], window=20)
    
    # Add rolling mean to same plot
    rm_AAPL.plot(label='Rolling mean', ax=ax)
    
    # Add axis labels and legend
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend(loc='upper left')
    plt.show()
    
    
    
if __name__ == "__main__":
    test_run()

In [None]:
dates = pd.date_range('2011-01-22', '2011-04-26')
get_data(['AAPL', 'ACET'], dates, five_year_nasdaq_df).head()

<h2>Goal:</h2>

<p>
The goal is to find out which forms affect stock price.  My idea is to to look at the price the day before and day of a form being filed.  To do this start with one stock, make it work then switch to many stocks.  To meausre the change in price to the the mean error (standard deviation).
</p>

In [None]:
first_day = 70
second_day = 56
change = pow(second_day - first_day, 2)


In [None]:
mean = five_year_nasdaq_df[five_year_nasdaq_df['Symbol'] =='AAME']['Close'].mean()
std = five_year_nasdaq_df[five_year_nasdaq_df['Symbol'] =='AAME']['Close'].std()

In [None]:
five_year_nasdaq_df[five_year_nasdaq_df['Symbol'] =='AAME']['Close'][0:10]