In [1]:
from sqlalchemy.orm import sessionmaker
from sqlalchemy import Table
from sqlalchemy import create_engine, inspect
from sqlalchemy import MetaData
from edgerdb import helper_functions as hlp
import pandas as pd
import numpy as np
from fuzzywuzzy import fuzz, process
import re
from requests import get

In [2]:
daily_files = hlp.generate_daily_file_paths()

last_date_in_db = int(hlp.latest_index_in_db('filings', hlp.db())[0])

hlp.load_latest_files(daily_files, last_date=last_date_in_db)

Latest File in DB: 20161122
edgar/daily-index/master.20161003.idx already loaded
edgar/daily-index/master.20161004.idx already loaded
edgar/daily-index/master.20161005.idx already loaded
edgar/daily-index/master.20161006.idx already loaded
edgar/daily-index/master.20161007.idx already loaded
edgar/daily-index/master.20161011.idx already loaded
edgar/daily-index/master.20161012.idx already loaded
edgar/daily-index/master.20161013.idx already loaded
edgar/daily-index/master.20161014.idx already loaded
edgar/daily-index/master.20161017.idx already loaded
edgar/daily-index/master.20161018.idx already loaded
edgar/daily-index/master.20161019.idx already loaded
edgar/daily-index/master.20161020.idx already loaded
edgar/daily-index/master.20161021.idx already loaded
edgar/daily-index/master.20161024.idx already loaded
edgar/daily-index/master.20161025.idx already loaded
edgar/daily-index/master.20161026.idx already loaded
edgar/daily-index/master.20161027.idx already loaded
edgar/daily-index/

In [5]:
meta = MetaData()

engine = create_engine('postgresql://analyst:@localhost:5432/edgar')

messages = Table('filings', meta, autoload=True, autoload_with=engine)

def sql_to_df(query, connection):
    df = pd.read_sql(query, connection)
    connection.close()
    return df


In [6]:
def create_csi_data_table(engine):
    '''
    Creates a table with the data from csidata.com
    '''
    conn = engine.connect()
    csi_stock_data = pd.read_csv('http://www.csidata.com/factsheets.php?type=stock&format=csv')
    csi_stock_data.columns = map(str.lower, csi_stock_data.columns)
    csi_stock_data = csi_stock_data.rename(columns={'name': 'company_name'})
    csi_stock_data['company_name'] = csi_stock_data['company_name'].str.upper()
    csi_stock_data['company_name'] = csi_stock_data['company_name'].str.strip()
    csi_stock_data = csi_stock_data.fillna(value="NONE")
    csi_stock_data.to_sql('csi_stock_data', engine)
    conn.close()
    
def create_table_with_cik_and_csi_data():
    '''
        Creates a table called cik_to_csi by joining filings and csi_stock_data tables on company_name
    '''
    join_ticker_to_cik = """select * into temp_cik_to_csi from (select fil.cik, fil.company_name,
                            csi.symbol, csi.exchange,
                            csi.isactive, csi.startdate, csi.enddate
                            from filings fil
                            join csi_stock_data csi on fil.company_name = csi.company_name) as foo;"""
    clean_ticker_to_cik_table = """select * into cik_to_csi
    from (select distinct cik, c.company_name, c.symbol, c.exchange, c.isactive, c.startdate, c.enddate  from temp_cik_to_csi c) as foo;"""
    hlp.clear_sessions('edgar', hlp.db())
    hlp.statement(join_ticker_to_cik, hlp.db(), output=False, commit=True)
    hlp.statement(clean_ticker_to_cik_table, hlp.db(), output=False, commit=True)
    hlp.statement("drop table temp_cik_to_csi;", hlp.db(), output=False, commit=True)
    


In [14]:
def tickers_to_cik(list_of_tickers):
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile('.*CIK=(\d{10}).*')

    cik_dict = {}
    for ticker in list_of_tickers:
        results = CIK_RE.findall(get(URL.format(ticker)).text)
        if len(results):
            cik_dict[str(ticker).upper()] = str(results[0]).lstrip('0')
        else:
            no_matches.append(str(ticker).upper())
    return cik_dict

def new_nasdaq_tickers_and_names(engine):
    """
        This function queries the cik_to_csi table and returns a dataframe of tickers and company names for companies
        not found in our data.
    """
    nasdaq_companies = pd.read_csv("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NASDAQ&render=download")
    amex_companies = pd.read_csv("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=AMEX&render=download")
    nyse_companies = pd.read_csv("http://www.nasdaq.com/screening/companies-by-industry.aspx?exchange=NYSE&render=download")
    nasdaq_name_and_ticker = pd.DataFrame(nasdaq_companies, columns=['Symbol', 'Name'])
    nasdaq_name_and_ticker['exchange'] = 'NASDAQ'
    amex_name_and_ticker = pd.DataFrame(amex_companies, columns=['Symbol', 'Name'])
    amex_name_and_ticker['exchange'] = 'AMEX'
    nyse_name_and_ticker = pd.DataFrame(nyse_companies, columns=['Symbol', 'Name'])
    nyse_name_and_ticker['exchange'] ='NYSE'
    companies_and_symbols = pd.concat([nasdaq_name_and_ticker, amex_name_and_ticker, nyse_name_and_ticker])
    companies_and_tickers = companies_and_symbols.rename(columns={'Symbol': 'symbol', "Name": "name"})
    companies_and_tickers['isactive'] = 1
    list_of_tickers_from_nasdaq = companies_and_tickers['symbol'].values.tolist()
    list_of_tickers_in_cik_to_csi_table = pd.read_sql_query('select distinct symbol from cik_to_csi;',con=engine)['symbol'].values.tolist()
    not_found_in_db = []
    for x in list_of_tickers_from_nasdaq:
        if x not in list_of_tickers_in_cik_to_csi_table and x not in not_found_in_db:
            not_found_in_db.append(x)
    return companies_and_tickers[companies_and_tickers['symbol'].isin(not_found_in_db)]

def return_matches(first_li, second_li):
    return set(first_li) & set(second_li)

def insert(df, entries):
    """
        Takes a set of values and inserts them into a dataframe sequentially where the index is
        sequential and continuous.
    """
    try:
        df.loc[max(df.index) + 1] = entries
    except ValueError:
        df.loc[0] = entries
        
def make_list_of_tickers(csi_dataframe, engine):
    tickers_from_nasdaq_site = new_nasdaq_tickers_and_names(engine)
    ticker_list1 = tickers_from_nasdaq_site['symbol'].values.tolist()
    ticker_list2 = csi_dataframe['symbol'].values.tolist()
    return list(set(ticker_list1 + ticker_list2))

In [3]:
# Create a filings dataframe from the edgar database filings table
filings_data = pd.read_sql_query('select * from filings;', con=engine)
filings_data = filings_data.rename(columns={'company': 'company_name'})


NameError: name 'pd' is not defined

In [8]:
# Create a csi_data datagrame from csidata.com
csi_stock_data = pd.read_csv('http://www.csidata.com/factsheets.php?type=stock&format=csv')
csi_data = pd.DataFrame(csi_stock_data, columns=['csinumber', 'symbol', 'company_name', 'exchange', 'isactive', 'startdate', 'enddate'])
csi_data = csi_data.dropna()

In [None]:
tickers_from_nasdaq_site = new_nasdaq_tickers_and_names(engine)
tickers_from_nasdaq_site = tickers_from_nasdaq_site.rename(columns={"name": "company_name"})
tickers_from_nasdaq_site['company_name'] = tickers_from_nasdaq_site['company_name'].str.upper()
matches_from_exchange_list= pd.merge(left=tickers_from_nasdaq_site, right=filings_df, right_on='company_name', left_on='company_name')
conn = engine.connect()
csi_to_cik_tick_matches = sql_to_df("select cik, company_name, symbol, exchange, isactive from cik_to_csi;",conn)
conn.close()
cik_to_ticker = pd.concat([matches_from_exchange_list, csi_to_cik_tick_matches])
print(len(cik_to_ticker))
cik_to_ticker.head()

In [13]:
import multiprocessing as mp
from requests import get
import json

ticker_list = make_list_of_tickers(csi_data, engine)


chunks = lambda lst, sz: [lst[i:i+sz] for i in range(0, len(lst), sz)]
        
def worker(inq,outq):
    list_of_tickers = inq.get()
    URL = 'http://www.sec.gov/cgi-bin/browse-edgar?CIK={}&Find=Search&owner=exclude&action=getcompany'
    CIK_RE = re.compile('.*CIK=(\d{10}).*')
    no_matches = []
    cik_dict = {}
    for ticker in list_of_tickers:
        results = CIK_RE.findall(get(URL.format(ticker), timeout=(3.05, 27)).text)
        if len(results):
            cik_dict[str(ticker).upper()] = str(results[0]).lstrip('0')
        else:
            no_matches.append(str(ticker).upper())
    cik_dict['no_matches'] = no_matches
    outq.put(cik_dict)

def make_ticker_cik_file(ticker_list, n_processes=3):
    processes = n_processes
    tick_list = ticker_list
    chunk_list = chunks(tick_list, int(len(tick_list) / processes))
    results_dict = {}
    no_matches = []
    
    for chu in chunk_list:
        inq = mp.Queue()
        outq = mp.Queue()
        p = mp.Process(target=worker, args=(inq,outq))
        p.start()
    
        inq.put(chu)

        # Wait for the worker to finish
        p.join()
        result = outq.get()
        for tic in result['no_matches']:
            no_matches.append(tic)
        result.pop('no_matches')
        results_dict.update(result)
        p.terminate()
    with open('data/ticker_to_cik.json', 'w') as f:
        f.write(json.dumps(results_dict))
        f.close()
    with open('data/no_cik_matches.json', 'w') as f:
        f.write(json.dumps({'no_matches': no_matches}))
        f.close()

In [10]:
ticker_list = make_list_of_tickers(csi_data, engine)


In [12]:
%%time
make_ticker_cik_file(ticker_list, n_processes=1000)


Process Process-588:
Traceback (most recent call last):
  File "/Users/lancerogers/Developer/Python/algo_analysis/.direnv/python-3.5.1/lib/python3.5/site-packages/requests/packages/urllib3/connection.py", line 142, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "/Users/lancerogers/Developer/Python/algo_analysis/.direnv/python-3.5.1/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py", line 98, in create_connection
    raise err
  File "/Users/lancerogers/Developer/Python/algo_analysis/.direnv/python-3.5.1/lib/python3.5/site-packages/requests/packages/urllib3/util/connection.py", line 88, in create_connection
    sock.connect(sa)
OSError: [Errno 65] No route to host

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/lancerogers/Developer/Python/algo_analysis/.direnv/python-3.5.1/lib/python3.5/site-packages/requests/packages/urllib3/connectionpool.py", line 595, in urlopen

KeyboardInterrupt: 