In [1]:
import pandas as pd

from pathlib import Path
import csv

import os
import requests
import json

import quandl

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
## Global date settings shared by the data-fetching helpers

## First and last day (YYYY-MM-DD) passed to the Quandl and FMP requests
start_date, end_date = '2016-01-01', '2021-10-29'
## Default range string for the IEX functions - nothing longer is needed at the moment
default_date_range = '71m'

In [3]:
## Quandl / Nasdaq settings
## The key is read from the environment and is None when NASDAQ_API_KEY is unset
nsdq_api_key = os.getenv('NASDAQ_API_KEY')
## Base address of the FINRA datasets
base_url_nsdq = 'https://data.nasdaq.com/api/v3/datasets/FINRA/'

In [4]:
## Fetch the FINRA short-volume tables for one ticker from Quandl
def get_short_data_QUANDL(symbol):
    """Download the Nasdaq and NYSE FINRA short-volume tables and join them.

    Two datasets are requested for the module-level ``start_date`` /
    ``end_date`` window using ``nsdq_api_key``: ``FINRA/FNSQ_<symbol>``
    (Nasdaq) and ``FINRA/FNYX_<symbol>`` (NYSE).  The ``ShortVolume``,
    ``TotalVolume`` and ``ShortExemptVolume`` columns of each table receive an
    ``NSDQ`` / ``NYSE`` suffix so they stay distinct after the join.

    Returns:
        The outer merge of both tables on ``Date``.  Values missing on one
        side are left as NaN (no fill is applied here).
    """
    nasdaq_code = "FINRA/FNSQ_" + symbol
    nyse_code = "FINRA/FNYX_" + symbol

    nasdaq_columns = {
        'ShortVolume': 'ShortVolumeNSDQ',
        'TotalVolume': 'TotalVolumeNSDQ',
        'ShortExemptVolume': 'ShortExemptVolumeNSDQ',
    }
    nyse_columns = {
        'ShortVolume': 'ShortVolumeNYSE',
        'TotalVolume': 'TotalVolumeNYSE',
        'ShortExemptVolume': 'ShortExemptVolumeNYSE',
    }

    # Both downloads share the same date window and credentials.
    request_options = dict(start_date=start_date, end_date=end_date, authtoken=nsdq_api_key)
    nasdaq_raw = quandl.get(nasdaq_code, **request_options)
    nyse_raw = quandl.get(nyse_code, **request_options)

    nasdaq_frame = nasdaq_raw.rename(columns=nasdaq_columns)
    nyse_frame = nyse_raw.rename(columns=nyse_columns)

    return pd.merge(nasdaq_frame, nyse_frame, on='Date', how='outer')


## Return FTD data from the SEC FTD files, selected by a stock's CUSIP number
def return_ftd_data_cusip(cusip_number):
    """Return the fails-to-deliver rows for one CUSIP, indexed by date.

    Reads the module-level ``ftd_df`` (which is left unmodified) and expects
    it to contain ``CUSIP`` and ``Date`` columns.

    Args:
        cusip_number: CUSIP label to look up.  A list-like of labels is also
            accepted and selects the rows of every listed CUSIP.

    Returns:
        A DataFrame indexed by ``Date`` (converted with ``pd.to_datetime``)
        holding the remaining columns of the matching rows.  A DataFrame is
        returned even when only one row matches.

    Raises:
        KeyError: if the CUSIP does not occur in ``ftd_df``.
    """
    # set_index returns a new frame, so ftd_df is never touched and no
    # explicit copy of the whole table is required.
    by_cusip = ftd_df.set_index("CUSIP")

    # Select with a list of labels: a scalar label collapses a single matching
    # row into a Series, which has no set_index and made every CUSIP with
    # exactly one FTD record fail with AttributeError.
    if pd.api.types.is_list_like(cusip_number):
        labels = list(cusip_number)
    else:
        labels = [cusip_number]
    matches = by_cusip.loc[labels]

    # assign builds a new frame instead of writing into a .loc slice.
    matches = matches.assign(Date=pd.to_datetime(matches['Date']))
    return matches.set_index('Date')

## Look up the CUSIP stored for a ticker in the symbol_df symbol list
def return_CUSIP_from_symbol(symbol):
    """Return the ``CUSIP`` entry that ``symbol_df`` holds for ``symbol``.

    The lookup runs on a re-indexed copy keyed by the ``SYMBOL`` column, so
    the module-level ``symbol_df`` is not modified.  A ``KeyError`` propagates
    when ``symbol`` is not present.
    """
    by_symbol = symbol_df.set_index('SYMBOL')
    return by_symbol.loc[symbol]['CUSIP']

def return_ftd_data_symbol(symbol):
    """Return the FTD rows for a ticker by first resolving its CUSIP.

    Composes ``return_CUSIP_from_symbol`` and ``return_ftd_data_cusip``; any
    exception raised by either helper propagates unchanged.
    """
    return return_ftd_data_cusip(return_CUSIP_from_symbol(symbol))

In [5]:
## FMP Cloud endpoints (API v3 and v4)
fmpbase_urlv3, fmpbase_urlv4 = 'https://fmpcloud.io/api/v3/', 'https://fmpcloud.io/api/v4/'
## The key is read from the environment and is None when FMP_CLOUD_API_KEY is unset
api_key = os.environ.get("FMP_CLOUD_API_KEY")

## FMP Functions 
def get_FMP_historical_data(symbol, startDate=start_date, endDate=end_date, apiKey=api_key, timeout=30):
    """Fetch the daily price history of ``symbol`` from FMP Cloud.

    Args:
        symbol: ticker appended to the ``historical-price-full/`` endpoint.
        startDate: value sent as the ``from`` query parameter.
        endDate: value sent as the ``to`` query parameter.
        apiKey: value sent as the ``apikey`` query parameter.
        timeout: seconds to wait for the server before giving up, so one
            stalled request cannot hang a long batch run.

    Returns:
        A DataFrame built from the ``historical`` list of the response, with
        the rows in reversed order, indexed by ``Date`` (datetime) and without
        the ``label`` column.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
        KeyError: if the payload has no ``historical`` entry or its rows have
            no ``date`` field.
    """
    # Let requests build and encode the query string instead of concatenating
    # it by hand (a None key no longer raises TypeError while building the URL).
    response = requests.get(
        fmpbase_urlv3 + 'historical-price-full/' + symbol,
        params={'from': startDate, 'to': endDate, 'apikey': apiKey},
        timeout=timeout,
    )
    # Surface HTTP failures as such rather than as a confusing missing-key error.
    response.raise_for_status()

    rows = response.json()['historical']
    prices = pd.DataFrame(rows).rename(columns={'date': 'Date'})
    prices['Date'] = pd.to_datetime(prices['Date'])
    # Reverse the row order (presumably FMP sends newest first - TODO confirm).
    prices = prices.iloc[::-1].set_index('Date')
    # 'label' is only a display string; tolerate its absence.
    return prices.drop(columns='label', errors='ignore')

## NOTE: api_key is already read from the environment alongside the FMP
## constants, so it is not re-read here.
def get_float_data_FMP(symbol, timeout=30):
    """Fetch the share-float record of ``symbol`` from FMP Cloud (API v4).

    Args:
        symbol: ticker sent as the ``symbol`` query parameter.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The first element of the JSON payload.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
        IndexError: if the payload is an empty list (no record returned).
    """
    response = requests.get(
        fmpbase_urlv4 + 'shares_float',
        params={'symbol': symbol, 'apikey': api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    if isinstance(payload, list) and not payload:
        # Same exception type as indexing an empty list, but with a message
        # that says which symbol had no data.
        raise IndexError('No float data returned for ' + repr(symbol))
    return payload[0]

def get_company_profile_FMP_json(symbol, timeout=30):
    """Fetch the company profile of ``symbol`` from FMP Cloud (API v3).

    Example request: https://fmpcloud.io/api/v3/profile/AAPL?apikey=<your key>

    Args:
        symbol: ticker appended to the ``profile/`` endpoint.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The first element of the JSON payload.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no answer arrives within ``timeout`` seconds.
        IndexError: if the payload is an empty list (no profile returned).
    """
    response = requests.get(
        fmpbase_urlv3 + 'profile/' + symbol,
        params={'apikey': api_key},
        timeout=timeout,
    )
    response.raise_for_status()
    payload = response.json()
    if isinstance(payload, list) and not payload:
        # Same exception type as indexing an empty list, but with a message
        # that says which symbol had no data.
        raise IndexError('No company profile returned for ' + repr(symbol))
    return payload[0]

def save_and_export_raw_df_csv(data, symbol):
    """Write ``data`` to ``../FilesExportIndividualStockDFs_Big/<symbol>_combined_df.csv``.

    Args:
        data: object exposing ``to_csv(path)`` (a DataFrame in this notebook).
        symbol: ticker used as the file-name prefix.

    The export folder is created when it does not exist yet, so a missing
    directory no longer aborts the caller after the data has been fetched.
    """
    path = Path('../FilesExportIndividualStockDFs_Big/' + symbol + '_combined_df.csv')
    path.parent.mkdir(parents=True, exist_ok=True)
    data.to_csv(path)

In [6]:
def get_time_series_data(symbol):
    """Build the combined daily table for ``symbol``.

    Joins three sources on ``Date`` with outer merges, in this order:
    the FTD rows (``return_ftd_data_symbol``, without their ``SYMBOL``
    column), the FMP price history (``get_FMP_historical_data``) and the
    FINRA short-volume data (``get_short_data_QUANDL``).

    Returns:
        The merged DataFrame with every missing value replaced by 0.
    """
    ftd_frame = return_ftd_data_symbol(symbol).drop(columns=['SYMBOL'])
    price_frame = get_FMP_historical_data(symbol)

    prices_with_ftd = price_frame.merge(ftd_frame, on='Date', how='outer')
    # Zero-fill the columns that must exist after the first merge; a missing
    # column raises KeyError here.
    for column in ('QUANTITY_FAILS', 'volume', 'unadjustedVolume', 'vwap'):
        prices_with_ftd[column] = prices_with_ftd[column].fillna(0)

    short_frame = get_short_data_QUANDL(symbol)
    combined = prices_with_ftd.merge(short_frame, on='Date', how='outer')
    return combined.fillna(0)

In [7]:
## Use pickle module to import and export and save files
import pickle
def load_obj(path):
    """Unpickle and return the object stored in the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    were written by a trusted source.
    """
    with open(path, mode='rb') as handle:
        restored = pickle.load(handle)
    return restored
def save_obj(obj, path ):
    """Pickle ``obj`` into the file at ``path`` using the highest protocol.

    An existing file at ``path`` is overwritten.
    """
    with open(path, mode='wb') as handle:
        writer = pickle.Pickler(handle, pickle.HIGHEST_PROTOCOL)
        writer.dump(obj)

In [8]:
## Load the FTD table from CSV: the first column becomes the index and pandas
## attempts to parse that index as dates
ftd_df = pd.read_csv(Path('../Resources/ftd_all_data.csv'), parse_dates=True, index_col=0)
## Load the symbol / CUSIP list from CSV: the first column becomes the index
symbol_df = pd.read_csv(Path('../Resources/symbol_all_list.csv'), index_col=0)

In [9]:
## Number of rows in symbol_df - upper bound for the batch loop
length_ = symbol_df.shape[0]

## Accumulators filled by the batch loop and exported once it has finished
complete_dict, incomplete_dict = {}, {}   ## symbol -> DataFrame, split by the loop's null check
error_symbol_list = list()                ## symbols whose time series could not be built

In [10]:
## Batch loop: build the combined time series for every symbol and export the results
x = 0    
y = 50
increment = 50  ## Batch size - y must start at x + increment
test_length = 200  ## Shorter loop length for trial runs

## Symbols are processed in batches of {increment} so a failure can be located
## and the run resumed: set x and y to the first unfinished batch before
## re-running, which avoids repeating (and paying for) API calls already made.

loop_length = length_ ## Use length_ for a full run, or test_length for a trial run

while x < loop_length:
    ## Per-batch dictionaries, exported at the end of every pass
    complete_dict_temp = {}
    incomplete_dict_temp = {}
    ## Clamp y so the final batch never runs past the end of symbol_df
    if y >= length_:
        y = length_

    range_var = range(x, y)
    ## First and last row position of the batch, used in the export file names
    str_symbol1 = str(x)
    y2 = y - 1
    str_symbol2 = str(y2)

    for i in range_var:
        ## i is a row position, so select positionally; a label lookup
        ## (symbol_df['SYMBOL'][i]) only works while the index labels happen
        ## to equal the positions.
        symbol_var = symbol_df['SYMBOL'].iloc[i]

        ## Debug aid: if the loop stops with an error, error_var holds the row
        ## position and symbol_var the symbol that was being processed.
        error_var = i

        ## Get Time Series Data
        try:
            time_series_df = get_time_series_data(symbol_var)
        except Exception:
            ## Some symbols cannot be read (e.g. KeyError for an unknown
            ## symbol or missing data); they are unimportant and are recorded
            ## and skipped.  Exception - not a bare except - is caught so that
            ## KeyboardInterrupt / SystemExit still stop a long run.
            error_symbol_list.append(symbol_var)
            continue

        ## If successful, export data
        save_and_export_raw_df_csv(time_series_df, symbol_var)

        ## Sort the frame into the "incomplete" dicts when it contains nulls,
        ## otherwise into the "complete" dicts.
        ## NOTE(review): get_time_series_data ends with fillna(0), so frames
        ## arriving here should never contain nulls - confirm whether the
        ## incomplete branch is still needed.
        bool_var = time_series_df.isnull().values.any()
        if bool_var:
            incomplete_dict[symbol_var] = time_series_df
            incomplete_dict_temp[symbol_var] = time_series_df
        else:
            complete_dict[symbol_var] = time_series_df
            complete_dict_temp[symbol_var] = time_series_df

    ## Export every batch: useful to catch errors and to pick up where the run
    ## left off if an API fails while running overnight.  The folders are
    ## created when missing so a finished batch is never lost to a missing
    ## directory.
    pickle_path1 = Path('../FilesExportCompleteFMP_big/data_complete_'+str_symbol1+'_'+str_symbol2+'.pkl')
    pickle_path1.parent.mkdir(parents=True, exist_ok=True)
    save_obj(complete_dict_temp, pickle_path1)
    pickle_path2 = Path('../FilesExportIncompleteFMP_big/data_incomplete_'+str_symbol1+'_'+str_symbol2+'.pkl')
    pickle_path2.parent.mkdir(parents=True, exist_ok=True)
    save_obj(incomplete_dict_temp, pickle_path2)

    ## Check x, y and increment before a run - wrong values can waste a lot of API credits
    x += increment
    y += increment


## If the while loop finishes - export all data.
## NOTE: after resuming from a later batch, complete_dict / incomplete_dict only
## hold the symbols processed since those dicts were last initialised, so these
## two files will not include earlier batches.
pkl_path_complete = Path('../Resources/all_FMP_data_complete.pkl')
save_obj(complete_dict, pkl_path_complete)
pkl_path_incomplete = Path('../Resources/all_FMP_data_incomplete.pkl')
save_obj(incomplete_dict, pkl_path_incomplete)