In [1]:
import pandas as pd

from pathlib import Path
import csv

import os
import requests
import json

import quandl

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
## Constants

## Date window shared by the data-fetching helpers: start_date / end_date are the default
## from/to bounds of get_FMP_historical_data and are passed to quandl.get inside
## get_short_data_QUANDL. Both are 'YYYY-MM-DD' strings.
start_date = '2016-01-01'
end_date = '2021-12-31'
#default_date_range = '71m' ## Default Range for IEX functions - don't need more at the moment

In [3]:
## QUANDL/NASDAQ
## API key read from the environment (load_dotenv() has already been called, so values
## from a .env file are included). os.environ.get returns None when the variable is unset.
nsdq_api_key = os.environ.get('NASDAQ_API_KEY')
## Base URL of the FINRA datasets endpoint. It is not referenced by the code visible in
## this notebook, which goes through the quandl client instead.
base_url_nsdq = 'https://data.nasdaq.com/api/v3/datasets/FINRA/'

In [4]:
## Use pickle module to import and export and save files
import pickle
def load_obj(path):
    """Return the Python object stored in the pickle file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files that this project wrote itself.
    """
    with open(path, 'rb') as source:
        restored = pickle.load(source)
    return restored
def save_obj(obj, path ):
    """Pickle ``obj`` into the file at ``path`` (overwriting it) with the highest protocol."""
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
## Get Short "Interest" Data from Quandl 
def get_short_data_QUANDL(symbol):
    """Fetch FINRA short-volume data for ``symbol`` and combine both venues.

    Two datasets are requested through ``quandl.get`` for the module-level
    ``start_date``..``end_date`` window: ``FINRA/FNSQ_<symbol>`` (Nasdaq) and
    ``FINRA/FNYX_<symbol>`` (NYSE). Their volume columns are suffixed with
    ``NSDQ`` / ``NYSE`` so they do not collide, then the two frames are
    outer-merged on ``Date``. Missing values are left as NaN.
    """
    nasdaq_columns = {
        'ShortVolume': 'ShortVolumeNSDQ',
        'TotalVolume': 'TotalVolumeNSDQ',
        'ShortExemptVolume': 'ShortExemptVolumeNSDQ',
    }
    nyse_columns = {
        'ShortVolume': 'ShortVolumeNYSE',
        'TotalVolume': 'TotalVolumeNYSE',
        'ShortExemptVolume': 'ShortExemptVolumeNYSE',
    }

    ## Nasdaq
    nasdaq_frame = quandl.get(
        "FINRA/FNSQ_"+symbol,
        start_date=start_date,
        end_date=end_date,
        authtoken=nsdq_api_key,
    )
    ## NYSE
    nyse_frame = quandl.get(
        "FINRA/FNYX_"+symbol,
        start_date=start_date,
        end_date=end_date,
        authtoken=nsdq_api_key,
    )

    nasdaq_frame = nasdaq_frame.rename(columns=nasdaq_columns)
    nyse_frame = nyse_frame.rename(columns=nyse_columns)

    return pd.merge(nasdaq_frame, nyse_frame, on='Date', how='outer')


## Return FTD Data from SEC FTD files using a Stock's CUSIP number to sort 
def return_ftd_data_cusip(cusip_number):
    """Return the fails-to-deliver rows of ``ftd_df`` for one or more CUSIPs.

    Parameters
    ----------
    cusip_number : str or list-like of str
        A single CUSIP, or several (``return_CUSIP_from_symbol`` yields a
        Series when a symbol maps to more than one CUSIP).

    Returns
    -------
    pandas.DataFrame
        The matching rows indexed by ``Date`` (converted to datetime), with the
        ``CUSIP`` column removed. Always a DataFrame, even for a single row.

    Raises
    ------
    KeyError
        If a requested CUSIP is not present in ``ftd_df``.
    """
    # A scalar label makes ``.loc`` collapse a single matching row into a
    # Series, which has no ``set_index``; a list of labels always yields a frame.
    if pd.api.types.is_scalar(cusip_number):
        wanted = [cusip_number]
    else:
        wanted = cusip_number

    # set_index returns a new frame, so the shared ``ftd_df`` is never mutated.
    rows = ftd_df.set_index("CUSIP").loc[wanted]
    # ``assign`` builds a new frame instead of writing into a slice.
    rows = rows.assign(Date=pd.to_datetime(rows['Date']))
    return rows.set_index('Date')

## Return the CUSIP symbol from the symbol_df symbol list 
def return_CUSIP_from_symbol(symbol):
    """Look up the CUSIP recorded for ``symbol`` in ``symbol_df``.

    Returns a single value when the symbol occurs once, or a Series of
    values when it occurs on several rows. Raises KeyError for an unknown
    symbol.
    """
    by_symbol = symbol_df.set_index('SYMBOL')
    matches = by_symbol.loc[symbol]
    return matches['CUSIP']

def return_ftd_data_symbol(symbol):
    """Return the fails-to-deliver rows for ``symbol``.

    The symbol is first translated to its CUSIP(s) with
    ``return_CUSIP_from_symbol``; the result is passed to
    ``return_ftd_data_cusip``.
    """
    return return_ftd_data_cusip(return_CUSIP_from_symbol(symbol))

In [6]:
## FMP Constants
## Base URLs of the two FMP Cloud API versions used by the FMP helper functions.
fmpbase_urlv3 = 'https://fmpcloud.io/api/v3/'
fmpbase_urlv4 = 'https://fmpcloud.io/api/v4/'
## API key read from the environment; os.getenv returns None when FMP_CLOUD_API_KEY is unset.
api_key = os.getenv("FMP_CLOUD_API_KEY")

## FMP Functions 
def get_FMP_historical_data(symbol, startDate=start_date, endDate=end_date, apiKey=api_key, timeout=30):
    """Download daily historical prices for ``symbol`` from FMP Cloud.

    Parameters
    ----------
    symbol : str
        Ticker to request.
    startDate, endDate : str
        Bounds sent as the ``from`` / ``to`` query parameters.
    apiKey : str
        FMP Cloud API key.
    timeout : float
        Seconds to wait for the server before giving up, so one stalled
        request cannot hang a long batch run.

    Returns
    -------
    pandas.DataFrame
        The ``historical`` records of the response in reversed order, indexed
        by ``Date`` (datetime), without the ``label`` column.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    KeyError
        If the response carries no ``historical`` records for the symbol.
    """
    url_hist_price = fmpbase_urlv3+'historical-price-full/'+symbol
    # Let requests build and escape the query string instead of concatenating it.
    query = {'from': startDate, 'to': endDate, 'apikey': apiKey}
    resp_data = requests.get(url_hist_price, params=query, timeout=timeout)
    resp_data.raise_for_status()

    payload = resp_data.json()
    if not isinstance(payload, dict) or not payload.get('historical'):
        raise KeyError('no historical price data returned for ' + str(symbol))

    df = pd.DataFrame(payload['historical']).rename(columns={'date': 'Date'})
    df['Date'] = pd.to_datetime(df['Date'])
    ## Reverse the DataFrame, then index it by date
    df = df.iloc[::-1].set_index('Date')
    # 'label' is a display-only duplicate of the date; tolerate its absence.
    return df.drop(columns='label', errors='ignore')

def get_float_data_FMP(symbol, timeout=30):
    """Return the first record of FMP Cloud's ``shares_float`` response for ``symbol``.

    ``timeout`` is the number of seconds to wait for the server.

    Raises ``requests.HTTPError`` on an error status, and ``IndexError`` when
    the response is an empty list.
    """
    url_float_shares = fmpbase_urlv4+'shares_float'
    # Let requests build and escape the query string instead of concatenating it.
    query = {'symbol': symbol, 'apikey': api_key}
    resp_data = requests.get(url_float_shares, params=query, timeout=timeout)
    resp_data.raise_for_status()
    json_ = resp_data.json()
    return json_[0]

def get_company_profile_FMP_json(symbol, timeout=30):
    """Return the first record of FMP Cloud's company ``profile`` response for ``symbol``.

    Request shape: https://fmpcloud.io/api/v3/profile/AAPL?apikey='yourkeyhere'

    ``timeout`` is the number of seconds to wait for the server.

    Raises ``requests.HTTPError`` on an error status, and ``IndexError`` when
    the response is an empty list.
    """
    url_company_profile_url = fmpbase_urlv3+'profile/'+symbol
    # Let requests build and escape the query string instead of concatenating it.
    resp_data = requests.get(url_company_profile_url, params={'apikey': api_key}, timeout=timeout)
    resp_data.raise_for_status()
    json_response = resp_data.json()
    return json_response[0]

# def save_and_export_raw_df_csv(data, symbol, path='None'):
#     ## Can set custom path (useful for testing) otherwise will default to below path
#     if path=='None':
#         path = ('../FilesExportIndividualStockDFs_Big/'+symbol+'_combined_df.csv')
#     data.to_csv(path)
    
def save_and_export_raw_df_pkl(data, symbol, path='None'):
    """Pickle ``data`` to disk for ``symbol``.

    Parameters
    ----------
    data : object
        Object to export (handed unchanged to ``save_obj``).
    symbol : str
        Ticker, used to build the default file name.
    path : str, optional
        Custom destination (useful for testing). When left at the default
        ``'None'`` string, or passed as ``None``, the file is written to
        ``../FilesExport_TimeSeries_DFs/<symbol>_combined_df.pkl``.
    """
    ## Accept a real None as well as the historical 'None' string sentinel
    if path is None or path == 'None':
        path = ('../FilesExport_TimeSeries_DFs/'+symbol+'_combined_df.pkl')

    ## Create the destination folder if needed, so an export cannot fail after
    ## the data has already been downloaded
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    save_obj(data,path)

In [7]:
def get_time_series_data(symbol):
    """Build one combined daily time series for ``symbol``.

    Three sources are outer-merged on ``Date``:
    FMP historical prices (``get_FMP_historical_data``), SEC fails-to-deliver
    rows (``return_ftd_data_symbol``, without their ``SYMBOL`` column) and
    FINRA short-volume data (``get_short_data_QUANDL``).

    Returns
    -------
    pandas.DataFrame
        The merged frame with every missing value replaced by 0. Note that
        this includes price columns on dates where FMP had no row.
    """
    ftd_data = return_ftd_data_symbol(symbol)
    ftd_data = ftd_data.drop(columns=['SYMBOL'])

    fmp_data = get_FMP_historical_data(symbol)
    price_and_ftd = pd.merge(fmp_data, ftd_data, on='Date', how='outer')

    short_data = get_short_data_QUANDL(symbol)
    combined = pd.merge(price_and_ftd, short_data, on='Date', how='outer')

    # One fill at the end covers every column. Filling individual columns
    # first gave the same result but raised KeyError whenever the price feed
    # omitted one of them (e.g. 'vwap' or 'unadjustedVolume').
    return combined.fillna(0)

In [8]:
## Import dataframe objects using Pickle
## ftd_df: fails-to-deliver rows (Date, CUSIP, SYMBOL, QUANTITY_FAILS), read by return_ftd_data_cusip.
## symbol_df: SYMBOL / CUSIP lookup table, read by return_CUSIP_from_symbol.
## Both are module-level globals that those helpers depend on, so this cell must run
## before any of them is called.
ftd_df = load_obj('../Resources/ftd_all_data.pkl')
symbol_df = load_obj('../Resources/symbol_all_list.pkl')

In [9]:
ftd_df

Unnamed: 0,Date,CUSIP,SYMBOL,QUANTITY_FAILS
0,2017-09-11,Y21990034,0034RIGHTS,103745.0
1,2017-09-12,Y21990034,0034RIGHTS,103745.0
2,2018-10-17,812350122,0122PIK,828.0
3,2018-10-18,812350122,0122PIK,828.0
4,2019-05-02,812350122,0122PIK,828.0
...,...,...,...,...
7279274,2021-05-25,98959W203,ZZZOD,324.0
7279275,2021-05-26,98959W203,ZZZOD,354.0
7279276,2021-05-27,98959W203,ZZZOD,976.0
7279277,2021-05-28,98959W203,ZZZOD,2.0


In [10]:
symbol_df

Unnamed: 0,SYMBOL,CUSIP
0,0034RIGHTS,Y21990034
1,0122PIK,812350122
2,0297RTS,G72990297
3,0329REORG,G33990329
4,0888RTSPYMNT,529900888
...,...,...
34763,ZZLL,98880P103
34764,ZZLL,98880P202
34765,ZZLLD,98880P202
34766,ZZZOD,98959W203


In [11]:
## Set iteration through symbol_df
## length_ is the number of rows in symbol_df; the batch loop uses it as its upper bound.
length_ = len(symbol_df)

complete_dict = {}            ## Empty dicts in order to create
incomplete_dict = {}          ## two big files at end of while loop
## NOTE: neither dict is filled by the code visible in this notebook.
#error_symbol_list = []

In [12]:
test_length = 100

In [13]:
symbol_df['SYMBOL'].iloc[25000]

'PSIQ'

In [14]:
## Code to locate the positional index of a symbol's first row in symbol_df
test_symbol = 'GME'

length_ = len(symbol_df)

## Vectorised lookup: positions of every row whose SYMBOL equals test_symbol
matching_positions = (symbol_df['SYMBOL'] == test_symbol).to_numpy().nonzero()[0]

## Fail loudly instead of silently falling back to index 0, which would run the
## following test cells on a different ticker
if len(matching_positions) == 0:
    raise ValueError('symbol not found in symbol_df: ' + test_symbol)

index_variable = int(matching_positions[0])
print(test_symbol)
print(index_variable)

GME
13468


In [15]:
## index_variable is a positional index, so read the symbol back with .iloc
## (plain [] would look it up as an index label instead)
test_index = index_variable
test_symbol = symbol_df['SYMBOL'].iloc[test_index]

## Fetch and merge all three data sources for the test symbol, then display the result
test_df = get_time_series_data(test_symbol)
test_df

Unnamed: 0_level_0,open,high,low,close,adjClose,volume,unadjustedVolume,change,changePercent,vwap,changeOverTime,QUANTITY_FAILS,ShortVolumeNSDQ,ShortExemptVolumeNSDQ,TotalVolumeNSDQ,ShortVolumeNYSE,ShortExemptVolumeNYSE,TotalVolumeNYSE
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2016-01-04,27.040001,28.459999,27.010000,28.309999,21.848804,4330400.0,4330400.0,1.27,4.697,27.92667,0.04697,9.0,734359.0,250.0,1091239.0,369005.0,0.0,411261.0
2016-01-05,28.590000,29.010000,28.059999,28.770000,22.203817,3416600.0,3416600.0,0.18,0.630,28.61333,0.00630,2709.0,324822.0,0.0,766181.0,45542.0,0.0,108705.0
2016-01-06,28.230000,28.740000,27.440001,28.370001,21.895107,3756400.0,3756400.0,0.14,0.496,28.18333,0.00496,129.0,470222.0,1300.0,818619.0,54449.0,0.0,136915.0
2016-01-07,28.120001,28.770000,27.740000,28.450001,21.956852,2662800.0,2662800.0,0.33,1.174,28.32000,0.01174,1733.0,255510.0,100.0,483840.0,27639.0,0.0,83420.0
2016-01-08,28.700001,28.790001,28.040001,28.370001,21.895107,2699200.0,2699200.0,-0.33,-1.150,28.40000,-0.01150,187.0,297805.0,268.0,441419.0,17055.0,0.0,32738.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-27,152.000000,152.620000,140.000000,148.310000,148.310000,1613729.0,1613729.0,-3.69,-2.428,146.97667,-0.02428,2828.0,227591.0,593.0,539793.0,56501.0,139.0,159382.0
2021-12-28,147.500000,157.410000,146.410000,146.460000,146.460000,1320374.0,1320374.0,-1.04,-0.705,150.09333,-0.00705,710.0,185176.0,0.0,424993.0,50426.0,0.0,126633.0
2021-12-29,147.850000,155.490000,142.140000,153.930000,153.930000,2029352.0,2029352.0,6.08,4.112,150.52000,0.04112,2518.0,280068.0,2383.0,627874.0,112580.0,798.0,295581.0
2021-12-30,151.000000,160.000000,150.000000,155.330000,155.330000,1556076.0,1556076.0,4.33,2.868,155.11000,0.02868,0.0,172058.0,329.0,463841.0,41010.0,0.0,161071.0


In [16]:
## Same export shape as the batch loop uses: a one-element list holding {symbol: DataFrame}
test_obj = [{test_symbol:test_df}] ## Testing object, to be exported

In [17]:
#test_dict[0]

In [18]:
## Export the test object to an explicit path so the default export folder is left untouched
save_and_export_raw_df_pkl(test_obj, test_symbol,path='../Resources_Test/GME_test_df.pkl')

In [19]:
# import_test = load_obj('../Resources_Test/GME_test_df.pkl')
# import_test[0]

## Success 

In [20]:
## Configuration for the batch download loop in the next cell, which walks the symbols in symbol_df.
## The loop is a while loop over [x, y) windows so that a run can be resumed close to where it
## stopped after an API failure or an interruption: set x and y to the window that failed and
## re-run the loop cell to continue reading data.

## x / y: first index and (exclusive) end index of the current batch
x = 0    
y = 50
increment = 50  ## Batch size; x and y both advance by this much. Make sure increment is right

## Number of symbol rows to process in a test run
test_length = 200 

## Setup in 50 increments to find error in case the loop fails, but keep previous data.

## Initialize the result lists filled by the loop: symbols whose merged frame has more than
## 1500 rows go to the success list, the others to the not-success list
symbol_success_list = []
symbol_not_success_list = [] 

loop_length = test_length ## Set to length_ to run as full, or your test_length

In [21]:
## Test list extending 
test_l1 = [0,1]
test_l2 = [2,3]
test_l1.extend(test_l2)
test_l1

[0, 1, 2, 3]

In [22]:
## Download, classify and export the time series of every symbol row in [x, loop_length),
## one batch of `increment` rows at a time. After each batch the result lists are written
## to disk, so an interrupted run can be resumed by resetting x and y.

## Frames with more than this many rows count as a successful (complete) download
min_rows_for_success = 1500

## Never read past the end of symbol_df, and never past the requested loop_length
last_index = min(loop_length, length_)

while (x < last_index):

    ## Clamp the end of this batch so the final batch cannot overrun last_index
    if y >= last_index: y = last_index

    range_var = range(x,y)
    ## Set label variables for exports (first and last index of this batch)
    str_symbol1 = str(x)
    y2 = y - 1
    str_symbol2 = str(y2)

    for i in range_var:

        ## i is a position in symbol_df, so look the symbol up positionally
        symbol_var = symbol_df['SYMBOL'].iloc[i]

        # Use while running loop, if error is made, can see what index it happened on
        # can also check current value of symbol_var as well
        error_var = i

        ## Get Time Series Data
        try:
            time_series_df = get_time_series_data(symbol_var)
        except Exception as exc:
            ## Skip symbols whose download fails, but say so. Only the exception type is
            ## printed because HTTP error messages contain the request URL with the API key.
            ## KeyboardInterrupt is not caught, so the run can still be stopped by hand.
            print('skipped index ' + str(i) + ' (' + str(symbol_var) + '): ' + type(exc).__name__)
            continue

        ## Export as a list with a key (the stock symbol) to the dataframe
        export_obj = [{symbol_var:time_series_df}]

        if len(time_series_df) > min_rows_for_success:
            symbol_success_list.append(symbol_var)
        else:
            symbol_not_success_list.append(symbol_var)

        ## If successful, export data
        save_and_export_raw_df_pkl(export_obj,symbol_var)

    ## Export main lists each time, and rewrite during each iteration of while loop
    ## Can read in list afterwards, to figure out where loop went wrong, and where to restart
    save_obj(symbol_success_list,'../Resources/02_symbol_success_list.pkl')
    save_obj(symbol_not_success_list,'../Resources/02_symbol_not_success_list.pkl')

    ## Check before run, if incorrect, can waste a lot of API credits
    x += increment
    y += increment