In [4]:
#Financial Valuation Model
#Author: Tom Pelletier 

import tensorflow as tf
import requests as r
import pandas as pd 
import numpy as np 
import time
import os
import pymysql
from sqlalchemy import create_engine
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

In [5]:
#Data from API https://financialmodelingprep.com/developer/docs/

In [6]:
#Get Jsons of Company Financials
company_code = 'AAPL'

get_profile = r.get('https://financialmodelingprep.com/api/v3/company/profile/%s' % (company_code),timeout=3).json()
get_Qincome = r.get('https://financialmodelingprep.com/api/v3/financials/income-statement/%s?period=quarter' % (company_code),timeout=3).json()
get_Qbalance = r.get('https://financialmodelingprep.com/api/v3/financials/balance-sheet-statement/%s?period=quarter' % (company_code),timeout=3).json()
get_Qcashflow = r.get('https://financialmodelingprep.com/api/v3/financials/cash-flow-statement/%s?period=quarter' % (company_code),timeout=3).json()
get_Qgrowth = r.get('https://financialmodelingprep.com/api/v3/financial-statement-growth/%s?period=quarter' % (company_code),timeout=3).json()

#all company symbols on NASDAQ
get_symbols = r.get('https://financialmodelingprep.com/api/v3/company/stock/list',timeout=3).json()
symbol_lst = [get_symbols['symbolsList'][i]['symbol'] for i in range(len(get_symbols['symbolsList']))]

all_jsons = [get_Qincome,get_Qbalance,get_Qcashflow,get_Qgrowth]

In [7]:
#Clean symbol_lst for companies that are existing/have finanicals

def clean_symba(s,e): 
    
    lst = [get_symbols['symbolsList'][i]['symbol'] for i in range(len(get_symbols['symbolsList']))][s:e]
    
    for i in lst:
        try:
            company_code = i
            get_Qincome = r.get('https://financialmodelingprep.com/api/v3/financials/income-statement/%s?period=quarter' % (company_code),timeout=3).json()
            output = get_Qincome['financials']
            print(i)
            print(lst.index(i),'/',e)
            if not output:
                lst.remove(i)
                
        except: 
            pass
            
    return lst
    
    

In [8]:

#Convert Json features and label to Pandas DataFrames WRANGLING

def json_df(company_code):
    df_lst = []
    
    #GETS
    get_profile = r.get('https://financialmodelingprep.com/api/v3/company/profile/%s' % (company_code)).json()
    get_Qincome = r.get('https://financialmodelingprep.com/api/v3/financials/income-statement/%s?period=quarter' % (company_code)).json()
    get_Qbalance = r.get('https://financialmodelingprep.com/api/v3/financials/balance-sheet-statement/%s?period=quarter' % (company_code)).json()
    get_Qcashflow = r.get('https://financialmodelingprep.com/api/v3/financials/cash-flow-statement/%s?period=quarter' % (company_code)).json()
    get_Qgrowth = r.get('https://financialmodelingprep.com/api/v3/financial-statement-growth/%s?period=quarter' % (company_code)).json()
    all_jsons = [get_Qincome,get_Qbalance,get_Qcashflow,get_Qgrowth]
    
    #make individual DFs and store them in list
    for i in all_jsons: 
        key = list(i.keys())[1]
        df = pd.DataFrame.from_dict(i[key])
        df_lst.append(df)
    df = pd.DataFrame.from_dict(get_profile)
    
    #all we want from the profile json is the sector of the company
    sector = pd.DataFrame.from_dict(get_profile).T['sector'][1]
    sym = pd.DataFrame.from_dict(get_profile).T['sector'][0]
    
    try:

        #merge DataFrames into one
        df = df_lst[0].merge(df_lst[1])
        df = df.merge(df_lst[2])
        df = df.merge(df_lst[3])


        #add the sector and Symbol column
        df['Sector'] = sector
        

        #save and drop date column
        quarters = list(df['date'])
        df.drop('date',inplace=True,axis=1)

        #convert dtypes to numeric
        cols = df.columns
        for i in cols: 
            if i != 'Sector':
                df[i] = pd.to_numeric(df[i])

        #deal with the label(price) column:
        Mean_Q_P = []
        Mean_Q_V = []

        for i in range(len(quarters)): 
            if i <len(quarters)-1:
                close_lst = []
                vol_lst = []
                start = quarters[i] 
                end = quarters[i+1]
                get_price = r.get('https://financialmodelingprep.com/api/v3/historical-price-full/%s?from=%s&to=%s' % (company_code,end,start)).json()

                for i in get_price['historical']:
                    close = i['close']
                    volume = i['volume']
                    close_lst.append(close)
                    vol_lst.append(volume)

                Mean_Q_P.append(np.mean(close_lst))
                Mean_Q_V.append(np.mean(vol_lst))
        #remove nan
        filtered_P = [i for i in Mean_Q_P if str(i)!='nan']
        filtered_V = [i for i in Mean_Q_V if str(i)!='nan']
        df_y = pd.DataFrame([filtered_P,filtered_V]).T
        df_y.columns = ['Price','Volume']
        #match the shape of both DFs
        df = df[df.index <len(df_y)]
        df['Volume'] = df_y['Volume']
        df_y.drop('Volume',inplace=True,axis=1)
        
        #add symbol to df 
        df['Symbol'] = sym
        
        if df is not None:
            return df, df_y   
    
    except: 
        pass
    
       

In [9]:
#get all financials froma all publicly listed companies
def Total_Nasdaq_Financials(lst):
    boo = False
    counter = 0
    
    for i in lst:
        t0 = time.time()
        counter += 1
        company_code = i
        
        if boo:
            try:
                df_, df_y_ = json_df(i)
                df = pd.concat([df, df_])
                df_y = pd.concat([df_y, df_y_])
                #print(lst.index(i)+1,'/',len(lst))
            except: 
                pass
        
        if not boo:
            df, df_y = json_df(i)
            boo = True
        
            
        #make one DF
        df['Price'] = df_y['Price']
        
        
        #Calculate ETA of Scrape
        T = time.time() - t0 
        T_lst = [] 
        T_lst.append(T)
        ETA = np.mean(T_lst)*(len(lst)-lst.index(i))
                              
        print(i,lst.index(i)+1,'/',len(lst),'\n ETA: ',ETA)
        
        
    return df

In [10]:
#Save and Read mySQL Functions 

def to_SQL(data, table_name):
    
    engine = create_engine('mysql+pymysql://root:nothing1@localhost/PROJECT')
    data.to_sql(table_name, engine, if_exists='append', index=False)
    
    #return print(table_name,' Successfully Saved to SQL')




def from_SQL(Table): 
    
    engine = create_engine('mysql+pymysql://root:nothing1@localhost/PROJECT')
    data = pd.read_sql_query('SELECT * FROM PROJECT.'+Table, engine)
    #data = pd.DataFrame(data)
    
    return data

In [11]:
#CLEANING 

def Clean(df):  
    from scipy.stats import mode
    s = len(df)
#1. NULLs 
    df = df.fillna(0)
    
#2. Duplicates
    df = df.drop_duplicates()
    
#3. Remove Outliers 
    for i in df.columns: 
        
        
        if i != 'Sector' and i != 'Symbol':
            m = mode(df[i])[0]
            
            if df[i].quantile(.75) != 0 and m != float(0): 
                
                top = df[i].quantile(.75)
                bot = df[i].quantile(.25)
                IQR2 = ((top - bot)*10)

                T_limit = top+IQR2
                B_limit = bot-IQR2

                df = df[df[i]<T_limit]
                df = df[df[i]>B_limit]
                
                
    e = len(df)    
    print(e/s,'% of Data Kept')
    return df

        
#FEATURE ENGINEERING

def Engineering(df): 
    from sklearn.preprocessing import StandardScaler as SS 
    
#1. ONE HOT ENCODING 
    #we dont want to OHE the symbols so we drop 
    df.drop('Symbol',inplace=True,axis=1)
    df = pd.get_dummies(df)
    
#2. MultiCollinearity
                    
    # Create correlation matrix
    corr_matrix = df.corr().abs()

    # Select upper triangle of correlation matrix
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

    # Find index of feature columns with correlation greater than 0.95
    to_drop = [column for column in upper.columns if any(upper[column] > 0.80)]
    # Drop features 
    df = df.drop(df[to_drop], axis=1)

#3. split X and Y
    df_y = pd.DataFrame()
    df_y['Price'] = df['Price']
    df.drop('Price',inplace=True,axis=1)
    
    return df ,df_y


In [12]:
def pull(s,e,TableName): 
    #1. get clean lists clean_symba()
    symbols = clean_symba(s,e)
    #2. get financials Total_Nasdaq_Financials(list) 
    df = Total_Nasdaq_Financials(symbols) 
    #3. save RAW to SQL 
    to_SQL(df,TableName) 
    print('Scraped and Stored to SQL')
    
    return df

def clean_push(df): 
    df = Clean(df)
    to_SQL(df,'Xc')

In [None]:
df = from_SQL('X300')

In [None]:
df = Clean(df)

In [None]:
df, df_y = Engineering(df)

In [13]:
for i in range(7500,7700,5): 
    try:
        pull(i,i+5,'X200')
    except: 
        pass

SCIX
0 / 7505
SCTO
1 / 7505
SDD
2 / 7505
SCKT 1 / 2 
 ETA:  44.463688373565674
SCX 2 / 2 
 ETA:  24.5467209815979
Scraped and Stored to SQL
SDEM
0 / 7510
SDR
1 / 7510
SDYL
0 / 7515
SECT
1 / 7515
STPP
0 / 7625
SUMR
1 / 7625
SUNW
2 / 7625
SUSA
3 / 7625
SUSB
0 / 7630
SVBI
1 / 7630
SVT
2 / 7630
SWIN
3 / 7630
SWJ
0 / 7635
SXI
1 / 7635
SYG
2 / 7635
SYV
0 / 7640
SZO
1 / 7640
TACT
2 / 7640
TAGS
0 / 7645
TAO
1 / 7645
TATT
2 / 7645
TAIT 1 / 3 
 ETA:  65.87729930877686
TAPR 2 / 3 
 ETA:  4.126515865325928
TATT 3 / 3 
 ETA:  2.1659557819366455
Scraped and Stored to SQL
TAXR
0 / 7650
TBLU
1 / 7650
TCBIL
2 / 7650
TAYD 1 / 2 
 ETA:  49.51038646697998
TBX 2 / 2 
 ETA:  2.078047037124634
Scraped and Stored to SQL
TCBIP
0 / 7655
TCCA
1 / 7655
TCRZ
2 / 7655
TCTL
0 / 7660
TDJ
1 / 7660
TERM
2 / 7660
TFLO
0 / 7665
THGA
1 / 7665
TIBRU
2 / 7665
TGC 1 / 2 
 ETA:  45.88709831237793
THST 2 / 2 
 ETA:  2.1404170989990234
Scraped and Stored to SQL
TIK
0 / 7670
TIPZ
1 / 7670
TKAT
2 / 7670
TLDH
0 / 7675
TLTD
1 / 767

In [None]:
from keras import backend as K

def coeff_determination(y_true, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [None]:
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(np.array(df), np.array(df_y), test_size=0.20)

In [None]:
X_train = tf.keras.utils.normalize(X_train, axis=1)
X_test = tf.keras.utils.normalize(X_test, axis=1)
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(250, activation=tf.nn.relu))    #hidden layer 1
model.add(tf.keras.layers.Dense(200, activation=tf.nn.tanh))    #hidden layer 2
model.add(tf.keras.layers.Dense(1, activation=tf.keras.activations.linear))   #output layer
model.compile(optimizer = 'adam',lr=0.0075, loss = 'mean_squared_error', metrics=[coeff_determination])
model.fit(X_train, y_train, epochs=50, batch_size=15)

In [None]:
x = model.predict(X_test)

In [None]:
x.mean()

In [None]:
from sklearn.ensemble import RandomForestRegressor as RFR

In [None]:
Forest = RFR(n_estimators=5000)
Forest.fit(X_train,Y_train)
Y_pred = Forest.predict(X_test)
Forest.score(Y_test,Y_pred)