# Setup

#### Load required objects

In [None]:
!pip install yfinance # installs from terminal

Collecting yfinance
  Downloading https://files.pythonhosted.org/packages/7a/e8/b9d7104d3a4bf39924799067592d9e59119fcfc900a425a12e80a3123ec8/yfinance-0.1.55.tar.gz
Collecting lxml>=4.5.1
[?25l  Downloading https://files.pythonhosted.org/packages/79/37/d420b7fdc9a550bd29b8cfeacff3b38502d9600b09d7dfae9a69e623b891/lxml-4.5.2-cp36-cp36m-manylinux1_x86_64.whl (5.5MB)
[K     |████████████████████████████████| 5.5MB 12.5MB/s 
Building wheels for collected packages: yfinance
  Building wheel for yfinance (setup.py) ... [?25l[?25hdone
  Created wheel for yfinance: filename=yfinance-0.1.55-py2.py3-none-any.whl size=22618 sha256=37de41ffb7414b5adc6de089710c55c43e47a3430b5cea5dcc3198fb93caf4b0
  Stored in directory: /root/.cache/pip/wheels/04/98/cc/2702a4242d60bdc14f48b4557c427ded1fe92aedf257d4565c
Successfully built yfinance
Installing collected packages: lxml, yfinance
  Found existing installation: lxml 4.2.6
    Uninstalling lxml-4.2.6:
      Successfully uninstalled lxml-4.2.6
Successfull

In [None]:
# Below are the packages/libraries required to run our code
import pandas as pd # used for data frames
import numpy as np # used for math
import yfinance as yf # used for finance
import plotly.express as px # used for interactive graphing
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression # for regression analysis
from scipy.stats import norm, f, chi2, iqr, t #used for stats

In [None]:
# Ticker symbols whose log returns are compiled and analysed further down
ticker_lst = [
  'DAL',
  'UAL',
  'AAL',
  'SAVE',
  'LUV',
]

# function taking a list of ticker symbols and creates one dataframe with daily log returns by ticker
def compile_log_returns(lst, start='2020-1-1', end='2020-10-1'):
  """Build one DataFrame of daily log returns with one column per ticker.

  For every symbol in ``lst`` the price history between ``start`` and ``end``
  is requested through ``yf.Ticker(symbol).history`` and the log return of each
  row is computed as ``log(Close / Open)`` of that same row.

  Parameters
  ----------
  lst : iterable of str
      Ticker symbols; each becomes a column name of the result.
  start, end : str
      Date bounds forwarded to ``history``. The defaults reproduce the window
      that used to be hard-coded here.

  Returns
  -------
  pandas.DataFrame
      One log-return column per ticker. Columns are aligned on the index of
      the first ticker that was added; an empty ``lst`` gives an empty frame.
  """
  compiled = pd.DataFrame()
  for symbol in lst:
    # interval='1d' asks for daily bars. The previous call passed period='1d'
    # together with start/end; `period` is a look-back window, not a bar size,
    # and is redundant (or rejected, depending on the yfinance version) once an
    # explicit date range is supplied.
    prices = yf.Ticker(symbol).history(interval='1d', start=start, end=end)
    compiled[symbol] = np.log(prices['Close'] / prices['Open'])
  return compiled

## Single Ticker

### Ticker vs Time regression

Perform a regression of the log-return on time

In [None]:
def one_ticker_regression(log_returns_df, ticker_str):
  """Regress one ticker's log returns on elapsed days and plot fit and residuals.

  The regressor is the number of days between each index entry and the first
  index entry of ``log_returns_df`` (the code subtracts index values and reads
  ``.days``, so a datetime-like index is assumed). A scatter plot with the
  fitted line and a residual scatter plot are shown, and R2, slope and
  intercept are printed.

  Parameters
  ----------
  log_returns_df : pandas.DataFrame
      Frame holding a column named ``ticker_str``. It is not modified.
  ticker_str : str
      Name of the column to regress.

  Returns
  -------
  None
  """
  # Work on a private copy so the caller's frame does not silently gain the
  # helper columns 'day_id', 'pred' and 'residuals'; rows without a value for
  # this ticker are dropped because the regression cannot be fitted on NaN.
  work = log_returns_df.dropna(subset=[ticker_str]).copy()
  # Day offsets stay relative to the first row of the original frame.
  work['day_id'] = (work.index - log_returns_df.index[0]).days
  x_2d = work['day_id'].values.reshape(-1, 1)
  y = work[ticker_str]

  # fit regression
  reg = LinearRegression().fit(x_2d, y)
  y_pred = reg.predict(x_2d)
  work["pred"] = y_pred
  work["residuals"] = y - y_pred
  r2 = reg.score(x_2d, y)
  # coef_ is a length-1 array for a single regressor; take the scalar so that
  # '%f' formatting does not depend on implicit array-to-float conversion.
  m = float(reg.coef_[0])
  b = float(reg.intercept_)

  # plot observations with the fitted line on top
  fig = px.scatter(work, x = work.index, y = ticker_str,
                   title = "Regression of Daily Log Returns for " + ticker_str,
                   labels = {"x":"", ticker_str: "Log Return for " + ticker_str},
                   width = 800, height = 400)
  fig.add_trace(px.line(work, x = work.index, y = "pred").data[0])
  fig.show()

  print("%s R2: %.9f\n" % (ticker_str,r2))
  print("%s slope: %.9f\n" % (ticker_str, m))
  print("%s intercept: %.9f\n" % (ticker_str, b))

  # residuals against time
  fig_res = px.scatter(work, x = work.index, y = "residuals",
                       title = "Residuals of Daily Log Returns with Linear Model (%s)" % ticker_str,
                       labels = {"x":"", "residuals":"Residuals"},
                       width = 800, height = 400)
  fig_res.show()
  return

### Ticker Confidence Intervals

Approximate confidence intervals for the means and variances given a confidence level.

In [None]:
#Confidence Interval for Single Ticker
def confidence_interval(df,str1,alpha):
  """Two-sided confidence intervals for the mean and variance of one column.

  Normality of the data is assumed: the mean interval uses Student's t with
  n-1 degrees of freedom and the variance interval uses the chi-square
  distribution with n-1 degrees of freedom.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the column ``str1``.
  str1 : str
      Name of the column to analyse.
  alpha : float
      Confidence level as a fraction (0.95 gives 95% intervals). Despite the
      name this is the coverage, not the significance level.

  Returns
  -------
  list
      ``[[mean_lower, mean_upper], [variance_lower, variance_upper]]``.
  """
  # Missing values are dropped first so that the sample size agrees with the
  # observations that .mean() and .var() actually use (both skip NaN).
  sample = df[str1].dropna()
  # numpy scalar: degenerate samples then propagate NaN instead of raising
  n = np.float64(len(sample))
  dof = n - 1
  sample_mean = sample.mean()
  sample_var = sample.var()

  # tail probabilities of the two-sided interval
  upper_q = alpha + (1 - alpha) / 2
  lower_q = (1 - alpha) / 2

  # mean: sample_mean +/- t * standard error
  half_width = t.ppf(upper_q, dof) * np.sqrt(sample_var / n)
  x_bar = [sample_mean - half_width, sample_mean + half_width]

  # variance: the larger chi-square quantile gives the lower bound
  s2 = [dof * sample_var / chi2.ppf(upper_q, dof),
        dof * sample_var / chi2.ppf(lower_q, dof)]

  # first list: bounds for the population mean; second: bounds for the variance
  return [x_bar, s2]

### Normal Probability Plot

In [None]:
def remove_outliers(df,ticker):
  """Return the values of one column that lie inside the 1.5*IQR fences.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the column ``ticker``. It is not modified.
  ticker : str
      Name of the column to filter.

  Returns
  -------
  pandas.Series
      The entries of ``df[ticker]`` with
      ``Q1 - 1.5*IQR <= value <= Q3 + 1.5*IQR`` (bounds inclusive), keeping
      their original index labels and name. Missing values are excluded.
  """
  data = df[ticker]
  q1 = data.quantile(q=.25)
  q3 = data.quantile(q=.75)
  spread = q3 - q1
  lower_fence = q1 - 1.5*spread
  upper_fence = q3 + 1.5*spread

  # between() is inclusive on both ends and False for NaN, so missing values
  # no longer slip through as "not an outlier". The former
  # `data_clean.columns = [ticker]` was dropped: the result is a Series, which
  # has no columns, so that line only attached a stray attribute.
  return data[data.between(lower_fence, upper_fence)]

def log_return_w_normal(df,ticker):
  """Plot outlier-free log returns of a ticker with a fitted normal curve.

  The column is passed through ``remove_outliers``, a normal distribution is
  fitted to what remains with ``norm.fit``, and its density is drawn over the
  histogram on the range [-0.3, 0.3]. The fitted mean and standard deviation
  are printed.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the column ``ticker``.
  ticker : str
      Name of the column to plot.

  Returns
  -------
  None
  """
  # Clean once and reuse for both the histogram and the fit; dropna() keeps
  # norm.fit from seeing missing values.
  cleaned = remove_outliers(df,ticker).dropna()

  # histnorm puts the bars on a density scale so that they are directly
  # comparable with the normal pdf drawn on top (raw counts are not).
  fig = px.histogram(cleaned, x = ticker,
                     title=("Daily Log Returns of %s With Normal Distribution plot" % (ticker)),
                     nbins=60, range_x = [-0.3, 0.3],
                     histnorm='probability density',
                     width = 800, height = 400)

  # Calculate the mu and std of given stock
  mu, std = norm.fit(cleaned)
  x = np.linspace(-.3, .3, 1000)
  p = norm.pdf(x, mu, std)

  # Plot both together
  fig.add_trace(go.Scatter(x = x, y = p, mode='lines', name='Normal Distribution'))
  fig.show()
  print("Sample Mean: %.5f \tSample Standard Deviation: %.5f" % (mu,std))
  return

def log_return_histogram(df,ticker):
  """Show a histogram of one ticker's log returns and print its fitted normal parameters.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the column ``ticker``.
  ticker : str
      Name of the column to plot.

  Returns
  -------
  None
  """
  returns = df[ticker]
  histogram = px.histogram(
      returns,
      x = ticker,
      title = ("Daily Log Returns of %s" % (ticker)),
      nbins = 100,
      range_x = [-0.3, 0.3],
      width = 800,
      height = 400,
  )
  histogram.show()

  # norm.fit yields the location and scale of the fitted normal, in that order
  fitted = norm.fit(returns)
  print("Sample Mean: %.5f \tSample Standard Deviation: %.5f" % (fitted[0], fitted[1]))
  return None

## Two Tickers

In [None]:
# Two-ticker analysis: equality tests for means and variances, then regression of one log-return on the other
def hypothesis_test(df, str1, str2, alpha):
  """Two-sided tests for equal means and equal variances of two columns.

  Mean test      H0: mu1 = mu2         vs  H1: mu1 != mu2
      statistic (mean1 - mean2) / sqrt(var1/n + var2/m), compared against a
      standard normal quantile.
  Variance test  H0: sigma1^2 = sigma2^2
      statistic var1 / var2, compared against F(n-1, m-1) quantiles.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the columns ``str1`` and ``str2``.
  str1, str2 : str
      Names of the two columns to compare.
  alpha : float
      Confidence level as a fraction (0.95 means tests at the 5% level).

  Returns
  -------
  list of int
      ``[mean_result, variance_result]``; 1 means "do not reject H0",
      0 means "reject H0".
  """
  # Drop missing values per column so that each sample size matches the
  # observations that .mean() and .var() actually use (both skip NaN).
  sample_1 = df[str1].dropna()
  sample_2 = df[str2].dropna()
  # numpy scalars: degenerate samples propagate NaN instead of raising
  n = np.float64(len(sample_1))
  m = np.float64(len(sample_2))
  sample_mean_1 = sample_1.mean()
  sample_mean_2 = sample_2.mean()
  sample_var_1 = sample_1.var()
  sample_var_2 = sample_2.var()

  # tail probabilities of the two-sided acceptance region
  upper_q = alpha + (1-alpha)/2
  lower_q = (1-alpha)/2

  T = (sample_mean_1 - sample_mean_2)/(((sample_var_1/n)+(sample_var_2/m))**.5)
  z_stat = norm.ppf(upper_q)
  F_stat_ub = f.ppf(upper_q,n-1,m-1)
  F_stat_lb = f.ppf(lower_q,n-1,m-1)

  # mean equality: keep H0 when |T| stays within the normal critical value
  means_equal = abs(T) <= z_stat
  # variance equality: keep H0 when the ratio lies between the F quantiles
  variances_equal = F_stat_lb <= (sample_var_1/sample_var_2) <= F_stat_ub

  return [int(means_equal), int(variances_equal)]

def main_two_ticker(df, str1, str2, alpha):
  """Regress one ticker's log returns on another's and test mean/variance equality.

  ``str1`` is the regressor (x) and ``str2`` the response (y). The function
  shows a scatter plot with the fitted line, prints R2, slope and intercept,
  prints the conclusions of ``hypothesis_test`` for the two columns, and shows
  a plot of the regression residuals against ``str1``.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame containing the columns ``str1`` and ``str2``. It is not modified.
  str1, str2 : str
      Column names of the regressor and the response.
  alpha : float
      Confidence level forwarded to ``hypothesis_test``.

  Returns
  -------
  None
  """
  # Private copy restricted to rows where both tickers have a value: the fit
  # cannot handle NaN, and the caller's frame must not gain the helper
  # columns 'pred' and 'residuals'.
  work = df.dropna(subset=[str1, str2]).copy()

  # get regression info
  x = work[str1].values.reshape((-1, 1))
  y = work[str2]
  # Create linear regression object and fit it to data
  reg = LinearRegression().fit(x, y)
  reg_pred_y = reg.predict(x)
  reg_r2 = reg.score(x, y)
  # coef_ is a length-1 array for a single regressor; take the scalar so that
  # '%f' formatting does not depend on implicit array-to-float conversion.
  reg_slope = float(reg.coef_[0])
  reg_intercept = float(reg.intercept_)
  work["pred"] = reg_pred_y
  work["residuals"] = y - reg_pred_y

  # plot figure
  axis_label = " (log daily return)"
  fig = px.scatter(work, x = str1, y = str2,
                   title = ("Log Daily Returns of %s vs %s" % (str1, str2)),
                   labels={str1: str1 + axis_label, str2: str2 + axis_label},
                   width = 800, height = 400)
  fig.add_trace(px.line(work, x = str1, y = "pred").data[0])
  fig.show()

  # The response is str2 and the regressor is str1, i.e. "str2 on str1";
  # the labels used to name the two tickers the other way round.
  print("%s on %s R2: %.08f\n"% (str2,str1,reg_r2))
  print("%s on %s slope: %.08f\n"% (str2,str1,reg_slope))
  print("%s on %s intercept: %.08f\n"% (str2,str1,reg_intercept))

  # hypothesis_test returns [mean_result, variance_result]; 1 = do not reject H0
  hypothesis_test_results = hypothesis_test(df,str1,str2,alpha)
  if hypothesis_test_results[0] == 1:
    print("Do not reject Null Hypothesis: Population Mean of %s = Population Mean of %s" % (str1, str2))
  else:
    print("Reject Null Hypothesis: Population Mean of %s != Population Mean of %s" % (str1, str2))
  if hypothesis_test_results[1] == 1:
    print("Do not reject Null Hypothesis: Population Variance of %s = Population Variance of %s" % (str1, str2))
  else:
    print("Reject Null Hypothesis: Population Variance of %s != Population Variance of %s" % (str1, str2))

  fig_res = px.scatter(work, x = str1, y = "residuals", title = "Residuals of %s vs %s" % (str1, str2),
                   width = 800, height = 400)
  fig_res.show()

  return

### Main Function for Single Ticker Tests




In [None]:
# main function for single stock 
def main_single_ticker(dataframe,alpha,stock):
  """Run every single-ticker step for ``stock`` and print its confidence intervals.

  Calls, in order, ``log_return_histogram``, ``log_return_w_normal``,
  ``one_ticker_regression`` and ``confidence_interval``, then prints the
  intervals for the population mean and variance.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame containing the column ``stock``.
  alpha : float
      Confidence level as a fraction (0.95 gives 95% intervals).
  stock : str
      Name of the column to analyse.

  Returns
  -------
  None
  """
  log_return_histogram(dataframe,stock)
  log_return_w_normal(dataframe,stock)
  one_ticker_regression(dataframe,stock)
  CI_Data = confidence_interval(dataframe,stock,alpha)
  mean_bounds, variance_bounds = CI_Data[0], CI_Data[1]

  # Report the level that was actually requested; the text used to say
  # "95 Percent" no matter which alpha was passed in.
  level = 100 * alpha
  print("%g Percent Confidence Interval for Population Mean of %s: [%f, %f]" % (level, stock, mean_bounds[0], mean_bounds[1]))
  print("%g Percent Confidence Interval for Population Variance of %s: [%f, %f]" % (level, stock, variance_bounds[0], variance_bounds[1]))
  return


# **User Interface**


In [None]:
#  #  #  #  #  #  #  #  #  #  #  #
# #USER INPUTS FOR SINGLE STOCK# #
#  #  #  #  #  #  #  #  #  #  #  #

# Build the log-return table once; the two-stock cell further down reuses `data`
data = compile_log_returns(ticker_lst)

# Choose from AAL, DAL, LUV, SAVE, UAL
stock = "UAL"
# Choose the confidence level as a fraction (0.95 gives 95% intervals);
# note this is the coverage, not the significance level
alpha = 0.95

# Runs the histogram, normal-overlay, time-regression and confidence-interval steps for `stock`
main_single_ticker(data, alpha, stock)

Sample Mean: -0.00811 	Sample Standard Deviation: 0.05200


Sample Mean: -0.00380 	Sample Standard Deviation: 0.03917


UAL R2: 0.014006372

UAL slope: 0.000078130

UAL intercept: -0.018781852



95 Percent Confidence Interval for Population Mean of UAL: [-0.015590, -0.000626]
95 Percent Confidence Interval for Population Variance of UAL: [0.002243, 0.003365]


In [None]:
#  #  #  #  #  #  #  #  #  #  #  #
# # USER INPUTS FOR TWO STOCKS # #
#  #  #  #  #  #  #  #  #  #  #  #

# Choose from AAL, DAL, LUV, SAVE, UAL
# stock1 is used as the regressor (x) and stock2 as the response (y) in main_two_ticker
stock1 = "SAVE"
stock2 = "UAL"
# Confidence level as a fraction, forwarded to the mean/variance equality tests
alpha = 0.95

# `data` is the log-return table built in the single-stock cell above; run that cell first
main_two_ticker(data, stock1, stock2,alpha)

SAVE on UAL R2: 0.59814313

SAVE on UAL slope: 0.73767053

SAVE on UAL intercept: -0.00067003

Do not reject Null Hypothesis: Population Mean of SAVE = Population Mean of UAL
Do not reject Null Hypothesis: Population Variance of SAVE = Population Variance of UAL
