In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas.io.data as web
import Quandl
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.api import abline_plot
import patsy
import seaborn as sns
sns.set(context='notebook', style='whitegrid', palette='deep', font='sans-serif', font_scale=1, rc=None)
from scipy import stats

In [2]:
# Exercise # 1 - Generate two independent series of 1000 random integers and
# fit a bivariate OLS regression of y on x.
x = np.random.randint(0, 9, 1000)
y = np.random.randint(0, 9, 1000)
# Name the columns explicitly so the formula terms 'y' and 'x' are taken from
# `data`. Merging two unnamed single-column frames does not produce columns
# called 'x' and 'y', which left the `data` argument effectively unused.
data = pd.DataFrame({'y': y, 'x': x})
mod = smf.ols(formula='y ~ x', data=data).fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.1263
Date:                Fri, 25 Sep 2015   Prob (F-statistic):              0.722
Time:                        08:34:10   Log-Likelihood:                -2360.4
No. Observations:                1000   AIC:                             4725.
Df Residuals:                     998   BIC:                             4735.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      3.9370      0.153     25.756      0.0

For these two randomly generated series (x and y, with 1,000 observations each), the absolute value of the t-statistic of the slope is below 2. At the 95% confidence level we therefore fail to reject the null hypothesis (H0: B1 = 0): the estimated slope of x is close to 0 and its p-value is high (72.2%, which in a one-regressor model equals the Prob (F-statistic) reported above), so there is no evidence that the two variables are related to each other.

In [3]:
# Exercise # 2 - CAPM with changed stock
# Download IBM and NASDAQ data from Quandl, starting on the same date for both.

start_date = "2010-9-1"
IBM_all = Quandl.get("YAHOO/IBM", trim_start=start_date)
nasdaq_all = Quandl.get("NASDAQOMX/COMP", trim_start=start_date)

# Keep only the price column of each download.
IBM, nasdaq = IBM_all['Adjusted Close'], nasdaq_all['Index Value']

In [4]:
# Line chart of the IBM adjusted-close series on a 12 x 10 inch figure.
canvas_size = (12, 10)
plt.figure(figsize=canvas_size)
IBM.plot(color='blue')
plt.title('IBM ($/Share)', fontsize=20)

<matplotlib.text.Text at 0x18a99a58>

<img src = "Figure_1.png">

In [5]:
# Line chart of the NASDAQ index-value series on a 12 x 10 inch figure.
canvas_size = (12, 10)
plt.figure(figsize=canvas_size)
nasdaq.plot(color='blue')
plt.title('NASDAQ', fontsize=20)

<matplotlib.text.Text at 0x189bb710>

<img src="Figure_2.png"> 

In [6]:
# Daily log returns: ln(P_t / P_{t-1}).
# shift(1) leaves the first entry without a predecessor (NaN); dropna()
# removes the missing entries from each series.
IBM_returns = np.log(IBM / IBM.shift(1)).dropna()
nasdaq_returns = np.log(nasdaq / nasdaq.shift(1)).dropna()

In [7]:
# Mean and standard deviation of the IBM log returns, separated by a space.
# Written as a print(...) call with one pre-formatted string so the line
# behaves the same under Python 2 and Python 3.
print('%s %s' % (IBM_returns.mean(), IBM_returns.std()))

0.000190036544099 0.0119003809381


In [8]:
# Histogram of the IBM daily log returns in 50 bins.
plt.figure(figsize=(12, 10))
# normed=True rescales the bars so that the histogram integrates to 1.
# NOTE(review): newer matplotlib releases spell this option density=True.
plt.hist(IBM_returns, bins=50, normed=True, color='blue')
plt.title('Histogram of IBM Daily Returns Since 2010', fontsize=20)
# The bar heights of a normalised histogram are densities, not percentages.
plt.ylabel('Density', fontsize=18)

<matplotlib.text.Text at 0x18cd8358>

<img src = "Figure_3.png">

In [9]:
# Mean and standard deviation of the NASDAQ log returns, separated by a space.
# Written as a print(...) call with one pre-formatted string so the line
# behaves the same under Python 2 and Python 3.
print('%s %s' % (nasdaq_returns.mean(), nasdaq_returns.std()))

0.00060988823878 0.0107742281971


In [10]:
# Histogram of the NASDAQ daily log returns in 50 bins.
plt.figure(figsize=(12, 10))
# normed=True rescales the bars so that the histogram integrates to 1.
# NOTE(review): newer matplotlib releases spell this option density=True.
plt.hist(nasdaq_returns, bins=50, normed=True, color='blue')
plt.title('Histogram of NASDAQ Daily Returns Since 2010', fontsize=20)
# The bar heights of a normalised histogram are densities, not percentages.
plt.ylabel('Density', fontsize=18)

<matplotlib.text.Text at 0x1b027518>

<img src = "Figure_4.png">

In [11]:
# Wrap each return series in a DataFrame so the two can be merged and passed
# to the regression.
IBM_returns, nasdaq_returns = (
    pd.DataFrame(returns) for returns in (IBM_returns, nasdaq_returns)
)

In [12]:
# Confirm both objects are DataFrames by printing their info() summaries.
for frame in (IBM_returns, nasdaq_returns):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1274 entries, 2010-09-02 to 2015-09-24
Data columns (total 1 columns):
Adjusted Close    1274 non-null float64
dtypes: float64(1)
memory usage: 19.9 KB
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1274 entries, 2010-09-02 to 2015-09-24
Data columns (total 1 columns):
Index Value    1274 non-null float64
dtypes: float64(1)
memory usage: 19.9 KB


In [13]:
# Scatter plot of IBM log returns against NASDAQ log returns.
plt.figure(figsize=(12, 10))
# First argument is the x axis, second the y axis: the market (NASDAQ) goes
# on x and the stock (IBM) on y, so the points agree with the axis labels.
plt.scatter(nasdaq_returns, IBM_returns)
plt.title('CAPM Data', fontsize=20)
plt.xlabel('Log Returns of NASDAQ', fontsize=18)
plt.ylabel('Log Returns of IBM', fontsize=18)
# Same fixed range on both axes so the two return scales are comparable.
plt.xlim([-0.1, 0.1])
plt.ylim([-0.1, 0.1])

(-0.1, 0.1)

<img src = "Figure_5.png">

In [14]:
# Join the two return frames on their index and give the columns short names
# that can be used directly as formula terms.
column_names = {'Index Value': 'nasdaq', 'Adjusted Close': 'IBM'}
data = pd.merge(
    nasdaq_returns, IBM_returns, left_index=True, right_index=True
).rename(columns=column_names)

In [15]:
# Column-wise mean and standard deviation of the merged returns.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(data.mean())
print(data.std())

nasdaq    0.00061
IBM       0.00019
dtype: float64
nasdaq    0.010774
IBM       0.011900
dtype: float64


In [16]:
# CAPM regression: IBM log returns on NASDAQ log returns. The Intercept is the
# alpha and the nasdaq coefficient is the beta.
capm_model = smf.ols(formula='IBM ~ nasdaq', data=data)
mod = capm_model.fit()
print(mod.summary())

                            OLS Regression Results                            
Dep. Variable:                    IBM   R-squared:                       0.384
Model:                            OLS   Adj. R-squared:                  0.383
Method:                 Least Squares   F-statistic:                     792.2
Date:                Fri, 25 Sep 2015   Prob (F-statistic):          6.80e-136
Time:                        08:34:48   Log-Likelihood:                 4146.5
No. Observations:                1274   AIC:                            -8289.
Df Residuals:                    1272   BIC:                            -8279.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     -0.0002      0.000     -0.867      0.3

In [17]:
# Scatter plot of the CAPM data with the fitted OLS regression line on top.

figure, ax = plt.subplots(figsize=(12, 10))
# The regression is IBM ~ nasdaq, so NASDAQ must be on the x axis and IBM on
# the y axis; otherwise the points match neither the axis labels nor the line.
ax.scatter(data['nasdaq'], data['IBM'])
# Refit here so the cell only needs `data` to draw the line.
mod = smf.ols(formula='IBM ~ nasdaq', data=data).fit()
abline_plot(model_results=mod, ax=ax, color='red')

ax.set_title('CAPM Data', fontsize=20)
ax.set_ylabel('Log Returns of IBM', fontsize=18)
ax.set_xlabel('Log Returns of NASDAQ', fontsize=18)
ax.set_xlim([-0.1, 0.1])
ax.set_ylim([-0.1, 0.1])

(-0.1, 0.1)

<img src = "Figure_6.png">

In [18]:
# F-test of the null hypothesis beta = 1 (coefficient of nasdaq); prints the
# F statistic, its p-value and the degrees of freedom.
print(mod.f_test("nasdaq = 1"))

<F test: F=array([[ 168.69341925]]), p=2.60074086591e-36, df_denom=1272, df_num=1>


In [19]:
# F-test of the null hypothesis alpha = 0 (the Intercept); prints the
# F statistic, its p-value and the degrees of freedom.
print(mod.f_test("Intercept = 0"))

<F test: F=array([[ 0.75107292]]), p=0.386300194091, df_denom=1272, df_num=1>


In [20]:
# Joint F-test of the null hypothesis beta = 1 and alpha = 0; prints the
# F statistic, its p-value and the degrees of freedom.
print(mod.f_test("nasdaq = 1, Intercept = 0"))

<F test: F=array([[ 85.63237151]]), p=1.28749390836e-35, df_denom=1272, df_num=2>


From these results we can infer that:
    
1. We cannot reject the null hypothesis that B0 (alpha, the intercept) is equal to 0, because its p-value is high (0.386). There is therefore no evidence that, since 2010, IBM's daily returns systematically differed from what its exposure to the NASDAQ market would predict.
2. We reject the null hypothesis that B1 (beta, the slope) = 1 at the 95% confidence level, given the very low p-value of the test (approximately 0.00%). Since the estimated beta is below 1, IBM's daily returns are less sensitive to market movements than the market itself, which means the stock carries less systematic risk for investors.
3. For the combined test, we reject the joint null hypothesis that B0 = 0 (alpha, the intercept) and B1 = 1 (beta, the slope) at the 95% confidence level, due to the low p-value of the test.


In [50]:
# Exercise 3 - Import train.dta and perform a linear regression

# Load the Stata file into a pandas DataFrame.
traindata = pd.read_stata('train.dta')

# Print the mean, variance and standard deviation of x1 and d, one statistic
# at a time and x1 before d.
print('Summary Statistics')
summary_stats = (('mean', np.mean), ('Variance', np.var), ('Std', np.std))
for stat_label, stat_func in summary_stats:
    for column in ('x1', 'd'):
        # Build the complete line before printing: print('..') + str(..) only
        # works as a Python 2 print statement and fails on Python 3.
        print(column + ' ' + stat_label + ': ' + str(stat_func(traindata[column])))

# OLS regression of d on x1.
mod = smf.ols(formula='d ~ x1', data=traindata).fit()
print(mod.summary())

Summary Statistics
x1 mean: 0.487375587225
d mean: 0.476999998093
x1 Variance: 0.0802043154836
d Variance: 0.249469965696
x1 Std: 0.283203664319
d Std: 0.499469684462
                            OLS Regression Results                            
Dep. Variable:                      d   R-squared:                       0.332
Model:                            OLS   Adj. R-squared:                  0.331
Method:                 Least Squares   F-statistic:                     495.0
Date:                Fri, 25 Sep 2015   Prob (F-statistic):           2.21e-89
Time:                        09:07:10   Log-Likelihood:                -523.32
No. Observations:                1000   AIC:                             1051.
Df Residuals:                     998   BIC:                             1060.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t

- Given the results of the OLS regression of d on x1 and the p-value of the regression (effectively 0%), we can reject the null hypothesis that the coefficient of x1 is 0 at the 95% confidence level, which indicates that changes in the predictor (x1) are associated with changes in the response variable (d).
- Given the results of the OLS regression for the intercept, we fail to reject the null hypothesis that it is equal to 0 at the 95% confidence level, due to the high value of its p-value (0.486).

In [58]:
# Calculate the estimated value of D from the estimated constants B0 and B1.

B0 = -0.0180
B1 = 1.0155


def predict_d(x1, b0=B0, b1=B1):
    """Return the fitted value b0 + x1 * b1 of the linear model.

    x1 -- value of the predictor.
    b0 -- intercept (defaults to B0).
    b1 -- slope (defaults to B1).
    """
    return b0 + x1 * b1


X1 = 0.65
DE1 = predict_d(X1)
# Concatenate inside the call: print('..') + str(..) only works as a Python 2
# print statement and raises a TypeError on Python 3.
print('DE1 =' + str(DE1))

X2 = 1.01
DE2 = predict_d(X2)
print('DE2 =' + str(DE2))

DE1 =0.642075
DE2 =1.007655


For the first estimation (X1 = 0.65, predicted D = 0.642) we would not classify the email as spam; for the second estimation (X1 = 1.01, predicted D = 1.008) we would classify the email as spam.

In [24]:
# Exercise 4 - DGP Monte Carlo simulation
# Question 1

# Draw the error term and the feature from normal(0, 1), one value per
# observation.
n_obs = 1000
e1 = np.random.normal(0, 1, n_obs)
x = np.random.normal(0, 1, n_obs)

# Data-generating process for y1: intercept 1, slope 2, plus the error.
y1 = 2 * x + 1 + e1

In [25]:
# Put the simulated y1 and x into a single DataFrame, run the OLS regression
# and print its results.
# The columns are named explicitly so that the formula terms are taken from
# data1; merging two unnamed single-column frames does not produce columns
# called 'y1' and 'x'. np.ravel accepts both 1-D arrays and single-column
# frames.
data1 = pd.DataFrame({'y1': np.ravel(y1), 'x': np.ravel(x)})

mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.794
Model:                            OLS   Adj. R-squared:                  0.794
Method:                 Least Squares   F-statistic:                     3857.
Date:                Wed, 23 Sep 2015   Prob (F-statistic):               0.00
Time:                        19:44:48   Log-Likelihood:                -1430.6
No. Observations:                1000   AIC:                             2865.
Df Residuals:                     998   BIC:                             2875.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      0.9688      0.032     30.219      0.0

In [26]:
# Question 2 - Perform the same exercise 4 additional times

# Draw the normal error term and the feature.
e1 = np.random.normal(0, 1, 1000)
x = np.random.normal(0, 1, 1000)

# DGP for y1
y1 = 1 + 2 * x + e1

# Collect y1 and x in one DataFrame with named columns, so that the formula
# terms are taken from data1 itself, then fit the OLS regression and print
# its results.
data1 = pd.DataFrame({'y1': y1, 'x': x})

mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()
print(mod1.summary())


                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.786
Model:                            OLS   Adj. R-squared:                  0.785
Method:                 Least Squares   F-statistic:                     3657.
Date:                Wed, 23 Sep 2015   Prob (F-statistic):               0.00
Time:                        19:44:49   Log-Likelihood:                -1439.1
No. Observations:                1000   AIC:                             2882.
Df Residuals:                     998   BIC:                             2892.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0469      0.032     32.360      0.0

In [27]:
# Draw the normal error term and the feature.
e1 = np.random.normal(0, 1, 1000)
x = np.random.normal(0, 1, 1000)

# DGP for y1
y1 = 1 + 2 * x + e1

# Collect y1 and x in one DataFrame with named columns, so that the formula
# terms are taken from data1 itself, then fit the OLS regression and print
# its results.
data1 = pd.DataFrame({'y1': y1, 'x': x})

mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.787
Model:                            OLS   Adj. R-squared:                  0.787
Method:                 Least Squares   F-statistic:                     3688.
Date:                Wed, 23 Sep 2015   Prob (F-statistic):               0.00
Time:                        19:44:50   Log-Likelihood:                -1456.2
No. Observations:                1000   AIC:                             2916.
Df Residuals:                     998   BIC:                             2926.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0008      0.033     30.451      0.0

In [28]:
# Draw the normal error term and the feature.
e1 = np.random.normal(0, 1, 1000)
x = np.random.normal(0, 1, 1000)

# DGP for y1
y1 = 1 + 2 * x + e1

# Collect y1 and x in one DataFrame with named columns, so that the formula
# terms are taken from data1 itself, then fit the OLS regression and print
# its results.
data1 = pd.DataFrame({'y1': y1, 'x': x})

mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.797
Model:                            OLS   Adj. R-squared:                  0.796
Method:                 Least Squares   F-statistic:                     3908.
Date:                Wed, 23 Sep 2015   Prob (F-statistic):               0.00
Time:                        19:44:51   Log-Likelihood:                -1416.6
No. Observations:                1000   AIC:                             2837.
Df Residuals:                     998   BIC:                             2847.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0210      0.032     32.323      0.0

In [29]:
# Draw the normal error term and the feature.
e1 = np.random.normal(0, 1, 1000)
x = np.random.normal(0, 1, 1000)

# DGP for y1
y1 = 1 + 2 * x + e1

# Collect y1 and x in one DataFrame with named columns, so that the formula
# terms are taken from data1 itself, then fit the OLS regression and print
# its results.
data1 = pd.DataFrame({'y1': y1, 'x': x})

mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()
print(mod1.summary())

                            OLS Regression Results                            
Dep. Variable:                     y1   R-squared:                       0.805
Model:                            OLS   Adj. R-squared:                  0.805
Method:                 Least Squares   F-statistic:                     4131.
Date:                Wed, 23 Sep 2015   Prob (F-statistic):               0.00
Time:                        19:44:51   Log-Likelihood:                -1415.8
No. Observations:                1000   AIC:                             2836.
Df Residuals:                     998   BIC:                             2845.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept      1.0799      0.032     34.221      0.0

In [21]:
# Question 3 - Repeat the experiment 1000 times, keep the B1 estimate of each
# repetition and plot a histogram of those B1 values.

# One entry per repetition: the fitted parameters (Intercept and x).
params = []

# Repeat the experiment 1000 times.
for i in range(0, 1000):
    e1 = np.random.normal(0, 1, 1000)
    x = np.random.normal(0, 1, 1000)

    # DGP for y1
    y1 = 1 + 2 * x + e1

    # Named columns let the formula take y1 and x from data1 itself; building
    # the frame directly also avoids one merge per repetition.
    data1 = pd.DataFrame({'y1': y1, 'x': x})
    mod1 = smf.ols(formula='y1 ~ x', data=data1).fit()

    # Store the estimated parameters (B0 and B1) of this repetition.
    params.append(mod1.params)

# One row per repetition, one column per parameter.
paramsdf = pd.DataFrame(params)
# Keep only the slope estimates: drop the Intercept column (axis by keyword).
beta1dff = paramsdf.drop("Intercept", axis=1)
# Check the shape: one B1 estimate per repetition, in a single column.
assert beta1dff.shape == (len(params), 1)
# Plot on the axes that DataFrame.plot creates and size the figure there; a
# separate plt.figure() call beforehand only leaves an empty extra figure.
ax = beta1dff.plot(kind='hist', color='blue', figsize=(10, 10))
ax.set_title('Histogram of B1 results - 1000 repetitions', fontsize=20)
ax.set_ylabel('# of Ocurrences', fontsize=18)

<matplotlib.text.Text at 0x1ba75240>

<img src = "Figure_7.png">

In this simulation, the histogram shows that the 1,000 estimates of the B1 parameter (one per repetition, each obtained from a sample of 1,000 observations) appear to be normally distributed, which is expected because both X1 and E1 are drawn from normal distributions in every repetition. The dispersion of the B1 estimates in the histogram therefore represents the sampling variability of the estimator: it reflects how the random draws of the errors and of the X1 values change the estimate from one sample to the next over the 1,000 samples.

In [22]:
# Question 4 - Obtain the distribution of a functional transformation of B1:
# exp(B1).

# Apply exp() to every simulated B1 estimate; the result is a new DataFrame
# with the same column as beta1dff.
ebeta1dff = beta1dff.apply(np.exp)
# Histogram of the simulated exp(B1) values. Plot on the axes that
# DataFrame.plot creates and size the figure there; a separate plt.figure()
# call beforehand only leaves an empty extra figure.
ax = ebeta1dff.plot(kind='hist', color='blue', figsize=(10, 10))
ax.set_title('Histogram of exp(B1) results - 1000 repetitions', fontsize=20)
ax.set_ylabel('# of Ocurrences', fontsize=18)

<matplotlib.text.Text at 0x1bcd67f0>

<img src = "Figure_8.png">

In [23]:
# Anderson-Darling test of the simulated exp(B1) values against the normal
# distribution; the result is shown as the cell output.
exp_beta1_sample = ebeta1dff['x']
stats.anderson(exp_beta1_sample, dist='norm')

(0.21825696999962929,
 array([ 0.574,  0.653,  0.784,  0.914,  1.088]),
 array([ 15. ,  10. ,   5. ,   2.5,   1. ]))

To test what type of distribution the data follows, we use 2 tools:

1. A histogram of the sample, to analyze visually whether it is normally distributed
2. An Anderson-Darling test, to check the normality of the sample statistically

In this case, given the histogram of the values of exp(B1) and the A² statistic (0.218), which is below every critical value of the Anderson-Darling test (at the 15%, 10%, 5%, 2.5% and 1% significance levels), we have no reason to reject the null hypothesis that the data are normally distributed.