# 1 Load libraries and data

In [1]:
#Libraries
import numpy as np
import pandas as pd
import pandas_profiling as pp
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math 
import pickle
import pylab
from scipy import stats
from scipy.stats import kurtosis, skew
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#Settings
%matplotlib inline
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50
pd.set_option('display.precision', 2)
pd.options.display.float_format = '{:8,.2f}'.format

In [2]:
#load data
data = pd.read_excel('..\\data\\LeadData_Customer_Mod.xlsx', index_col='OriginalCustomerNumber')
#Drop text 
data = data.drop([],axis = 1)
#data.head()
#data.shape
#Convert data to float for calculations
data = data.astype(float)
#Verify data types
#data.dtypes
data.shape

(4026, 110)

# 2 Validate data

In [6]:
#Filter data as assumption (BIAS TO BE DESCRIBED)
#data_filtered = data.drop(data.index[data.LoneStar == 1]) #LTDSMU should not be 99999 or higher
data_filtered = data.drop(columns=['LoneStar'                                 
                                    ,'Family_Accounts'
                                    ,'LN_Family_Accounts'
                                    ,'AverageDaysToPay'
                                    ,'LN_AverageDaysToPay'
                                    ,'Expenditure_12M_Equipment'
                                    ,'LN_Expenditure_12M_Equipment'
                                    ,'Expenditure_12M_PartsNServices'
#                                    ,'LN_Expenditure_12M_PartsNServices'
                                    ,'Expenditure_12M_Rentals'
                                    ,'LN_Expenditure_12M_Rentals'
                                    ,'Family_Expenditure_12M_Equipment'
                                    ,'LN_Family_Expenditure_12M_Equipment'
                                    ,'Family_Expenditure_12M_PartsNServices'
                                    ,'LN_Family_Expenditure_12M_PartsNServices'
                                    ,'Family_Expenditure_12M_Rentals'
                                    ,'LN_Family_Expenditure_12M_Rentals'
                                    ,'Family_CatFleetSize'
                                    ,'LN_Family_CatFleetSize'
                                    ,'Family_FleetSize'
                                    ,'LN_Family_FleetSize'
                                     ,'Count_CatFleetSize'
                                     ,'LN_Count_CatFleetSize'
                                    ,'Count_CatFleetSize_BINS_0'
                                     ,'Count_CatFleetSize_BINS_1'
                                    ,'Count_CatFleetSize_BINS_2_5'
                                    ,'Count_CatFleetSize_BINS_6_10'
                                    ,'Count_CatFleetSize_BINS_11_15'
                                    ,'Count_CatFleetSize_BINS_16_20'
                                    ,'Count_CatFleetSize_BINS_21-25'
                                    ,'Count_CatFleetSize_BINS_26_PLUS'
                                    ,'Count_FleetSize'
                                    ,'LN_Count_FleetSize'
                                    ,'Percentage_PLSubscriptionActive'
                                    ,'LN_Percentage_PLSubscriptionActive'
                                    ,'Percentage_Cat'
                                    ,'LN_Percentage_Cat'
                                    ,'PercentageFleetUtilized'
                                    ,'LN_PercentageFleetUtilized'
                                    ,'Avg_AnnualUsage'
#                                     ,'LN_Avg_AnnualUsage'
                                    ,'OLGAAmount'
#                                     ,'LN_OLGAAmount'
                                    ,'PartsDCAL'
#                                     ,'LN_PartsDCAL'
                                    ,'PartsDCAL_BINS_0'
                                   ,'PartsDCAL_BINS_0_10'
                                   ,'PartsDCAL_BINS_10_35'
                                   ,'PartsDCAL_BINS_35_70'
                                   ,'PartsDCAL_BINS_70_90'
                                   ,'PartsDCAL_BINS_90_100'
                                   ,'PartsDCAL_BINS_100'
                                    ,'ServiceDCAL'
#                                     ,'LN_ServiceDCAL'
                                   ,'ServiceDCAL_BINS_0'
                                   ,'ServiceDCAL_BINS_0_10'
                                   ,'ServiceDCAL_BINS_10_35'
                                   ,'ServiceDCAL_BINS_35_70'
                                   ,'ServiceDCAL_BINS_70_90'
                                   ,'ServiceDCAL_BINS_90_100'
                                    ,'ServiceDCAL_BINS_100'
                                    ,'Average_Fleet_Age_Months'
                                    ,'LN_Average_Fleet_Age_Months'
                                    ,'Average_Fleet_Hours'
                                    ,'LN_Average_Fleet_Hours'
                                    ,'PARTS_CU_0_12M'
                                    ,'LN_PARTS_CU_0_12M'
                                    ,'PARTS_Total_Per_CatFleetSize'
                                    ,'LN_PARTS_Total_Per_CatFleetSize'
                                    ,'PARTS_Total_Per_500HR_Utilized'
                                    ,'LN_PARTS_Total_Per_500HR_Utilized'
                                    ,'PARTS_COUNT_CU_0_12M'
                                    ,'PARTS_Count_Per_CatFleetSize'
                                    ,'LN_PARTS_Count_Per_CatFleetSize'
                                    ,'PARTS_Count_Per_500HR_Utilized'
                                    ,'LN_PARTS_Count_Per_500HR_Utilized'
                                    ,'PartsReturnPercentage_CU'
                                    ,'PartsReturnPercentage_BINS_0'
                                    ,'PartsReturnPercentage_BINS_0_10'
                                    ,'PartsReturnPercentage_BINS_10_35'
                                    ,'PartsReturnPercentage_BINS_35_70'
                                    ,'PartsReturnPercentage_BINS_70_90'
                                    ,'PartsReturnPercentage_BINS_90_100'
                                    ,'PartsReturnPercentage_BINS_100'
                                    ,'SERVICES_CU_0_12M'
                                    ,'LN_SERVICES_CU_0_12M'
                                    ,'SERVICES_Total_Per_CatFleetSize'
                                    ,'LN_SERVICES_Total_Per_CatFleetSize'
                                    ,'SERVICES_Total_Per_500HR_Utilized'
                                    ,'LN_SERVICES_Total_Per_500HR_Utilized'
                                    ,'SERVICES_COUNT_CU_0_12M'
                                    ,'LN_SERVICES_COUNT_CU_0_12M'
                                    ,'SERVICES_Count_Per_CatFleetSize'
                                    ,'LN_SERVICES_Count_Per_CatFleetSize'
                                    ,'SERVICES_Count_Per_500HR_Utilized'
                                    ,'LN_SERVICES_Count_Per_500HR_Utilized'
                                    ,'ServicesBackOutPercentage_CU'
                                    ,'ServiceBackOutPercentage_BINS_0'
                                    ,'ServiceBackOutPercentage_BINS_0_10'
                                    ,'ServiceBackOutPercentage_BINS_10_35'
                                    ,'ServiceBackOutPercentage_BINS_35_70'
                                    ,'ServiceBackOutPercentage_BINS_70_90'
                                    ,'ServiceBackOutPercentage_BINS_90_100'
                                    ,'ServiceBackOutPercentage_BINS_100'
                                    ,'SOS_COUNT_CU_0_12M'
                                    ,'LN_SOS_COUNT_CU_0_12M'
                                    ,'SOS_Count_Per_CatFleetSize'
                                    ,'LN_SOS_Count_Per_CatFleetSize'
                                    ,'SOS_Count_Per_500HR_Utilized'
                                    ,'LN_SOS_Count_Per_500HR_Utilized'
                                  ])
#data_filtered = data_filtered.drop(data_filtered.index[data_filtered.PartsDCAL < 10 ])   #Drop customers with POPS = 0
#data_filtered = data_filtered.drop(data_filtered.index[data_filtered.Count_CatFleetSize ==0 ])   #Drop customers with POPS = 0
#data_filtered = data_filtered.drop(data_filtered.index[data_filtered.OLGAAmount == 0]) #Eliminate OLGA = 0
data_filtered.columns

Index(['LN_Expenditure_12M_PartsNServices', 'LN_Avg_AnnualUsage',
       'LN_OLGAAmount', 'LN_PartsDCAL', 'LN_ServiceDCAL'],
      dtype='object')

In [8]:
pip install pandas-profiling

Note: you may need to restart the kernel to use updated packages.


In [9]:
#Describe data via panda profiler
eda = pp.ProfileReport(data_filtered)
display(eda)

TypeError: concat() got an unexpected keyword argument 'join_axes'

# 3 Correlation Matrix

In [None]:
# #Compute the correlation matrix
corr_matrix = round(data_filtered.corr(),2)
display(corr_matrix)

def get_sorted_corr(df):
    crr = df.corr().abs().unstack()
# #     labels_to_drop = get_redundant_pairs(df)
# #     crr = crr.drop(labels=labels_to_drop).sort_values(ascending=False)
    return crr

# def get_redundant_pairs(df):
#     pairs_to_drop = set()
#     cols = df.columns
# #     for i in range(0, df.shape[1]):
# #         for j in range(0, i+1):
# #             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop

# #Unstack and print to excel
get_sorted_corr(data_filtered)
sorted_corr = get_sorted_corr(data_filtered)
sorted_corr.to_csv('..\\data\\Output\\Sorted_Correlation_Equipment_Mod.csv')

In [None]:
# # Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=np.bool))

# # Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(4,4))

# # Generate a custom diverging colormap
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

heatmap = sns.heatmap(corr_matrix, mask=mask, cmap=cmap,
                      square=True, linewidths=3,
                      cbar_kws = {'shrink': 1},
                      vmin=-1, vmax=1,
                      annot=True, annot_kws = {'size': 1})

# add the column names as labels
ax.set_yticklabels(corr_matrix.columns)
ax.set_xticklabels(corr_matrix.columns)

plt.show()
# plt.savefig('..\\data\\Output\\Correlation_Matrix_Equipment')

# 4 Evaluate Variables Y = LN_Expenditure_12M_PartsNServices

## Variable test LN_Avg_AnnualUsage

In [5]:
#Create subset test for testing ideal element
test = data_dropna.loc[:, ['LN_Expenditure_12M_PartsNServices','LN_Avg_AnnualUsage']]

X = test[['LN_Avg_AnnualUsage']]
Y = test[['LN_Expenditure_12M_PartsNServices']]

#test.describe()
#scatter plot
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=3, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
#plt.savefig('..\\data\\Output\\LN_Avg_AnnualUsage')

NameError: name 'data_dropna' is not defined

In [None]:
test.corr()

In [None]:
test.hist(grid = True, color = 'CadetBlue')

In [None]:
# create a OLS model.
model = sm.OLS(Y, X)

#Print
results = model.fit()
print(results.summary())

In [None]:
# estimate the p-values
results.pvalues

In [None]:
# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)

# create prediction value
y_predict = single_regression_model.predict(x_test)

# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

In [None]:
# Grab the residuals & then call the hist() method
(y_test - y_predict).hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot outputs
plt.scatter(x_test, y_test,  color='red', label = 'Y')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("X VS Y")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

## Variable test LN_OLGAAmount

In [None]:
#Create subset test for testing ideal element
test = data_dropna.loc[:, ['LN_Expenditure_12M_PartsNServices','LN_OLGAAmount']]

X = test[['LN_OLGAAmount']]
Y = test[['LN_Expenditure_12M_PartsNServices']]

#test.describe()
#scatter plot
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=3, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
#plt.savefig('..\\data\\Output\\LN_Avg_AnnualUsage')

In [None]:
test.corr()

In [None]:
test.hist(grid = True, color = 'CadetBlue')

In [None]:
# create a OLS model.
model = sm.OLS(Y, X)

#Print
results = model.fit()
print(results.summary())

In [None]:
# estimate the p-values
results.pvalues

In [None]:
# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)

# create prediction value
y_predict = single_regression_model.predict(x_test)

# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

In [None]:
# Grab the residuals & then call the hist() method
(y_test - y_predict).hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot outputs
plt.scatter(x_test, y_test,  color='red', label = 'Y')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("X VS Y")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

## Variable test LN_PartsDCAL

In [None]:
#Create subset test for testing ideal element
test = data_dropna.loc[:, ['LN_Expenditure_12M_PartsNServices','LN_PartsDCAL']]

X = test[['LN_PartsDCAL']]
Y = test[['LN_Expenditure_12M_PartsNServices']]

#test.describe()
#scatter plot
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=3, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
#plt.savefig('..\\data\\Output\\LN_Avg_AnnualUsage')

In [None]:
test.corr()

In [None]:
test.hist(grid = True, color = 'CadetBlue')

In [None]:
# create a OLS model.
model = sm.OLS(Y, X)

#Print
results = model.fit()
print(results.summary())

In [None]:
# estimate the p-values
results.pvalues

In [None]:
# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)

# create prediction value
y_predict = single_regression_model.predict(x_test)

# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

In [None]:
# Grab the residuals & then call the hist() method
(y_test - y_predict).hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot outputs
plt.scatter(x_test, y_test,  color='red', label = 'Y')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("X VS Y")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

## Variable test LN_ServiceDCAL

In [None]:
#Create subset test for testing ideal element
test = data_dropna.loc[:, ['LN_Expenditure_12M_PartsNServices','LN_ServiceDCAL']]

X = test[['LN_ServiceDCAL']]
Y = test[['LN_Expenditure_12M_PartsNServices']]

#test.describe()
#scatter plot
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=3, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
#plt.savefig('..\\data\\Output\\LN_Avg_AnnualUsage')

In [None]:
test.corr()

In [None]:
test.hist(grid = True, color = 'CadetBlue')

In [None]:
# create a OLS model.
model = sm.OLS(Y, X)

#Print
results = model.fit()
print(results.summary())

In [None]:
# estimate the p-values
results.pvalues

In [None]:
# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)

# create prediction value
y_predict = single_regression_model.predict(x_test)

# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

In [None]:
# Grab the residuals & then call the hist() method
(y_test - y_predict).hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot outputs
plt.scatter(x_test, y_test,  color='red', label = 'Y')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("X VS Y")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

## Variable test ?

In [None]:
#Create subset test for testing ideal element
test = data_dropna.loc[:, ['LN_Expenditure_12M_PartsNServices','LN_Avg_AnnualUsage']]

X = test[['LN_Avg_AnnualUsage']]
Y = test[['LN_Expenditure_12M_PartsNServices']]

#test.describe()
#scatter plot
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=3, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
#plt.savefig('..\\data\\Output\\LN_Avg_AnnualUsage')

In [None]:
test.corr()

In [None]:
test.hist(grid = True, color = 'CadetBlue')

In [None]:
# create a OLS model.
model = sm.OLS(Y, X)

#Print
results = model.fit()
print(results.summary())

In [None]:
# estimate the p-values
results.pvalues

In [None]:
# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)

# create prediction value
y_predict = single_regression_model.predict(x_test)

# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

In [None]:
# Grab the residuals & then call the hist() method
(y_test - y_predict).hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot outputs
plt.scatter(x_test, y_test,  color='red', label = '')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("X VS Y")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

# 5 Regression Model Y = LN_Expenditure_12M_PartsNServices

## 1 OLS

In [None]:
data_dropna = data_filtered.dropna()
# define our input variable (X) & output variable
X = data_dropna.drop('LN_Expenditure_12M_PartsNServices', axis = 1)
Y = data_dropna[['LN_Expenditure_12M_PartsNServices']]

#OLS Model
model = sm.OLS(Y, X)

results = model.fit()
print(results.summary())

In [None]:
results.conf_int()

In [None]:
results.pvalues

## 2 Regression Model

In [None]:
# Split X and y into X_
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# create a Linear Regression model object
regression_model = LinearRegression()

# pass through the X_train & y_train data set
regression_model.fit(X_train, y_train)

# prediction model
y_predict = regression_model.predict(X_test)

# calculate the mean squared error
model_mse = mean_squared_error(y_test, y_predict)

# calculate the mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)

# calulcate the root mean squared error
model_rmse =  math.sqrt(model_mse)

# display the output
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

In [None]:
model_r2 = r2_score(y_test, y_predict)
print("R2: {:.2}".format(model_r2))

## 3 Test for Multicollinearity /  VIF Test

In [None]:
# define two data frames one before the drop and one after the drop
df_before = data_dropna
df_after = data_dropna.drop([], axis = 1)

# the VFI does expect a constant term in the data, so we need to add one using the add_constant method
X1 = sm.tools.add_constant(df_before)
X2 = sm.tools.add_constant(df_after)

# create the series for both
series_before = pd.Series([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index=X1.columns)
series_after = pd.Series([variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])], index=X2.columns)

# display the series
print('DATA BEFORE')
print('-'*100)
display(series_before)

print('DATA AFTER')
print('-'*100)
display(series_after)

## 4 Testing for Autocorrelation

In [None]:
# define our input variable (X) & output variable
X = data_dropna.drop('LN_Expenditure_12M_PartsNServices', axis = 1)
Y = data_dropna[['LN_Expenditure_12M_PartsNServices']]

# test for Autocorrelation
from statsmodels.stats.stattools import durbin_watson

# define our intput
X2 = sm.add_constant(X)

# create a OLS model
model = sm.OLS(Y, X2)

# fit the data
est = model.fit()

# calculate the lag, optional
lag = min(10, (len(X)//5))
print('The number of lags will be {}'.format(lag))
print('-'*100)

# run the Ljung-Box test for no autocorrelation of residuals
# test_results = diag.acorr_breusch_godfrey(est, nlags = lag, store = True)
test_results = diag.acorr_ljungbox(est.resid, lags = lag)

# grab the p-values and the test statistics
ibvalue, p_val = test_results

# print the results of the test
if min(p_val) > 0.05:
    print("The lowest p-value found was {:.4}".format(min(p_val)))
    print("We fail to reject the null hypthoesis, so there is no autocorrelation.")
    print('-'*100)
else:
    print("The lowest p-value found was {:.4}".format(min(p_val)))
    print("We reject the null hypthoesis, so there is autocorrelation.")
    print('-'*100)

# plot autocorrelation
sm.graphics.tsa.plot_acf(est.resid)
plt.show()

## 5 Testing the Mean of the Residuals Equals 0

In [None]:
# check for the normality of the residuals
sm.qqplot(results.resid, line='s')
pylab.show()

# also check that the mean of the residuals is approx. 0.
mean_residuals = sum(results.resid)/ len(results.resid)
print("The mean of the residuals is {:.4}".format(mean_residuals))

## 6 Testing for Heteroskedasticity

In [None]:
# Run the Breusch-Pagan test
_, pval, __, f_pval = diag.het_breuschpagan(results.resid, results.model.exog)
print(pval, f_pval)
print('-'*100)

# print the results of the test
if pval > 0.05:
    print("For the Breusch-Pagan's Test")
    print("The p-value was {:.4}".format(pval))
    print("We fail to reject the null hypthoesis, so there is no heterosecdasticity.")

else:
    print("For the Breusch-Pagan's Test")
    print("The p-value was {:.4}".format(pval))
    print("We reject the null hypthoesis, so there is heterosecdasticity.")

