# Lead generation customer value exploration

In [None]:
## Load libraries (o_o)=<

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math 
import pickle
import pylab

from scipy import stats
from scipy.stats import kurtosis, skew
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

%matplotlib inline
# Notebook-wide pandas display settings.
# Show at most 10 rows when a frame/series is rendered (raised to 100 later on).
pd.options.display.max_rows = 10
# Default to two decimal places for floats.
pd.set_option('display.precision', 2)
# Render floats with thousands separators, 2 decimals and a minimum width of 8.
# NOTE(review): this formatter presumably takes precedence over display.precision - confirm.
pd.options.display.float_format = '{:8,.2f}'.format

In [None]:
## Load and verify data set (o_o)=====<

# Load the customer workbook; rows are indexed by OriginalCustomerNumber.
# Forward slashes keep the relative path valid on Windows, macOS and Linux
# (the escaped-backslash form only resolves on Windows).
data = pd.read_excel('../data/LeadData_Customer.xlsx', index_col='OriginalCustomerNumber')

# Cast every column to float; this raises right away if a column is not numeric.
data = data.astype(float)

# Optional manual checks while exploring:
#data.head()
#data.dtypes
#data.shape

# Print data columns for copy and paste
data.columns

# Explore the data in multiple linear regression (o_o)!~/\

## Verify Data and building correlation matrix

In [None]:
# Per column: does it contain at least one missing value?
display(data.isna().any())

# Dropping rows with data.dropna() was considered; the two fleet age/hours
# columns are removed instead so that no rows are lost.
data_dropna = data.drop(columns=['Average_Fleet_Age_Months', 'Average_Fleet_Hours'])

# Size of the frame after the column drop
data_dropna.shape

In [None]:
# Remove extreme outliers in one shot: keep a row only when its absolute
# z-score is below 3 in every column (.all); .any would instead keep rows
# with at least one in-range column.
abs_z_scores = np.abs(stats.zscore(data_dropna))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)

data_filtered_by_z = data_dropna[rows_within_3_sigma]
data_filtered_by_z.shape

In [None]:
# Pairwise correlations of the z-score-filtered data, rounded to 2 decimals
corr_matrix = data_filtered_by_z.corr().round(2)

# get sorted correlation
# def get_redundant_pairs(df):
#     pairs_to_drop = set()
#     cols = df.columns
#     for i in range(0, df.shape[1]):
#         for j in range(0, i+1):
#             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop

def get_sorted_corr(df):
    """Return the absolute pairwise correlations of ``df``, strongest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are correlated against each other.

    Returns
    -------
    pandas.Series
        Absolute correlation per (column, column) pair, indexed by a
        two-level MultiIndex and sorted in descending order. Self-pairs and
        mirrored (A, B)/(B, A) pairs are kept on purpose; NaN correlations
        end up last.
    """
    crr = df.corr().abs().unstack()
    # The sort used to be commented out together with the pair-dropping, so
    # the result was never actually sorted. mergesort keeps ties in their
    # original (column-major) order, which makes the output deterministic.
    return crr.sort_values(ascending=False, kind='mergesort')

# Mirrored and self pairs are kept on purpose: they are filtered later in Excel.

# The export is named "..._filtered_by_z", so it is built from the
# z-score-filtered frame (the same one behind corr_matrix) instead of
# data_dropna. The earlier stand-alone call whose result was discarded is gone.
sorted_corr = get_sorted_corr(data_filtered_by_z)
# Forward slashes keep the relative path portable across operating systems.
sorted_corr.to_csv('../data/Output/Sorted_Correlation_Customer_filtered_by_z.csv')

display(corr_matrix)

In [None]:
#Generate a chart for correlations

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# The builtin bool replaces np.bool, an alias deprecated in NumPy 1.20 and
# removed in 1.24 (where it raises AttributeError).
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Very large canvas so that every annotated cell stays legible when zoomed
fig, ax = plt.subplots(figsize=(120, 120))

# Custom diverging colormap
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

# Draw on the axes created above explicitly instead of relying on the
# "current axes" pyplot state.
heatmap = sns.heatmap(corr_matrix, mask=mask, cmap=cmap,
                      square=True, linewidths=3,
                      cbar_kws = {'shrink': 0.6},
                      vmin=-1, vmax=1,
                      annot=True, annot_kws = {'size': 2},
                      ax=ax)

# add the column names as labels
ax.set_yticklabels(corr_matrix.columns)
ax.set_xticklabels(corr_matrix.columns)

# plt.show()
# Forward slashes keep the relative path portable across operating systems.
plt.savefig('../data/Output/Correlation_Matrix_Customer_filtered_by_Z')

In [None]:
#OR using yoyo's plot (sns)
#plt.style.use('seaborn-ticks')

#sns.pairplot(data_filtered_by_z, kind='reg', diag_kind='hist',
#             height=2.5, aspect=1,
#             plot_kws={'scatter_kws': {'alpha': 0.7}})
#save the plot
#plt.savefig('..\\data\\Output\\Scatter_Plot_Customer_filtered_by_z')

## Testing 1 assumption: y = 'Expenditure_12M_Equipment'

### Drop variables with very low correlation to Y

In [None]:
# Remove the predictors with a negligible correlation to Y.
# The trailing numbers are presumably the absolute correlation with Y.
customer_filtered = data_dropna.drop(
    columns=[
        'Family_CatFleetSize',       # 0.017900534
        'Family_FleetSize',          # 0.016308968
        'Mean_SMU_Between_Repairs',  # 0.008058017
    ]
)
# Candidates that were considered but kept:
#   Average_Fleet_Annual_Utilization        0.175846059
#   Average_Count_WorkOrder_Per_Unit_0_12M  0.170213446
#   PARTS_CU_0_12M                          0.169683379
#   Average_Count_SOS_Per_Unit_0_12M        0.163032163
#   Family_Expenditure_12M_Rentals          0.146013307
#   Mean_SMU_Between_SOS                    0.135341631
#   Percentage_SubscriptionActive           0.123444156
#   ServiceDCAL                             0.111352903
#   Family_Expenditure_12M_Equipment        0.101919145
#   PartsDCAL                               0.10153469
#   Family_Accounts                         0.090045625
#   Family_Expenditure_12M_PartsNServices   0.089948971
#   Percentage_PLDeviceSerialNumber         0.089306841
customer_filtered.shape

### Testing for Multicollinearity

In [None]:
# Two candidate feature sets: the frame as it stands ("before") and the same
# frame without the columns flagged as collinear ("after").
customer_filtered_before = customer_filtered
customer_filtered_after = customer_filtered.drop(
    columns=[
        'Expenditure_12M_Total',
        'Count_FleetSize',
        'SERVICES_CU_0_12M',
        'WO_COUNT_CU_0_12M',
        'Family_Expenditure_12M_PartsNServices',
        'Family_Expenditure_12M_Equipment',
        'Family_Expenditure_12M_Rentals',
        # (original note at this position: "first to remove")
    ]
)

# Workflow:
# - find pairs with extremely high correlation in the correlation matrix and
#   drop one column of each pair;
# - compare the before/after VIF series with the correlation matrix, going
#   back and forth; multicollinearity shows up as high and somewhat similar
#   variance inflation factors, and inf means extremely large (a concern);
# - within a pair, drop the column that is less complete/reliable, that can be
#   calculated from the other one, or that correlates less with Y.

# Allow up to 100 rows so the full VIF series is visible
pd.options.display.max_rows = 100

# VIF expects a constant term in the design matrix, so add one to both frames
X1 = sm.tools.add_constant(customer_filtered_before)
X2 = sm.tools.add_constant(customer_filtered_after)

# One VIF per column (constant included), labelled with the column names
before_values = X1.values
after_values = X2.values
series_before = pd.Series(
    [variance_inflation_factor(before_values, position) for position in range(before_values.shape[1])],
    index=X1.columns,
)
series_after = pd.Series(
    [variance_inflation_factor(after_values, position) for position in range(after_values.shape[1])],
    index=X2.columns,
)

# Show both series under a heading and a separator line.
# Optional exports: series_before.to_csv('..\\data\\Output\\series_before.csv'),
#                   series_after.to_csv('..\\data\\Output\\series_after.csv')
for heading, vif_series in (('DATA BEFORE', series_before), ('DATA AFTER', series_after)):
    print(heading)
    print('-'*100)
    display(vif_series)

In [None]:
# Descriptive statistics of the reduced design matrix
desc_df = X2.describe()

# Append the mean +/- 3 standard deviation bounds as two extra rows
three_std = desc_df.loc['std'] * 3
desc_df.loc['+3_std'] = desc_df.loc['mean'] + three_std
desc_df.loc['-3_std'] = desc_df.loc['mean'] - three_std

# Show the extended table
desc_df

### Create regression model using split

In [None]:
# Target stays a one-column DataFrame (so intercept_/coef_ carry a leading
# axis); predictors are every other remaining column.
Y = customer_filtered_after[['Expenditure_12M_Equipment']]
X = customer_filtered_after.drop(columns='Expenditure_12M_Equipment')

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# Fit an ordinary least-squares model on the training split
regression_model = LinearRegression().fit(X_train, y_train)

# Intercept and first coefficient of the single target
intercept2 = regression_model.intercept_[0]
coefficent2 = regression_model.coef_[0][0]

print(f"The intercept for our model is {intercept2:.4}")
print('-'*100)

# One line per predictor, pairing column names with their coefficients
for feature_name, weight in zip(X.columns, regression_model.coef_[0]):
    print(f"The Coefficient for {feature_name} is {weight:.2}")

In [None]:
# Print the OLS summary for the training split.
# statsmodels' OLS does not add an intercept on its own, whereas the
# scikit-learn LinearRegression fitted on the same split does. Adding the
# constant column makes this summary describe that same model and report a
# centered R-squared.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

### Measures of Error

In [None]:
# Score the fitted model on the held-out split
y_predict = regression_model.predict(X_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Report each metric with three significant digits
print(f"MSE {model_mse:.3}")
print(f"MAE {model_mae:.3}")
print(f"RMSE {model_rmse:.3}")

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2: {model_r2:.2}")

## Testing 2 assumption: y = 'Expenditure_12M_PartsNServices'

### Drop variables with very low correlation to Y

In [None]:
# Remove the predictors with a negligible correlation to Y.
# The trailing numbers are presumably the absolute correlation with Y.
customer_filtered_2 = data_dropna.drop(
    columns=[
        'Family_CatFleetSize',              # 0.0490952789636112
        'Family_FleetSize',                 # 0.0478747284722545
        'Mean_SMU_Between_Repairs',         # 0.0245275784741454
        'Percentage_SubscriptionActive',    # 0.0212732964124215
        'Percentage_PLDeviceSerialNumber',  # 0.0034875756755241
    ]
)
# Candidates that were considered but kept:
#   Average_Count_WorkOrder_Per_Unit_0_12M  0.166633334661175
#   Average_Count_SOS_Per_Unit_0_12M        0.156288888416924
#   Average_Fleet_Annual_Utilization        0.153399972348449
#   PartsDCAL                               0.145070067775836
#   ServiceDCAL                             0.137168296369414
#   PercentageFleet_Utilized_0_12M          0.122034899759774
#   Family_Expenditure_12M_Rentals          0.108104086618808
#   Mean_SMU_Between_SOS                    0.0854501915530597
#   Family_Expenditure_12M_PartsNServices   0.0825992300016435
#   Family_Accounts                         0.0667757747635436
#   Family_Expenditure_12M_Equipment        0.0664845302668116
customer_filtered_2.shape

### Testing for Multicollinearity

In [None]:
# Two candidate feature sets: the frame as it stands ("before") and the same
# frame without the columns flagged as collinear ("after").
customer_filtered_before = customer_filtered_2
customer_filtered_after = customer_filtered_2.drop(
    columns=[
        'Expenditure_12M_Total',
        'Count_FleetSize',
        'SERVICES_CU_0_12M',
        'PARTS_CU_0_12M',
        'WO_COUNT_CU_0_12M',
        'Family_Accounts',
        'Family_Expenditure_12M_Rentals',         # 0.108104086618808
        'Family_Expenditure_12M_PartsNServices',  # 0.0825992300016435
        'Family_Expenditure_12M_Equipment',       # 0.0664845302668116
        # (original note at this position: "first to remove")
    ]
)

# Workflow:
# - find pairs with extremely high correlation in the correlation matrix and
#   drop one column of each pair;
# - compare the before/after VIF series with the correlation matrix, going
#   back and forth; multicollinearity shows up as high and somewhat similar
#   variance inflation factors, and inf means extremely large (a concern);
# - within a pair, drop the column that is less complete/reliable, that can be
#   calculated from the other one, or that correlates less with Y.

# Allow up to 100 rows so the full VIF series is visible
pd.options.display.max_rows = 100

# VIF expects a constant term in the design matrix, so add one to both frames
X1 = sm.tools.add_constant(customer_filtered_before)
X2 = sm.tools.add_constant(customer_filtered_after)

# One VIF per column (constant included), labelled with the column names
before_values = X1.values
after_values = X2.values
series_before = pd.Series(
    [variance_inflation_factor(before_values, position) for position in range(before_values.shape[1])],
    index=X1.columns,
)
series_after = pd.Series(
    [variance_inflation_factor(after_values, position) for position in range(after_values.shape[1])],
    index=X2.columns,
)

# Show both series under a heading and a separator line.
# Optional exports: series_before.to_csv('..\\data\\Output\\series_before.csv'),
#                   series_after.to_csv('..\\data\\Output\\series_after.csv')
for heading, vif_series in (('DATA BEFORE', series_before), ('DATA AFTER', series_after)):
    print(heading)
    print('-'*100)
    display(vif_series)

In [None]:
# Descriptive statistics of the reduced design matrix
desc_df = X2.describe()

# Append the mean +/- 3 standard deviation bounds as two extra rows
three_std = desc_df.loc['std'] * 3
desc_df.loc['+3_std'] = desc_df.loc['mean'] + three_std
desc_df.loc['-3_std'] = desc_df.loc['mean'] - three_std

# Show the extended table
desc_df

### Create regression model using split

In [None]:
# Target stays a one-column DataFrame (so intercept_/coef_ carry a leading
# axis); predictors are every other remaining column.
Y = customer_filtered_after[['Expenditure_12M_PartsNServices']]
X = customer_filtered_after.drop(columns='Expenditure_12M_PartsNServices')

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# Fit an ordinary least-squares model on the training split
regression_model = LinearRegression().fit(X_train, y_train)

# Intercept and first coefficient of the single target
intercept2 = regression_model.intercept_[0]
coefficent2 = regression_model.coef_[0][0]

print(f"The intercept for our model is {intercept2:.4}")
print('-'*100)

# One line per predictor, pairing column names with their coefficients
for feature_name, weight in zip(X.columns, regression_model.coef_[0]):
    print(f"The Coefficient for {feature_name} is {weight:.2}")

In [None]:
# OLS summary for the training split.
# statsmodels' OLS does not add an intercept on its own, whereas the
# scikit-learn LinearRegression fitted on the same split does. Adding the
# constant column makes this summary describe that same model and report a
# centered R-squared.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

### Measures of Error

In [None]:
# Score the fitted model on the held-out split
y_predict = regression_model.predict(X_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Report each metric with three significant digits
print(f"MSE {model_mse:.3}")
print(f"MAE {model_mae:.3}")
print(f"RMSE {model_rmse:.3}")

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2: {model_r2:.2}")

## Testing 3 assumption: y = 'Expenditure_12M_Rentals'

### Drop variables with very low correlation to Y

In [None]:
# Remove the predictors with a negligible correlation to Y.
# The trailing numbers are presumably the absolute correlation with Y.
customer_filtered_3 = data_dropna.drop(
    columns=[
        'Mean_SMU_Between_SOS',                   # 0.0470133914198075
        'Mean_SMU_Between_Repairs',               # 0.0145017106442887
        'Percentage_SubscriptionActive',          # 0.013981050846743
        'Family_Expenditure_12M_Equipment',       # 0.0104706272426273
        'Family_Expenditure_12M_PartsNServices',  # 0.0101370377282621
        'Family_Accounts',                        # 0.00653449338871061
        'Percentage_PLDeviceSerialNumber',        # 0.0043947503190068
        'Family_CatFleetSize',                    # 0.00295002581263044
        'Family_FleetSize',                       # 0.00272325099532973
    ]
)
# Candidates that were considered but kept:
#   Family_Expenditure_12M_Rentals          0.120975475580277
#   Average_Count_WorkOrder_Per_Unit_0_12M  0.0980163342347926
#   ServiceDCAL                             0.0853271623755782
#   Average_Fleet_Annual_Utilization        0.0794614031324676
#   PercentageFleet_Utilized_0_12M          0.0753220870037637
#   PartsDCAL                               0.0685646052572167
#   Average_Count_SOS_Per_Unit_0_12M        0.0676082558054839
customer_filtered_3.shape

### Testing for Multicollinearity

In [None]:
# Two candidate feature sets: the frame as it stands ("before") and the same
# frame without the columns flagged as collinear ("after").
customer_filtered_before = customer_filtered_3
customer_filtered_after = customer_filtered_3.drop(
    columns=[
        'Expenditure_12M_Total',
        'Count_FleetSize',
        'SERVICES_CU_0_12M',
        'PARTS_CU_0_12M',
        'WO_COUNT_CU_0_12M',
        # (original note at this position: "first to remove")
    ]
)

# Workflow:
# - find pairs with extremely high correlation in the correlation matrix and
#   drop one column of each pair;
# - compare the before/after VIF series with the correlation matrix, going
#   back and forth; multicollinearity shows up as high and somewhat similar
#   variance inflation factors, and inf means extremely large (a concern);
# - within a pair, drop the column that is less complete/reliable, that can be
#   calculated from the other one, or that correlates less with Y.

# Allow up to 100 rows so the full VIF series is visible
pd.options.display.max_rows = 100

# VIF expects a constant term in the design matrix, so add one to both frames
X1 = sm.tools.add_constant(customer_filtered_before)
X2 = sm.tools.add_constant(customer_filtered_after)

# One VIF per column (constant included), labelled with the column names
before_values = X1.values
after_values = X2.values
series_before = pd.Series(
    [variance_inflation_factor(before_values, position) for position in range(before_values.shape[1])],
    index=X1.columns,
)
series_after = pd.Series(
    [variance_inflation_factor(after_values, position) for position in range(after_values.shape[1])],
    index=X2.columns,
)

# Show both series under a heading and a separator line.
# Optional exports: series_before.to_csv('..\\data\\Output\\series_before.csv'),
#                   series_after.to_csv('..\\data\\Output\\series_after.csv')
for heading, vif_series in (('DATA BEFORE', series_before), ('DATA AFTER', series_after)):
    print(heading)
    print('-'*100)
    display(vif_series)

In [None]:
# Descriptive statistics of the reduced design matrix
desc_df = X2.describe()

# Append the mean +/- 3 standard deviation bounds as two extra rows
three_std = desc_df.loc['std'] * 3
desc_df.loc['+3_std'] = desc_df.loc['mean'] + three_std
desc_df.loc['-3_std'] = desc_df.loc['mean'] - three_std

# Show the extended table
desc_df

### Create regression model using split

In [None]:
# Target stays a one-column DataFrame (so intercept_/coef_ carry a leading
# axis); predictors are every other remaining column.
Y = customer_filtered_after[['Expenditure_12M_Rentals']]
X = customer_filtered_after.drop(columns='Expenditure_12M_Rentals')

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# Fit an ordinary least-squares model on the training split
regression_model = LinearRegression().fit(X_train, y_train)

# Intercept and first coefficient of the single target
intercept2 = regression_model.intercept_[0]
coefficent2 = regression_model.coef_[0][0]

print(f"The intercept for our model is {intercept2:.4}")
print('-'*100)

# One line per predictor, pairing column names with their coefficients
for feature_name, weight in zip(X.columns, regression_model.coef_[0]):
    print(f"The Coefficient for {feature_name} is {weight:.2}")

In [None]:
# OLS summary for the training split.
# statsmodels' OLS does not add an intercept on its own, whereas the
# scikit-learn LinearRegression fitted on the same split does. Adding the
# constant column makes this summary describe that same model and report a
# centered R-squared.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

### Measures of Error

In [None]:
# Score the fitted model on the held-out split
y_predict = regression_model.predict(X_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Report each metric with three significant digits
print(f"MSE {model_mse:.3}")
print(f"MAE {model_mae:.3}")
print(f"RMSE {model_rmse:.3}")

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2: {model_r2:.2}")

## Testing 4 assumption: y = 'Expenditure_12M_Total'

### Drop variables with very low correlation to Y

In [None]:
# Remove the predictors with a negligible correlation to Y.
# The trailing numbers are presumably the absolute correlation with Y.
customer_filtered_4 = data_dropna.drop(
    columns=[
        'Mean_SMU_Between_SOS',             # 0.136025671692045
        'Percentage_SubscriptionActive',    # 0.0871301151222554
        'Percentage_PLDeviceSerialNumber',  # 0.0553382796156029
        'Family_CatFleetSize',              # 0.0386292750913722
        'Family_FleetSize',                 # 0.0369664040617472
        'Mean_SMU_Between_Repairs',         # 0.011595883273253
    ]
)
# Candidates that were considered but kept at this step:
#   Family_Expenditure_12M_PartsNServices  0.101149367476691
#   Family_Expenditure_12M_Equipment       0.0992118087131292
#   Family_Accounts                        0.091683866279908
customer_filtered_4.shape

### Testing for Multicollinearity

In [None]:
# Two candidate feature sets: the frame as it stands ("before") and the same
# frame without the columns flagged as collinear ("after").
customer_filtered_before = customer_filtered_4
customer_filtered_after = customer_filtered_4.drop(
    columns=[
        'Expenditure_12M_Rentals',                # 0.525139222
        'Expenditure_12M_Equipment',              # 0.792534850730603
        'Expenditure_12M_PartsNServices',         # 0.783863227622462
        'SERVICES_CU_0_12M',                      # 0.696393345405361
        'PARTS_CU_0_12M',                         # 0.633353238877687
        'Count_FleetSize',
        'WO_COUNT_CU_0_12M',
        'Family_Accounts',
        'Family_Expenditure_12M_PartsNServices',  # 0.101149367476691
        'Family_Expenditure_12M_Equipment',       # 0.0992118087131292
    ]
)

# Workflow:
# - find pairs with extremely high correlation in the correlation matrix and
#   drop one column of each pair;
# - compare the before/after VIF series with the correlation matrix, going
#   back and forth; multicollinearity shows up as high and somewhat similar
#   variance inflation factors, and inf means extremely large (a concern);
# - within a pair, drop the column that is less complete/reliable, that can be
#   calculated from the other one, or that correlates less with Y.

# Allow up to 100 rows so the full VIF series is visible
pd.options.display.max_rows = 100

# VIF expects a constant term in the design matrix, so add one to both frames
X1 = sm.tools.add_constant(customer_filtered_before)
X2 = sm.tools.add_constant(customer_filtered_after)

# One VIF per column (constant included), labelled with the column names
before_values = X1.values
after_values = X2.values
series_before = pd.Series(
    [variance_inflation_factor(before_values, position) for position in range(before_values.shape[1])],
    index=X1.columns,
)
series_after = pd.Series(
    [variance_inflation_factor(after_values, position) for position in range(after_values.shape[1])],
    index=X2.columns,
)

# Show both series under a heading and a separator line.
# Optional exports: series_before.to_csv('..\\data\\Output\\series_before.csv'),
#                   series_after.to_csv('..\\data\\Output\\series_after.csv')
for heading, vif_series in (('DATA BEFORE', series_before), ('DATA AFTER', series_after)):
    print(heading)
    print('-'*100)
    display(vif_series)

In [None]:
# Descriptive statistics of the reduced design matrix
desc_df = X2.describe()

# Append the mean +/- 3 standard deviation bounds as two extra rows
three_std = desc_df.loc['std'] * 3
desc_df.loc['+3_std'] = desc_df.loc['mean'] + three_std
desc_df.loc['-3_std'] = desc_df.loc['mean'] - three_std

# Show the extended table
desc_df

### Create regression model using split

In [None]:
# Target stays a one-column DataFrame (so intercept_/coef_ carry a leading
# axis); predictors are every other remaining column.
Y = customer_filtered_after[['Expenditure_12M_Total']]
X = customer_filtered_after.drop(columns='Expenditure_12M_Total')

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# Fit an ordinary least-squares model on the training split
regression_model = LinearRegression().fit(X_train, y_train)

# Intercept and first coefficient of the single target
intercept2 = regression_model.intercept_[0]
coefficent2 = regression_model.coef_[0][0]

print(f"The intercept for our model is {intercept2:.4}")
print('-'*100)

# One line per predictor, pairing column names with their coefficients
for feature_name, weight in zip(X.columns, regression_model.coef_[0]):
    print(f"The Coefficient for {feature_name} is {weight:.2}")

In [None]:
# OLS summary for the training split.
# statsmodels' OLS does not add an intercept on its own, whereas the
# scikit-learn LinearRegression fitted on the same split does. Adding the
# constant column makes this summary describe that same model and report a
# centered R-squared.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

In [None]:
# Per-term p-values of the statsmodels fit held in `results`
results.pvalues

### Measures of Error

In [None]:
# Score the fitted model on the held-out split
y_predict = regression_model.predict(X_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Report each metric with three significant digits
print(f"MSE {model_mse:.3}")
print(f"MAE {model_mae:.3}")
print(f"RMSE {model_rmse:.3}")

### R-Squared

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2: {model_r2:.2}")

## Regression model for Expenditure_12M_Total

### Testing for Autocorrelation

In [None]:
# test for Autocorrelation
# NOTE(review): durbin_watson is imported here but not used in this cell.
from statsmodels.stats.stattools import durbin_watson

# Design matrix with an explicit constant column (statsmodels adds none itself)
X2 = sm.add_constant(X)

# Fit OLS on the full X / Y currently in scope
model = sm.OLS(Y, X2)
est = model.fit()

# Number of lags: at most 10, and never more than a fifth of the sample size
lag = min(10, (len(X)//5))
print('The number of lags will be {}'.format(lag))
print('-'*100)

# Ljung-Box test; null hypothesis: the residuals are not autocorrelated.
# Alternative that was tried: diag.acorr_breusch_godfrey(est, nlags = lag, store = True)
test_results = diag.acorr_ljungbox(est.resid, lags = lag)

# Current statsmodels returns a DataFrame with lb_stat / lb_pvalue columns;
# older releases returned a (statistics, p-values) tuple. Tuple-unpacking a
# DataFrame yields its column NAMES, which made the comparison below raise a
# TypeError, so both return shapes are handled explicitly.
if isinstance(test_results, pd.DataFrame):
    ibvalue = test_results['lb_stat']
    p_val = test_results['lb_pvalue']
else:
    ibvalue, p_val = test_results

# The verdict is based on the smallest p-value across all tested lags
lowest_p_value = min(p_val)
print("The lowest p-value found was {:.4}".format(lowest_p_value))
if lowest_p_value > 0.05:
    print("We fail to reject the null hypthoesis, so there is no autocorrelation.")
else:
    print("We reject the null hypthoesis, so there is autocorrelation.")
print('-'*100)

# plot autocorrelation
sm.graphics.tsa.plot_acf(est.resid)
plt.show()

### Testing for Heteroscedasticity

In [None]:
#Run the White's test
# Inputs are the fitted model's residuals and its design matrix (est.model.exog).
# Only the 2nd and 4th returned values are kept (pval, f_pval); per the
# statsmodels docs the tuple is presumably (lm, lm_pvalue, fvalue, f_pvalue) - confirm.
# NOTE(review): nothing is reported here; the code that printed the verdict is commented out below.
_, pval, __, f_pval = diag.het_white(est.resid, est.model.exog)
# print(pval, f_pval)
# print('-'*100)

# # print the results of the test
# if pval > 0.05:
#     print("For the White's Test")
#     print("The p-value was {:.4}".format(pval))
#     print("We fail to reject the null hypthoesis, so there is no heterosecdasticity. \n")
    
# else:
#     print("For the White's Test")
#     print("The p-value was {:.4}".format(pval))
#     print("We reject the null hypthoesis, so there is heterosecdasticity. \n")

# # Run the Breusch-Pagan test
# _, pval, __, f_pval = diag.het_breuschpagan(est.resid, est.model.exog)
# print(pval, f_pval)
# print('-'*100)

# # print the results of the test
# if pval > 0.05:
#     print("For the Breusch-Pagan's Test")
#     print("The p-value was {:.4}".format(pval))
#     print("We fail to reject the null hypthoesis, so there is no heterosecdasticity.")

# else:
#     print("For the Breusch-Pagan's Test")
#     print("The p-value was {:.4}".format(pval))
#     print("We reject the null hypthoesis, so there is heterosecdasticity.")

### Testing the Mean of the Residuals Equals 0

In [None]:
# Q-Q plot of the residuals to judge how close they are to normal
sm.qqplot(est.resid, line='s')
pylab.show()

# The residual mean should be approximately 0
mean_residuals = sum(est.resid)/ len(est.resid)
print(f"The mean of the residuals is {mean_residuals:.4}")

### Confidence Intervals

In [None]:
# Confidence interval per model term of the fit held in `est`
# (called with defaults; 95% according to the original note).
est.conf_int()

### Hypothesis Testing

In [None]:
# Per-term p-values of the fit held in `est`
est.pvalues

### Adjust the model

In [None]:
# Refine the feature set by dropping three more predictors.
customer_filtered_again = customer_filtered_after.drop(
    columns=[
        'ServiceDCAL',
        'Average_Count_WorkOrder_Per_Unit_0_12M',
        'Average_Count_SOS_Per_Unit_0_12M',
        # Further candidates, currently kept:
        #   'Percentage_PLDeviceSerialNumber'
        #   'PercentageFleet_Utilized_0_12M'
        #   'Mean_SMU_Between_SOS'
    ]
)

# Predictors are every remaining column except the target; the target stays a
# one-column DataFrame.
X = customer_filtered_again.drop('Expenditure_12M_Total', axis = 1)
Y = customer_filtered_again[['Expenditure_12M_Total']]

# 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split
regression_model = LinearRegression()
regression_model.fit(X_train, y_train)

# Summary
# The summary used to be fitted on the held-out TEST split, which both leaks
# the evaluation data and describes a different model than regression_model.
# It is now fitted on the training split, with an explicit constant because
# statsmodels' OLS adds no intercept by itself.
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())

In [None]:
# Score the fitted model on the held-out split
y_predict = regression_model.predict(X_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Report each metric with three significant digits
print(f"MSE {model_mse:.3}")
print(f"MAE {model_mae:.3}")
print(f"RMSE {model_rmse:.3}")

### R-Squared

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2: {model_r2:.2}")

# Explore the variables in linear regression (o_o)!===<

## Test 1: x = Utilization_Fleet_0_12M_Total y= Expenditure_12M_Total

### Assess the variables

In [None]:
# Two-column working frame: candidate predictor and target
test = data_dropna[['Utilization_Fleet_0_12M_Total', 'Expenditure_12M_Total']]

# x / y as Series for plotting
x, y = test['Utilization_Fleet_0_12M_Total'], test['Expenditure_12M_Total']

test.describe()

In [None]:
# Scatter plot of the raw predictor against the target.
# The empty label plus plt.legend() has been removed: an empty label is
# ignored by the legend, so the call drew nothing and only emitted a
# "no artists with labels" warning.
fig, ax = plt.subplots()
ax.plot(x, y, 'o', color = 'Pink')
# add the column names as labels
ax.set_title('X VS Y')
ax.set_xlabel('Utilization_Fleet_0_12M_Total')
ax.set_ylabel('Expenditure_12M_Total')
plt.show()
#Scatter plot shows a few extreme samples. Total rows 456

### Remove outliers

In [None]:
# Remove extreme outliers in one shot: keep a row only when its absolute
# z-score is below 3 in every column (.all); .any would instead keep rows
# with at least one in-range column.
within_3_sigma = (np.abs(stats.zscore(test)) < 3).all(axis=1)
test_reduced_by_z = test[within_3_sigma]

# Re-point x / y at the filtered data
x, y = (test_reduced_by_z['Utilization_Fleet_0_12M_Total'],
        test_reduced_by_z['Expenditure_12M_Total'])
test_reduced_by_z.describe()

In [None]:
# Scatter plot of predictor against target after the outlier filter.
# The empty label plus plt.legend() has been removed: an empty label is
# ignored by the legend, so the call drew nothing and only emitted a
# "no artists with labels" warning.
fig, ax = plt.subplots()
ax.plot(x, y, 'o', color = 'pink')
# add the column names as labels
ax.set_title('X VS Y')
ax.set_xlabel('Utilization_Fleet_0_12M_Total')
ax.set_ylabel('Expenditure_12M_Total')
plt.show()

In [None]:
# Correlation between the two columns of the outlier-filtered frame
test_reduced_by_z.corr()

In [None]:
# One histogram per column of the outlier-filtered frame
test_reduced_by_z.hist(grid = True, color = 'CadetBlue')

In [None]:
# Excess (Fisher) kurtosis and skewness of predictor and target
x_kurtosis = kurtosis(test_reduced_by_z['Utilization_Fleet_0_12M_Total'], fisher = True)
y_kurtosis = kurtosis(test_reduced_by_z['Expenditure_12M_Total'], fisher = True)

x_skew = skew(test_reduced_by_z['Utilization_Fleet_0_12M_Total'])
y_skew = skew(test_reduced_by_z['Expenditure_12M_Total'])

# Each statistic is shown next to its significance test.
# Format fix: "{:2}" only set a minimum width of 2 and printed the full float;
# "{:.2}" gives the two significant digits used elsewhere in this notebook.
# Label fix: the two Expenditure lines now say which statistic they show.
display("Utilization_Fleet_0_12M_Total_kurtosis: {:.2}".format(x_kurtosis))
display(stats.kurtosistest(test_reduced_by_z['Utilization_Fleet_0_12M_Total']))
print('\n')
display("Utilization_Fleet_0_12M_Total_skew: {:.2}".format(x_skew))
display(stats.skewtest(test_reduced_by_z['Utilization_Fleet_0_12M_Total']))
print('\n')
display("Expenditure_12M_Total_kurtosis: {:.2}".format(y_kurtosis))
display(stats.kurtosistest(test_reduced_by_z['Expenditure_12M_Total']))
print('\n')
display("Expenditure_12M_Total_skew: {:.2}".format(y_skew))
display(stats.skewtest(test_reduced_by_z['Expenditure_12M_Total']))

### Build a model

In [None]:
# Predictor and target as one-column DataFrames (2-D input for scikit-learn)
y = test_reduced_by_z[['Expenditure_12M_Total']]
x = test_reduced_by_z[['Utilization_Fleet_0_12M_Total']]

# 80/20 train/test split with a fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# scikit-learn fit on the training split
single_regression_model = LinearRegression().fit(x_train, y_train)

# statsmodels fit on the same training split; no constant column is added
# here, so this particular summary is for a model without an intercept.
model = sm.OLS(y_train, x_train)
est = model.fit()
print(est.summary())

### Verify model is valid by adding constant

In [None]:
# Evaluate the training model again, this time with an intercept.
# Add a constant column to the training predictor.
x_train2 = sm.add_constant(x_train)
# Build the OLS model on the training target and the augmented predictor.
model2 = sm.OLS(y_train, x_train2)
# Fit it.
est2 = model2.fit()
# Confidence interval per term (called with defaults; 95% per the original note).
est2.conf_int()

In [None]:
# Per-term p-values of the fit that includes the constant (est2)
est2.pvalues

### Explore model

In [None]:
# Intercept and slope of the scikit-learn model (one target, one predictor)
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# Format fix: "{:2}" / "{:4}" only set a minimum field width and printed the
# full float; the precision forms below match the rest of the notebook.
print("The Coefficient for training model is {:.2}".format(coefficient))
print("The intercept for training model is {:.4}".format(intercept))

In [None]:
# Predict on the held-out split and compute the error metrics
y_predict = single_regression_model.predict(x_test)
# mean squared error
model_mse = mean_squared_error(y_test, y_predict)
# mean absolute error
model_mae = mean_absolute_error(y_test, y_predict)
# root mean squared error
model_rmse =  math.sqrt(model_mse)
# Format fix: "{:3}" only set a minimum field width and printed the full
# float; "{:.3}" gives three significant digits like the earlier metric cells.
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

In [None]:
# Coefficient of determination on the held-out split
model_r2 = r2_score(y_test, y_predict)
print(f"R2 of the prediction is: {model_r2:.2}")

In [None]:
# Residuals = observed minus predicted on the held-out split, shown as a histogram
residuals = y_test - y_predict
residuals.hist(grid = False, color = 'royalblue')
plt.title("Model Residuals")
plt.show()

In [None]:
# Held-out points in grey with the fitted line drawn on top
plt.scatter(x_test, y_test, color='gainsboro', label='')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

plt.title("Usage_Total VS Expenditure_Total")
plt.xlabel("x")
plt.ylabel("y")
plt.legend()
plt.show()

# ANSI escape codes: switch bold on / reset formatting
bold_on, bold_off = '\033[1m', '\033[0m'

# Slope of the fitted line
print(f'Coefficient:{bold_on}{single_regression_model.coef_[0][0]:.2}{bold_off}')

# Mean squared error
print(f'Mean squared error: {bold_on}{model_mse:.4}{bold_off}')

# Root mean squared error
print(f'Root Mean squared error: {bold_on}{math.sqrt(model_mse):.4}{bold_off}')

# R2 score: 1 is perfect prediction
print(f'R2 score: {bold_on}{r2_score(y_test,y_predict):.2}{bold_off}')

### Conclusion

## Test 2: x = OLGAAmount y= Expenditure_12M_PartsNServices

### Test 2

In [None]:
# Two-column working frame: candidate predictor and target
test = data_dropna[['OLGAAmount', 'Expenditure_12M_PartsNServices']]

# Remove extreme outliers in one shot: keep a row only when its absolute
# z-score is below 3 in every column (.all); .any would instead keep rows
# with at least one in-range column.
within_3_sigma = (np.abs(stats.zscore(test)) < 3).all(axis=1)
test_reduced_by_z = test[within_3_sigma]

# x / y as Series for plotting
x, y = (test_reduced_by_z['OLGAAmount'],
        test_reduced_by_z['Expenditure_12M_PartsNServices'])

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on both columns: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['OLGAAmount']))
display(stats.skewtest(test_reduced_by_z['OLGAAmount']))
print('\n')
display(stats.kurtosistest(test_reduced_by_z['Expenditure_12M_PartsNServices']))
display(stats.skewtest(test_reduced_by_z['Expenditure_12M_PartsNServices']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['OLGAAmount']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("OLGAAmount VS Expenditure_12M_PartsNServices")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 2 Conclusion

## Test 3: x = Count_CatFleetSize y= Expenditure_12M_PartsNServices

### Test 3

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['Count_CatFleetSize', 'Expenditure_12M_PartsNServices']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['Count_CatFleetSize']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on the predictor: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['Count_CatFleetSize']))
display(stats.skewtest(test_reduced_by_z['Count_CatFleetSize']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['Count_CatFleetSize']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("Count_CatFleetSize VS Expenditure_12M_PartsNServices")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 3 Conclusion

## Test 4: x = SOS_COUNT_CU_0_12M  y= Expenditure_12M_PartsNServices

### Test 4

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['SOS_COUNT_CU_0_12M', 'Expenditure_12M_PartsNServices']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['SOS_COUNT_CU_0_12M']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on the predictor: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['SOS_COUNT_CU_0_12M']))
display(stats.skewtest(test_reduced_by_z['SOS_COUNT_CU_0_12M']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['SOS_COUNT_CU_0_12M']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("SOS_COUNT_CU_0_12M VS PartsNServices")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 4 Conclusion

## Test 5: x = Family_Expenditure_12M_Rentals y= Expenditure_12M_Total

### Test 5

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['Family_Expenditure_12M_Rentals', 'Expenditure_12M_Total']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['Family_Expenditure_12M_Rentals']
y = test_reduced_by_z['Expenditure_12M_Total']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on the predictor: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['Family_Expenditure_12M_Rentals']))
display(stats.skewtest(test_reduced_by_z['Family_Expenditure_12M_Rentals']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['Family_Expenditure_12M_Rentals']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("FamilyExpenditure_Rentals VS Expenditure_12M_Total")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 5 Conclusion

## Test 6: x = PercentageFleet_Utilized_0_12M y= Expenditure_12M_Equipment

### Test 6

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['PercentageFleet_Utilized_0_12M', 'Expenditure_12M_Equipment']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['PercentageFleet_Utilized_0_12M']
y = test_reduced_by_z['Expenditure_12M_Equipment']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on the predictor: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['PercentageFleet_Utilized_0_12M']))
display(stats.skewtest(test_reduced_by_z['PercentageFleet_Utilized_0_12M']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['PercentageFleet_Utilized_0_12M']]
y = test_reduced_by_z[['Expenditure_12M_Equipment']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("PercentageFleet_Utilized_0_12M VS Expenditure_12M_Equipment")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 6 Conclusion

## Test 7: x = Percentage_SubscriptionActive y= Expenditure_12M_Equipment

### Test 7

In [None]:
#Create subset data1 for testing ideal element
test = data_dropna.loc[:, ['PercentageFleet_Utilized_0_12M', 'Expenditure_12M_Equipment']]

#remove extreme outliers:
#If you have multiple columns in your dataframe and would like to remove all rows that have outliers in at least one column, 
#the following expression would do that in one shot: select absolute Z score for the row <1.65 (). .all or .any
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

#define the x & y data
x = test_reduced_by_z['PercentageFleet_Utilized_0_12M']
y = test_reduced_by_z['Expenditure_12M_Equipment']

test_reduced_by_z.describe()

In [None]:
#create the new scatter plot
plt.plot(x,y,'o',color = 'pink', label = '')
# add the column names as labels
plt.title('X VS Y')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()
plt.show()

In [None]:
test_reduced_by_z.corr()

In [None]:
test_reduced_by_z.hist(grid = True, color = 'Purple')

In [None]:
display(stats.kurtosistest(test_reduced_by_z['PercentageFleet_Utilized_0_12M']))
display(stats.skewtest(test_reduced_by_z['PercentageFleet_Utilized_0_12M']))

In [None]:
# define our input variable & output variable
x = test_reduced_by_z[['PercentageFleet_Utilized_0_12M']]
y = test_reduced_by_z[['Expenditure_12M_Equipment']]

# Split X and y into X_
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# create a Linear Regression model object.
single_regression_model = LinearRegression()
# pass through the X_train & y_train data set.
single_regression_model.fit(x_train, y_train)
# create OLS model for training data
model = sm.OLS(y_train, x_train)
# pass through the X_train & y_train data set.
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

# Title and axis labels come from the columns actually used in the split, so
# they always match the data being plotted.
plt.title("{} VS {}".format(x_test.columns[0], y_test.columns[0]))
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 7 Conclusion

## Test 8: x = Average_Fleet_Annual_Utilization y= Expenditure_12M_Total

### Test 8

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['Average_Fleet_Annual_Utilization', 'Expenditure_12M_Total']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['Average_Fleet_Annual_Utilization']
y = test_reduced_by_z['Expenditure_12M_Total']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# DISABLED (kept for reference): would restrict the frame to rows with a
# non-negative Average_Fleet_Annual_Utilization and rebind x / y to it.
# The cells that follow use test_reduced_by_z, i.e. the unfiltered frame.
# #Remove negative x
# test_absolute_x = test_reduced_by_z[test_reduced_by_z['Average_Fleet_Annual_Utilization'] >= 0]

# x = test_absolute_x['Average_Fleet_Annual_Utilization']
# y = test_absolute_x['Expenditure_12M_Total']
# test_reduced_by_z.describe()

In [None]:
# DISABLED (kept for reference): scatter plot of whatever x / y are currently bound to.
# #create the new scatter plot
# plt.plot(x,y,'o',color = 'pink', label = '')
# # add the column names as labels
# plt.title('X VS Y')
# plt.xlabel('X')
# plt.ylabel('Y')
# plt.legend()
# plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# Distribution of each column after outlier removal.
test_reduced_by_z.hist(grid = True, color = 'Purple')
# Render explicitly so the cell does not echo the array-of-Axes repr.
plt.show()

In [None]:
# Normality checks on the predictor: kurtosistest / skewtest return a statistic
# and p-value for the null hypothesis that the sample's kurtosis / skewness
# matches that of a normal distribution.
display(stats.kurtosistest(test_reduced_by_z['Average_Fleet_Annual_Utilization']))
display(stats.skewtest(test_reduced_by_z['Average_Fleet_Annual_Utilization']))

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['Average_Fleet_Annual_Utilization']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("Average_Fleet_Annual_Utilization VS Expenditure_12M_Total")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 8 Conclusion

## Test 9: x = Mean_SMU_Between_SOS y= Expenditure_12M_Total


### Test 9

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['Mean_SMU_Between_SOS', 'Expenditure_12M_Total']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['Mean_SMU_Between_SOS']
y = test_reduced_by_z['Expenditure_12M_Total']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise correlation (pandas default: Pearson) between predictor and target.
test_reduced_by_z.corr()

In [None]:
# define our input variable & output variable (double brackets keep both as
# 2-D DataFrames, the shape scikit-learn expects for X)
x = test_reduced_by_z[['Mean_SMU_Between_SOS']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)
# scikit-learn model used for prediction; it fits an intercept by default.
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)
# statsmodels OLS does not add an intercept on its own, so add the constant
# column explicitly; without it the summary, confidence intervals and p-values
# would describe a through-the-origin model, not the one fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the statsmodels OLS parameters (95% by default).
est.conf_int()

In [None]:
# p-values of the fitted statsmodels OLS parameters
est.pvalues

In [None]:
# Read intercept and slope from the scikit-learn model; both are indexed down
# to scalars because the model was fitted on a 2-D (single-column) target.
intercept = single_regression_model.intercept_[0]
coefficient = single_regression_model.coef_[0][0]
# print out
# NOTE(review): "{:2}" / "{:4}" are minimum field widths, not precisions, so
# both values print at full precision -- confirm that is intended.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the held-out test split, then score the predictions.
y_predict = single_regression_model.predict(x_test)
# calculate the mean squared error.
model_mse = mean_squared_error(y_test, y_predict)
# calculate the mean absolute error.
model_mae = mean_absolute_error(y_test, y_predict)
# calculate the root mean squared error
model_rmse =  math.sqrt(model_mse)
# display the output
# NOTE(review): "{:3}" is a minimum field width, not a precision.
print("MSE {:3}".format(model_mse))
print("MAE {:3}".format(model_mae))
print("RMSE {:3}".format(model_rmse))

In [None]:
# Coefficient of determination (R^2) on the test split, printed to 2 significant digits.
model_r2 = r2_score(y_test, y_predict)
print("R2 of the prediction is: {:.2}".format(model_r2))

In [None]:
# Residuals = actual minus predicted on the test split; plot their histogram.
(y_test - y_predict).hist(grid = False, color = 'Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations and the fitted regression line.
plt.scatter(x_test, y_test,  color='pink', label = 'Test observations')
plt.plot(x_test, y_predict, color='royalblue', linewidth = 3, linestyle= '-',label ='Regression Line')

plt.title("Mean_SMU_Between_SOS VS Expenditure_12M_Total")
# Label the axes with the real column names instead of placeholder "x"/"y".
plt.xlabel(x_test.columns[0])
plt.ylabel(y_test.columns[0])
plt.legend()
plt.show()

# Summary metrics; the value is shown in bold via ANSI escapes (\033[1m ... \033[0m).
# Each template is one literal so the .format() target is unambiguous.

# The coefficient
print('Coefficient:\033[1m{:.2}\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: \033[1m{:.4}\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: \033[1m{:.4}\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: \033[1m{:.2}\033[0m'.format(r2_score(y_test,y_predict)))

### Test 9 Conclusion

## Test 10: x = Mean_SMU_Between_Repairs y= Expenditure_12M_PartsNServices


### Test 10

In [None]:
# Build the two-column frame for this test: predictor first, target second.
test = data_dropna.loc[:, ['Mean_SMU_Between_Repairs', 'Expenditure_12M_PartsNServices']]

# Remove extreme outliers: keep a row only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any would need just one column).
test_reduced_by_z = test[(np.abs(stats.zscore(test)) < 3).all(axis=1)]

# x and y as Series for the exploratory scatter plot
x = test_reduced_by_z['Mean_SMU_Between_Repairs']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']

test_reduced_by_z.describe()

In [None]:
# Scatter of predictor vs. target after outlier removal.
# The points get a real label so plt.legend() has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_reduced_by_z.columns))
plt.xlabel(test_reduced_by_z.columns[0])
plt.ylabel(test_reduced_by_z.columns[1])
plt.legend()
plt.show()

In [None]:
# Remove negative x: keep only rows whose predictor value is >= 0.
test_absolute_x = test_reduced_by_z[test_reduced_by_z['Mean_SMU_Between_Repairs'] >= 0]
x = test_absolute_x['Mean_SMU_Between_Repairs']
y = test_absolute_x['Expenditure_12M_PartsNServices']
# Scatter of the remaining rows; the points get a real label so plt.legend()
# has an entry to show.
plt.plot(x,y,'o',color = 'pink', label = 'Observations (x >= 0)')
# Use the column names of the filtered frame as title and axis labels.
plt.title('{} VS {}'.format(*test_absolute_x.columns))
plt.xlabel(test_absolute_x.columns[0])
plt.ylabel(test_absolute_x.columns[1])
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the filtered columns.
test_absolute_x.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_absolute_x[['Mean_SMU_Between_Repairs']]
y = test_absolute_x[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "Mean_SMU_Between_Repairs"
y_name = "Expenditure_12M_PartsNServices"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 10 Conclusion

## Test 11: x = Family_Accounts y= Expenditure_12M_Total


### Test 11

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Family_Accounts'
response_col = 'Expenditure_12M_Total'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Family_Accounts'
y_name = 'Expenditure_12M_Total'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['Family_Accounts']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "Family_Accounts"
y_name = "Expenditure_12M_Total"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 11 Conclusion

## Test 12: x = Average_Count_SOS_Per_Unit_0_12M y= Expenditure_12M_Total


### Test 12

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Average_Count_SOS_Per_Unit_0_12M'
response_col = 'Expenditure_12M_Total'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Average_Count_SOS_Per_Unit_0_12M'
y_name = 'Expenditure_12M_Total'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

### Normalize Expenditure to ExpenditurePerUnit (TODO: still need to work out how to build the per-unit array; needs more time)

In [None]:
# Subset for the per-unit experiment: predictor, total expenditure and Count_CatFleetSize.
predictor_col = 'Average_Count_SOS_Per_Unit_0_12M'
expenditure_col = 'Expenditure_12M_Total'
fleet_size_col = 'Count_CatFleetSize'
test = data_dropna.loc[:, [predictor_col, expenditure_col, fleet_size_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every one of the three columns is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# x is the predictor; y is total expenditure divided by Count_CatFleetSize.
# NOTE(review): the model-fitting cell further down redefines y from the
# Expenditure_12M_Total column, so this per-unit series is not what gets modelled.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[expenditure_col] / test_reduced_by_z[fleet_size_col]

test_reduced_by_z.describe()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
# NOTE(review): y is the total expenditure here; the per-unit series computed in the
# subset cell above is overwritten and never modelled - confirm this is intended.
x = test_reduced_by_z[['Average_Count_SOS_Per_Unit_0_12M']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "Average_Count_SOS_Per_Unit_0_12M"
y_name = "Expenditure_12M_Total"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 12 Conclusion

## Test 13: x = Average_Count_WorkOrder_Per_Unit_0_12M y= Expenditure_12M_Total


### Test 13

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Average_Count_WorkOrder_Per_Unit_0_12M'
response_col = 'Expenditure_12M_Total'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Average_Count_WorkOrder_Per_Unit_0_12M'
y_name = 'Expenditure_12M_Total'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

### Normalize Expenditure to ExpenditurePerUnit (TODO: still need to work out how to build the per-unit array; needs more time)

In [None]:
# Rebuild the predictor/response subset for this test.
# NOTE(review): this cell selects the same two columns and applies the same filter as the
# first Test 13 subset cell; no per-unit normalization is applied here yet.
predictor_col = 'Average_Count_WorkOrder_Per_Unit_0_12M'
response_col = 'Expenditure_12M_Total'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Average_Count_WorkOrder_Per_Unit_0_12M'
y_name = 'Expenditure_12M_Total'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['Average_Count_WorkOrder_Per_Unit_0_12M']]
y = test_reduced_by_z[['Expenditure_12M_Total']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "Average_Count_WorkOrder_Per_Unit_0_12M"
y_name = "Expenditure_12M_Total"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 13 Conclusion

## Test 14: x = PartsDCAL y= Expenditure_12M_PartsNServices

### Test 14

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'PartsDCAL'
response_col = 'Expenditure_12M_PartsNServices'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'PartsDCAL'
y_name = 'Expenditure_12M_PartsNServices'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['PartsDCAL']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "PartsDCAL"
y_name = "Expenditure_12M_PartsNServices"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 14 Conclusion

## Test 15: x = ServiceDCAL y= Expenditure_12M_PartsNServices

### Test 15

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'ServiceDCAL'
response_col = 'Expenditure_12M_PartsNServices'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'ServiceDCAL'
y_name = 'Expenditure_12M_PartsNServices'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['ServiceDCAL']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "ServiceDCAL"
y_name = "Expenditure_12M_PartsNServices"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 15 conclusion

## Test 16: x = Count_FleetSize y= Count_CatFleetSize

### Test 16

In [None]:
# Keep only the pair of columns compared in this test.
predictor_col = 'Count_FleetSize'
response_col = 'Count_CatFleetSize'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# x and y as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the two fleet-size columns against each other.
x_name = 'Count_FleetSize'
y_name = 'Count_CatFleetSize'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

### Test 16 conclusion

In [None]:
# I agree Fleet size is highly correlated with cat fleet size and should not be used

## Test 17: x = Family_CatFleetSize y= Count_CatFleetSize

### Test 17

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Family_CatFleetSize'
response_col = 'Count_CatFleetSize'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Family_CatFleetSize'
y_name = 'Count_CatFleetSize'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['Family_CatFleetSize']]
y = test_reduced_by_z[['Count_CatFleetSize']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

In [None]:
# Residuals = observed minus predicted values on the test split; plot their distribution.
residuals = y_test - y_predict
residuals.hist(grid=False, color='Gold')
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the held-out observations together with the fitted regression line.
x_name = "Family_CatFleetSize"
y_name = "Count_CatFleetSize"
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

# Label the axes with the actual column names so the figure stands on its own.
plt.title("{} VS {}".format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
plt.legend()
plt.show()

# Slope of the fitted line (the \033[1m ... \033[0m escape codes render the value in bold)
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# Coefficient of determination (R2): 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 17 Conclusion

## Test 18: x = Family_FleetSize y= Family_CatFleetSize

### Test 18

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Family_FleetSize'
response_col = 'Family_CatFleetSize'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Family_FleetSize'
y_name = 'Family_CatFleetSize'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Pairwise Pearson correlation (pandas' default method) between the columns of this test's subset.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Predictor (x) and response (y), selected with a list so each stays a single-column DataFrame.
x = test_reduced_by_z[['Family_FleetSize']]
y = test_reduced_by_z[['Family_CatFleetSize']]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model on the training split (it estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same regression with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added explicitly;
# without it the summary would describe a through-the-origin model that differs from the
# scikit-learn fit above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters; alpha=0.05 (the default) gives 95% intervals.
est.conf_int(alpha=0.05)

In [None]:
# p-values of the parameters estimated by the statsmodels OLS fit (`est`)
est.pvalues

In [None]:
# Pull the fitted slope and intercept out of the scikit-learn model,
# indexing the fitted attributes down to plain scalars.
coefficient = single_regression_model.coef_[0][0]
intercept = single_regression_model.intercept_[0]

# Report both parameters of the fitted line.
# NOTE(review): '{:2}' / '{:4}' set a minimum field width, not a precision.
for template, value in (
    ("The Coefficient for training model is {:2}", coefficient),
    ("The intercept for training model is {:4}", intercept),
):
    print(template.format(value))

In [None]:
# Predict on the held-out split and score the predictions.
y_predict = single_regression_model.predict(x_test)

model_mse = mean_squared_error(y_test, y_predict)   # mean squared error
model_mae = mean_absolute_error(y_test, y_predict)  # mean absolute error
model_rmse = math.sqrt(model_mse)                   # root mean squared error

# Print the three error metrics, one per line.
for template, metric_value in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(metric_value))

In [None]:
# Coefficient of determination (R2) of the predictions on the test split.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_message = "R2 of the prediction is: {:.2}".format(model_r2)
print(r2_message)

### Test 18 conclusion

## Test 19: x = Family_CatFleetSize y= Expenditure_12M_PartsNServices

### Test 19

In [None]:
# Keep only the predictor/response pair examined in this test.
predictor_col = 'Family_CatFleetSize'
response_col = 'Expenditure_12M_PartsNServices'
test = data_dropna.loc[:, [predictor_col, response_col]]

# Drop extreme outliers: a row is kept only when the absolute z-score
# of every column is below 3.
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# Predictor (x) and response (y) as Series for the exploratory plot.
x = test_reduced_by_z[predictor_col]
y = test_reduced_by_z[response_col]

test_reduced_by_z.describe()

In [None]:
# Exploratory scatter plot of the response against the predictor.
x_name = 'Family_CatFleetSize'
y_name = 'Expenditure_12M_PartsNServices'
plt.plot(x, y, 'o', color='pink')
# Title and axis labels carry the column names so the figure stands on its own.
plt.title('{} VS {}'.format(x_name, y_name))
plt.xlabel(x_name)
plt.ylabel(y_name)
# A legend is omitted on purpose: there is only one, unlabelled, series to show.
plt.show()

In [None]:
# Restrict the subset to rows whose Family_CatFleetSize is at most 1500.
# The filter is applied to the already z-score-filtered frame and re-binds the same name,
# which the cells below keep using.
fleet_size_at_most_1500 = test_reduced_by_z['Family_CatFleetSize'] <= 1500
test_reduced_by_z = test_reduced_by_z[fleet_size_at_most_1500]

# Refresh the predictor (x) and response (y) Series from the narrowed frame.
x = test_reduced_by_z['Family_CatFleetSize']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']
test_reduced_by_z.describe()

In [None]:
# Scatter plot of the data after the Family_CatFleetSize <= 1500 cap.
# The title and axis labels carry the actual column names so the figure can be
# read on its own, and the points get a non-empty label so plt.legend() has an
# entry to show (matplotlib skips artists whose label is empty).
plt.plot(x, y, 'o', color='pink', label='Observations')
plt.title('Family_CatFleetSize VS Expenditure_12M_PartsNServices (Family_CatFleetSize <= 1500)')
plt.xlabel('Family_CatFleetSize')
plt.ylabel('Expenditure_12M_PartsNServices')
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation between the predictor and the response.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Define the input and output variables as single-column DataFrames
# (double brackets keep them 2-D for scikit-learn).
x = test_reduced_by_z[['Family_CatFleetSize']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Split x and y into training (80%) and test (20%) sets; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model (LinearRegression estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same specification with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added
# explicitly; without it the summary, confidence intervals and p-values would
# describe a regression through the origin instead of the model fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters at the 95% level.
est.conf_int(alpha=0.05)

In [None]:
# Show the p-values stored on the statsmodels results object `est`
# (one value per fitted parameter); displayed as the cell output.
est.pvalues

In [None]:
# Read the fitted parameters off the scikit-learn model.
coefficient = single_regression_model.coef_[0, 0]
intercept = single_regression_model.intercept_[0]

# Report the slope first, then the intercept.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the test split with the fitted scikit-learn model.
y_predict = single_regression_model.predict(x_test)

# Error metrics of the predictions against y_test:
# mean squared error, mean absolute error, and RMSE (square root of the MSE).
model_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
model_mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
model_rmse = math.sqrt(model_mse)

# Print one line per metric, each with its own format template.
error_report = (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
)
for metric_template, metric_value in error_report:
    print(metric_template.format(metric_value))

In [None]:
# Coefficient of determination (R^2) of the predictions against y_test.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_template = "R2 of the prediction is: {:.2}"
print(r2_template.format(model_r2))

In [None]:
# Residuals on the test split: observed minus predicted, shown as a histogram.
residuals = y_test - y_predict
residuals.hist(color = 'Gold', grid = False)
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations together with the fitted regression line.
# The scatter gets a non-empty label so the legend describes both series, and
# the axes are labelled with the column names instead of a generic "x"/"y".
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

plt.title("Family_CatFleetSize VS Expenditure_12M_PartsNServices")
plt.xlabel("Family_CatFleetSize")
plt.ylabel("Expenditure_12M_PartsNServices")
plt.legend()
plt.show()

# '\033[1m' / '\033[0m' are ANSI escape codes that switch bold text on and off.

# The coefficient (slope) of the fitted model
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 19 Conclusion

## Test 20: x = Family_Expenditure_12M_Equipment, y = Expenditure_12M_PartsNServices

### Test 20

In [None]:
# Pull the predictor/response pair for this test into its own frame.
pair_columns = ['Family_Expenditure_12M_Equipment', 'Expenditure_12M_PartsNServices']
test = data_dropna.loc[:, pair_columns]

# Remove extreme outliers: a row is kept only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any(axis=1) would keep a row as
# soon as a single column is within range).
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# x is the predictor column, y the response column.
x = test_reduced_by_z['Family_Expenditure_12M_Equipment']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']

test_reduced_by_z.describe()

In [None]:
# Scatter plot of the z-score-filtered data.
# The title and axis labels carry the actual column names so the figure can be
# read on its own, and the points get a non-empty label so plt.legend() has an
# entry to show (matplotlib skips artists whose label is empty).
plt.plot(x, y, 'o', color='pink', label='Observations')
plt.title('Family_Expenditure_12M_Equipment VS Expenditure_12M_PartsNServices')
plt.xlabel('Family_Expenditure_12M_Equipment')
plt.ylabel('Expenditure_12M_PartsNServices')
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation between the predictor and the response.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Define the input and output variables as single-column DataFrames
# (double brackets keep them 2-D for scikit-learn).
x = test_reduced_by_z[['Family_Expenditure_12M_Equipment']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Split x and y into training (80%) and test (20%) sets; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model (LinearRegression estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same specification with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added
# explicitly; without it the summary, confidence intervals and p-values would
# describe a regression through the origin instead of the model fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters at the 95% level.
est.conf_int(alpha=0.05)

In [None]:
# Show the p-values stored on the statsmodels results object `est`
# (one value per fitted parameter); displayed as the cell output.
est.pvalues

In [None]:
# Read the fitted parameters off the scikit-learn model.
coefficient = single_regression_model.coef_[0, 0]
intercept = single_regression_model.intercept_[0]

# Report the slope first, then the intercept.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the test split with the fitted scikit-learn model.
y_predict = single_regression_model.predict(x_test)

# Error metrics of the predictions against y_test:
# mean squared error, mean absolute error, and RMSE (square root of the MSE).
model_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
model_mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
model_rmse = math.sqrt(model_mse)

# Print one line per metric, each with its own format template.
error_report = (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
)
for metric_template, metric_value in error_report:
    print(metric_template.format(metric_value))

In [None]:
# Coefficient of determination (R^2) of the predictions against y_test.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_template = "R2 of the prediction is: {:.2}"
print(r2_template.format(model_r2))

In [None]:
# Residuals on the test split: observed minus predicted, shown as a histogram.
residuals = y_test - y_predict
residuals.hist(color = 'Gold', grid = False)
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations together with the fitted regression line.
# The scatter gets a non-empty label so the legend describes both series, and
# the axes are labelled with the column names instead of a generic "x"/"y".
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

plt.title("Family_Expenditure_12M_Equipment VS Expenditure_12M_PartsNServices")
plt.xlabel("Family_Expenditure_12M_Equipment")
plt.ylabel("Expenditure_12M_PartsNServices")
plt.legend()
plt.show()

# '\033[1m' / '\033[0m' are ANSI escape codes that switch bold text on and off.

# The coefficient (slope) of the fitted model
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 20 conclusion

## Test 21: x = Family_Expenditure_12M_PartsNServices, y = Expenditure_12M_PartsNServices

### Test 21

In [None]:
# Pull the predictor/response pair for this test into its own frame.
pair_columns = ['Family_Expenditure_12M_PartsNServices', 'Expenditure_12M_PartsNServices']
test = data_dropna.loc[:, pair_columns]

# Remove extreme outliers: a row is kept only when the absolute z-score of
# every column is below 3 (.all(axis=1); .any(axis=1) would keep a row as
# soon as a single column is within range).
abs_z_scores = np.abs(stats.zscore(test))
rows_within_3_sigma = (abs_z_scores < 3).all(axis=1)
test_reduced_by_z = test[rows_within_3_sigma]

# x is the predictor column, y the response column.
x = test_reduced_by_z['Family_Expenditure_12M_PartsNServices']
y = test_reduced_by_z['Expenditure_12M_PartsNServices']

test_reduced_by_z.describe()

In [None]:
# Scatter plot of the z-score-filtered data.
# The title and axis labels carry the actual column names so the figure can be
# read on its own, and the points get a non-empty label so plt.legend() has an
# entry to show (matplotlib skips artists whose label is empty).
plt.plot(x, y, 'o', color='pink', label='Observations')
plt.title('Family_Expenditure_12M_PartsNServices VS Expenditure_12M_PartsNServices')
plt.xlabel('Family_Expenditure_12M_PartsNServices')
plt.ylabel('Expenditure_12M_PartsNServices')
plt.legend()
plt.show()

In [None]:
# Pairwise Pearson correlation between the predictor and the response.
test_reduced_by_z.corr(method='pearson')

In [None]:
# Define the input and output variables as single-column DataFrames
# (double brackets keep them 2-D for scikit-learn).
# The predictor must be the column selected for this test: test_reduced_by_z
# only holds 'Family_Expenditure_12M_PartsNServices' and
# 'Expenditure_12M_PartsNServices', so any other column name raises KeyError.
x = test_reduced_by_z[['Family_Expenditure_12M_PartsNServices']]
y = test_reduced_by_z[['Expenditure_12M_PartsNServices']]

# Split x and y into training (80%) and test (20%) sets; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=1)

# Fit the scikit-learn model (LinearRegression estimates an intercept by default).
single_regression_model = LinearRegression()
single_regression_model.fit(x_train, y_train)

# Fit the same specification with statsmodels to get the inference summary.
# sm.OLS does not add an intercept on its own, so the constant column is added
# explicitly; without it the summary, confidence intervals and p-values would
# describe a regression through the origin instead of the model fitted above.
model = sm.OLS(y_train, sm.add_constant(x_train))
est = model.fit()
print(est.summary())

In [None]:
# Confidence intervals for the fitted OLS parameters at the 95% level.
est.conf_int(alpha=0.05)

In [None]:
# Show the p-values stored on the statsmodels results object `est`
# (one value per fitted parameter); displayed as the cell output.
est.pvalues

In [None]:
# Read the fitted parameters off the scikit-learn model.
coefficient = single_regression_model.coef_[0, 0]
intercept = single_regression_model.intercept_[0]

# Report the slope first, then the intercept.
print("The Coefficient for training model is {:2}".format(coefficient))
print("The intercept for training model is {:4}".format(intercept))

In [None]:
# Predict on the test split with the fitted scikit-learn model.
y_predict = single_regression_model.predict(x_test)

# Error metrics of the predictions against y_test:
# mean squared error, mean absolute error, and RMSE (square root of the MSE).
model_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
model_mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
model_rmse = math.sqrt(model_mse)

# Print one line per metric, each with its own format template.
error_report = (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
)
for metric_template, metric_value in error_report:
    print(metric_template.format(metric_value))

In [None]:
# Coefficient of determination (R^2) of the predictions against y_test.
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
r2_template = "R2 of the prediction is: {:.2}"
print(r2_template.format(model_r2))

In [None]:
# Residuals on the test split: observed minus predicted, shown as a histogram.
residuals = y_test - y_predict
residuals.hist(color = 'Gold', grid = False)
plt.title("Model Residuals")
plt.show()

In [None]:
# Plot the test observations together with the fitted regression line.
# The scatter gets a non-empty label so the legend describes both series, and
# the axes are labelled with the column names instead of a generic "x"/"y".
plt.scatter(x_test, y_test, color='pink', label='Test data')
plt.plot(x_test, y_predict, color='royalblue', linewidth=3, linestyle='-', label='Regression Line')

plt.title("Family_Expenditure_12M_PartsNServices VS Expenditure_12M_PartsNServices")
plt.xlabel("Family_Expenditure_12M_PartsNServices")
plt.ylabel("Expenditure_12M_PartsNServices")
plt.legend()
plt.show()

# '\033[1m' / '\033[0m' are ANSI escape codes that switch bold text on and off.

# The coefficient (slope) of the fitted model
print('Coefficient:' + '\033[1m' + '{:.2}''\033[0m'.format(single_regression_model.coef_[0][0]))

# The mean squared error
print('Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(model_mse))

# The root mean squared error
print('Root Mean squared error: ' + '\033[1m' + '{:.4}''\033[0m'.format(math.sqrt(model_mse)))

# R2 score: 1 is perfect prediction
print('R2 score: '+ '\033[1m' + '{:.2}''\033[0m'.format(r2_score(y_test,y_predict)))

### Test 21 Conclusion

# Next Steps

## Regularization

## Split dataset by POPS