# Lead generation Equipment value exploration

### Load libraries and data

In [None]:
#Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import math 
import pickle
import pylab
from scipy import stats
from scipy.stats import kurtosis, skew
from statsmodels.stats import diagnostic as diag
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

#Settings
%matplotlib inline
# Notebook-wide pandas display limits and number formatting
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.options.display.precision = 2
# Floats render with thousands separators and two decimals, padded to width 8
pd.options.display.float_format = '{:8,.2f}'.format

In [None]:
# Read the equipment lead workbook; rows are indexed by the SerialNumber column
data = pd.read_excel(
    io='..\\data\\LeadData_Equipment_All.xlsx',
    index_col='SerialNumber',
)

# Size of the raw data as (rows, columns)
data.shape

### Validate data

In [None]:
# Show, column by column, whether any value is missing
display(data.isna().any())

# Remove three columns outright, then discard every row that still contains a missing value
data_dropna = data.drop(
    columns=['OwnerCustomerNumber', 'FuelConsumptionPercentageOverAverage', 'IdlingPercent']
).dropna()

# Size after cleaning as (rows, columns)
data_dropna.shape

### Filtering Data

In [None]:
# Outlier handling.
# Two alternatives were tried before and are currently disabled:
#   * keep only rows whose z-score is below 3 in absolute value on every column
#   * clip every column at its 1st and 99th percentile
#
# Active rule: drop the rows (not columns) whose LTDSMU is at or above 99998.
# Rows are removed by index label, exactly as selected by the boolean condition.
data_filtered = data_dropna.drop(
    index=data_dropna.index[data_dropna['LTDSMU'] >= 99998]
)
data_filtered.shape

In [None]:
# Descriptive statistics for every column of the filtered data
desc_df = data_filtered.describe()

# Append two reference rows: the mean plus / minus three standard deviations
desc_df.loc['+3_std'] = desc_df.loc['mean'] + 3 * desc_df.loc['std']
desc_df.loc['-3_std'] = desc_df.loc['mean'] - 3 * desc_df.loc['std']
desc_df

In [None]:
# Pairwise correlation of all columns, rounded to two decimals for display
corr_matrix = data_filtered.corr().round(2)
display(corr_matrix)

def get_sorted_corr(df):
    """Return the absolute pairwise correlations of *df*, strongest first.

    The correlation matrix is flattened with ``unstack``, so the result is a
    Series indexed by ``(column, column)`` pairs. Every ordered pair is kept,
    including each column paired with itself and both orderings of a pair.
    """
    crr = df.corr().abs().unstack()
    # The name promises a sorted result: order by magnitude, largest first
    return crr.sort_values(ascending=False)

# def get_redundant_pairs(df):
#     pairs_to_drop = set()
#     cols = df.columns
#     for i in range(0, df.shape[1]):
#         for j in range(0, i+1):
#             pairs_to_drop.add((cols[i], cols[j]))
#     return pairs_to_drop

# Flatten the absolute correlation matrix into (column, column) pairs and write it to CSV.
# The correlation is computed once; the earlier extra call only repeated the work and
# threw the result away.
sorted_corr = get_sorted_corr(data_filtered)
sorted_corr.to_csv('..\\data\\Output\\Sorted_Correlation_Equipment.csv')

In [None]:
#Create chart for correlation

# Mask the upper triangle (diagonal included) so each pair is drawn only once.
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Set up the matplotlib figure
fig, ax = plt.subplots(figsize=(120, 120))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)

# Draw on the axes created above rather than relying on the implicit current axes
heatmap = sns.heatmap(corr_matrix, mask=mask, cmap=cmap,
                      square=True, linewidths=3,
                      cbar_kws={'shrink': 0.6},
                      vmin=-1, vmax=1,
                      annot=True, annot_kws={'size': 2},
                      ax=ax)

# add the column names as labels
ax.set_yticklabels(corr_matrix.columns)
ax.set_xticklabels(corr_matrix.columns)

# plt.show()
plt.savefig('..\\data\\Output\\Correlation_Matrix_Equipment')

## Assumption: Y = PARTS_EQ_0_12M

### Start

In [None]:
# Show the column names of the filtered frame
data_filtered.columns

In [None]:
# Remove the predictors with very low correlation to Y.
# Each name is annotated with the correlation value recorded for it.
data_filtered_by_coe = data_filtered.drop(
    columns=[
        'ServiceDCAL',                        # 0.0499822973976113
        'Warranty_Days_Remaining',            # 0.042695939798081
        'UtilizationOverFleetAverage',        # 0.0420180012748909
        'SERVICES_Count_Per_CatFleetSize',    # 0.0332625199089713
        'Age_Months',                         # 0.0276883817480741
        'PARTS_Count_Per_500HR_Utilized',     # 0.0275643425736726
        'PMContract_Only',                    # 0.0242680199242601
        'MARC',                               # 0.0231321538648038
        'Average_Fleet_Age_Months',           # 0.0228474665102275
        'PartsReturnPercentage_CU',           # 0.0156453002524307
        'CSA_Days_Remaining',                 # 0.0127407929009288
        'SERVICES_Count_Per_500HR_Utilized',  # 0.0107504356479196
        'Percentage_Cat',                     # 0.00305093718675929
        'Lease_Days_Remaining',               # 0.00276979550655567
        'ServicesBackOutPercentage_CU',       # 0.00207836673250886
        'SOS_Count_Per_500HR_Utilized',       # 0.00176269379001512
        'AverageDaysToPay',                   # 0.000305755645728231
    ]
)
data_filtered_by_coe.columns

### Test for Multicollinearity

#### VIF Test

In [None]:
# define our input variable (X) & output variable
X = data_filtered_by_coe.drop('PARTS_EQ_0_12M', axis = 1)
Y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

# VIF measures collinearity among the explanatory variables only, so both frames are
# built from X. Leaving the target in the design matrix would inflate the factor of
# every predictor that is correlated with it.
df_before = X
df_after = X.drop(['Percentage_PLDeviceSerialNumber',
                   'Expenditure_12M_PartsNServices'], axis=1)

# the VIF calculation expects a constant term in the data, so add one to each frame
X1 = sm.tools.add_constant(df_before)
X2 = sm.tools.add_constant(df_after)

# one VIF per column (constant included), labelled with the column name
series_before = pd.Series([variance_inflation_factor(X1.values, i) for i in range(X1.shape[1])], index=X1.columns)
series_after = pd.Series([variance_inflation_factor(X2.values, i) for i in range(X2.shape[1])], index=X2.columns)

# display the series
print('DATA BEFORE')
print('-'*100)
display(series_before)

print('DATA AFTER')
print('-'*100)
display(series_after)

In [None]:
# Excess (Fisher) kurtosis of the target, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(data_filtered_by_coe['PARTS_EQ_0_12M'], fisher=True)
display("PARTS_EQ_0_12M kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(data_filtered_by_coe['PARTS_EQ_0_12M']))

# Skewness of the target, then scipy's skewness test
y_skew = stats.skew(data_filtered_by_coe['PARTS_EQ_0_12M'])
display("PARTS_EQ_0_12M skew: {:2}".format(y_skew))
display(stats.skewtest(data_filtered_by_coe['PARTS_EQ_0_12M']))

#### Variable test 1 WODollar_Exclude12M

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['WODollar_Exclude12M', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Scatter_Plot_PARTS_EQ_0_12M_EQ_VS_WODollar_Exclude12M')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['WODollar_Exclude12M'], fisher=True)
display("WODollar_Exclude12M kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['WODollar_Exclude12M']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['WODollar_Exclude12M'])
display("WODollar_Exclude12M skew: {:2}".format(y_skew))
display(stats.skewtest(test['WODollar_Exclude12M']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 2 SERVICES_COUNT_EQ_0_12M

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['SERVICES_COUNT_EQ_0_12M', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Scatter_Plot_PARTS_EQ_0_12M_EQ_VS_SERVICES_COUNT_EQ_0_12M')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['SERVICES_COUNT_EQ_0_12M'], fisher=True)
display("SERVICES_COUNT_EQ_0_12M kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['SERVICES_COUNT_EQ_0_12M']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['SERVICES_COUNT_EQ_0_12M'])
display("SERVICES_COUNT_EQ_0_12M skew: {:2}".format(y_skew))
display(stats.skewtest(test['SERVICES_COUNT_EQ_0_12M']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 3 SERVICES_Total_Per_CatFleetSize

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['SERVICES_Total_Per_CatFleetSize', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\SERVICES_Total_Per_CatFleetSize')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['SERVICES_Total_Per_CatFleetSize'], fisher=True)
display("SERVICES_Total_Per_CatFleetSize kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['SERVICES_Total_Per_CatFleetSize']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['SERVICES_Total_Per_CatFleetSize'])
display("SERVICES_Total_Per_CatFleetSize skew: {:2}".format(y_skew))
display(stats.skewtest(test['SERVICES_Total_Per_CatFleetSize']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 4 SOS_COUNT_EQ_0_12M

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['SOS_COUNT_EQ_0_12M', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\SOS_COUNT_EQ_0_12M')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['SOS_COUNT_EQ_0_12M'], fisher=True)
display("SOS_COUNT_EQ_0_12M kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['SOS_COUNT_EQ_0_12M']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['SOS_COUNT_EQ_0_12M'])
display("SOS_COUNT_EQ_0_12M skew: {:2}".format(y_skew))
display(stats.skewtest(test['SOS_COUNT_EQ_0_12M']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 5 SOS_Count_Per_CatFleetSize

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['SOS_Count_Per_CatFleetSize', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\SOS_Count_Per_CatFleetSize')

In [None]:
# This section examines SOS_Count_Per_CatFleetSize. The cell previously read
# 'Age_Months' from an undefined frame `test1` (a copy-paste leftover), which
# raises NameError; it now uses the `test` subset built for this variable.

# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = kurtosis(test['SOS_Count_Per_CatFleetSize'], fisher = True)
display("SOS_Count_Per_CatFleetSize kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['SOS_Count_Per_CatFleetSize']))

# Skewness of the predictor, then scipy's skewness test
y_skew = skew(test['SOS_Count_Per_CatFleetSize'])
display("SOS_Count_Per_CatFleetSize skew: {:2}".format(y_skew))
display(stats.skewtest(test['SOS_Count_Per_CatFleetSize']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 6 SERVICES_Count_Per_CatFleetSize

In [None]:
# 'SERVICES_Count_Per_CatFleetSize' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['SERVICES_Count_Per_CatFleetSize', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\SERVICES_Count_Per_CatFleetSize')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['SERVICES_Count_Per_CatFleetSize'], fisher=True)
display("SERVICES_Count_Per_CatFleetSize kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['SERVICES_Count_Per_CatFleetSize']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['SERVICES_Count_Per_CatFleetSize'])
display("SERVICES_Count_Per_CatFleetSize skew: {:2}".format(y_skew))
display(stats.skewtest(test['SERVICES_Count_Per_CatFleetSize']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 7 ServiceDCAL

In [None]:
# 'ServiceDCAL' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['ServiceDCAL', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\ServiceDCAL')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['ServiceDCAL'], fisher=True)
display("ServiceDCAL kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['ServiceDCAL']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['ServiceDCAL'])
display("ServiceDCAL skew: {:2}".format(y_skew))
display(stats.skewtest(test['ServiceDCAL']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 8 MARC

In [None]:
# 'MARC' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['MARC', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\MARC')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['MARC'], fisher=True)
display("MARC kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['MARC']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['MARC'])
display("MARC skew: {:2}".format(y_skew))
display(stats.skewtest(test['MARC']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 9 Avg_AnnualUsage

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['Avg_AnnualUsage', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Avg_AnnualUsage')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Avg_AnnualUsage'], fisher=True)
display("Avg_AnnualUsage kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Avg_AnnualUsage']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Avg_AnnualUsage'])
display("Avg_AnnualUsage skew: {:2}".format(y_skew))
display(stats.skewtest(test['Avg_AnnualUsage']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 10 CalculatedAnnualUsage

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['CalculatedAnnualUsage', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\CalculatedAnnualUsage')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['CalculatedAnnualUsage'], fisher=True)
display("CalculatedAnnualUsage kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['CalculatedAnnualUsage']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['CalculatedAnnualUsage'])
display("CalculatedAnnualUsage skew: {:2}".format(y_skew))
display(stats.skewtest(test['CalculatedAnnualUsage']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 11 PLSubscriptionActive

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['PLSubscriptionActive', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\PLSubscriptionActive')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['PLSubscriptionActive'], fisher=True)
display("PLSubscriptionActive kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['PLSubscriptionActive']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['PLSubscriptionActive'])
display("PLSubscriptionActive skew: {:2}".format(y_skew))
display(stats.skewtest(test['PLSubscriptionActive']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 12 PercentageFleetUtilized

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['PercentageFleetUtilized', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\PercentageFleetUtilized')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['PercentageFleetUtilized'], fisher=True)
display("PercentageFleetUtilized kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['PercentageFleetUtilized']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['PercentageFleetUtilized'])
display("PercentageFleetUtilized skew: {:2}".format(y_skew))
display(stats.skewtest(test['PercentageFleetUtilized']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 13 CSA_Days_Remaining

In [None]:
# 'CSA_Days_Remaining' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['CSA_Days_Remaining', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\CSA_Days_Remaining')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['CSA_Days_Remaining'], fisher=True)
display("CSA_Days_Remaining kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['CSA_Days_Remaining']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['CSA_Days_Remaining'])
display("CSA_Days_Remaining skew: {:2}".format(y_skew))
display(stats.skewtest(test['CSA_Days_Remaining']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 14 Percentage_PLSubscriptionActive

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['Percentage_PLSubscriptionActive', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Percentage_PLSubscriptionActive')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Percentage_PLSubscriptionActive'], fisher=True)
display("Percentage_PLSubscriptionActive kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Percentage_PLSubscriptionActive']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Percentage_PLSubscriptionActive'])
display("Percentage_PLSubscriptionActive skew: {:2}".format(y_skew))
display(stats.skewtest(test['Percentage_PLSubscriptionActive']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 15 Percentage_PLDeviceSerialNumber

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['Percentage_PLDeviceSerialNumber', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Percentage_PLDeviceSerialNumber')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Percentage_PLDeviceSerialNumber'], fisher=True)
display("Percentage_PLDeviceSerialNumber kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Percentage_PLDeviceSerialNumber']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Percentage_PLDeviceSerialNumber'])
display("Percentage_PLDeviceSerialNumber skew: {:2}".format(y_skew))
display(stats.skewtest(test['Percentage_PLDeviceSerialNumber']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 16 OLGAAmount

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['OLGAAmount', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\OLGAAmount')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['OLGAAmount'], fisher=True)
display("OLGAAmount kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['OLGAAmount']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['OLGAAmount'])
display("OLGAAmount skew: {:2}".format(y_skew))
display(stats.skewtest(test['OLGAAmount']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 17 POPS_EQ

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['POPS_EQ', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\POPS_EQ')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['POPS_EQ'], fisher=True)
display("POPS_EQ kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['POPS_EQ']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['POPS_EQ'])
display("POPS_EQ skew: {:2}".format(y_skew))
display(stats.skewtest(test['POPS_EQ']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 18 PMContract_Only

In [None]:
# 'PMContract_Only' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['PMContract_Only', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\PMContract_Only')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['PMContract_Only'], fisher=True)
display("PMContract_Only kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['PMContract_Only']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['PMContract_Only'])
display("PMContract_Only skew: {:2}".format(y_skew))
display(stats.skewtest(test['PMContract_Only']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 19 PartsDCAL

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['PartsDCAL', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\PartsDCAL')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['PartsDCAL'], fisher=True)
display("PartsDCAL kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['PartsDCAL']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['PartsDCAL'])
display("PartsDCAL skew: {:2}".format(y_skew))
display(stats.skewtest(test['PartsDCAL']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 20 Expenditure_12M_Equipment

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['Expenditure_12M_Equipment', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Expenditure_12M_Equipment')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Expenditure_12M_Equipment'], fisher=True)
display("Expenditure_12M_Equipment kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Expenditure_12M_Equipment']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Expenditure_12M_Equipment'])
display("Expenditure_12M_Equipment skew: {:2}".format(y_skew))
display(stats.skewtest(test['Expenditure_12M_Equipment']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 21 LTDSMU

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['LTDSMU', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\LTDSMU')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['LTDSMU'], fisher=True)
display("LTDSMU kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['LTDSMU']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['LTDSMU'])
display("LTDSMU skew: {:2}".format(y_skew))
display(stats.skewtest(test['LTDSMU']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 22 Age_Months

In [None]:
# 'Age_Months' is one of the low-correlation columns removed when
# data_filtered_by_coe was built, so selecting it from that frame raises KeyError.
# Read the pair from data_filtered, which still holds both columns.
test = data_filtered[['Age_Months', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(test, kind='reg', diag_kind='hist',
             height=6, aspect=1,
             plot_kws={'scatter_kws': {'alpha': 0.7}})
plt.savefig('..\\data\\Output\\Age_Months')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Age_Months'], fisher=True)
display("Age_Months kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Age_Months']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Age_Months'])
display("Age_Months skew: {:2}".format(y_skew))
display(stats.skewtest(test['Age_Months']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

#### Variable test 23 Expenditure_12M_PartsNServices

In [None]:
# Two-column subset: the candidate predictor next to the target PARTS_EQ_0_12M
test = data_filtered_by_coe[['Expenditure_12M_PartsNServices', 'PARTS_EQ_0_12M']]

# Pair plot with regression fits and histogram diagonals, saved to the output folder
sns.pairplot(
    data=test, kind='reg', diag_kind='hist', height=6, aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)
plt.savefig('..\\data\\Output\\Expenditure_12M_PartsNServices')

In [None]:
# Excess (Fisher) kurtosis of the predictor, then scipy's kurtosis test
y_kurtosis = stats.kurtosis(test['Expenditure_12M_PartsNServices'], fisher=True)
display("Expenditure_12M_PartsNServices kurtosis: {:2}".format(y_kurtosis))
display(stats.kurtosistest(test['Expenditure_12M_PartsNServices']))

# Skewness of the predictor, then scipy's skewness test
y_skew = stats.skew(test['Expenditure_12M_PartsNServices'])
display("Expenditure_12M_PartsNServices skew: {:2}".format(y_skew))
display(stats.skewtest(test['Expenditure_12M_PartsNServices']))

In [None]:
# Correlation matrix (pandas default method: Pearson) of the two columns currently held in `test`
test.corr()

### OLS

In [None]:
# define our input variable (X) & output variable
X = data_filtered_by_coe.drop('PARTS_EQ_0_12M', axis = 1)
Y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

# statsmodels' OLS does not add an intercept on its own. Without the constant column
# the fit is forced through the origin, the summary reports an uncentered R-squared,
# the residuals need not average to zero, and the Breusch-Pagan test that is later run
# on results.model.exog has no constant term to work with.
model = sm.OLS(Y, sm.add_constant(X))
results = model.fit()
print(results.summary())

In [None]:
# Predictors are every column except the target; the target stays a one-column frame
X = data_filtered_by_coe.drop(columns='PARTS_EQ_0_12M')
Y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

# Fit a scikit-learn linear regression on the full data set (fit returns the estimator)
regression_model = LinearRegression().fit(X, Y)

In [None]:
# Intercept and first coefficient of the fitted model (single target, hence index 0).
# `coefficent` is not used by the printing code in this cell.
intercept = regression_model.intercept_[0]
coefficent = regression_model.coef_[0, 0]

print("The intercept for our model is {:.4}".format(intercept))
print('-'*100)

# One line per predictor: column name paired with its fitted coefficient
for name, weight in zip(X.columns, regression_model.coef_[0]):
    print("The Coefficient for {} is {:.2}".format(name, weight))

In [None]:
# p-values of the coefficients from the statsmodels fit held in `results`
results.pvalues

In [None]:
# Confidence intervals of the coefficients (statsmodels default level: 95%)
results.conf_int()

### Train model

In [None]:
# Predictors are every column except the target; the target stays a one-column frame
X = data_filtered_by_coe.drop(columns='PARTS_EQ_0_12M')
Y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.20, random_state=1
)

# Fit on the training part only, then predict the held-out part
regression_model = LinearRegression().fit(X_train, y_train)
y_predict = regression_model.predict(X_test)

In [None]:
# Error metrics on the held-out set: squared error, absolute error, and the
# root of the squared error
model_mse = mean_squared_error(y_true=y_test, y_pred=y_predict)
model_mae = mean_absolute_error(y_true=y_test, y_pred=y_predict)
model_rmse = math.sqrt(model_mse)

# Report each metric with three significant digits
print("MSE {:.3}".format(model_mse))
print("MAE {:.3}".format(model_mae))
print("RMSE {:.3}".format(model_rmse))

In [None]:
# Coefficient of determination of the predictions on the held-out set
model_r2 = r2_score(y_true=y_test, y_pred=y_predict)
print("R2: {:.2}".format(model_r2))

### Testing for Autocorrelation

In [None]:
# define our input variable (X) & output variable
X = data_filtered_by_coe.drop('PARTS_EQ_0_12M', axis = 1)
Y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

# test for Autocorrelation
from statsmodels.stats.stattools import durbin_watson

# design matrix with an intercept column
X2 = sm.add_constant(X)

# create and fit the OLS model whose residuals are tested
model = sm.OLS(Y, X2)
est = model.fit()

# number of lags to test: 10, or fewer when there are under 50 observations
lag = min(10, (len(X)//5))
print('The number of lags will be {}'.format(lag))
print('-'*100)

# run the Ljung-Box test for no autocorrelation of residuals
# (alternative kept for reference: diag.acorr_breusch_godfrey(est, nlags = lag, store = True))
test_results = diag.acorr_ljungbox(est.resid, lags = lag)

# Newer statsmodels releases return a DataFrame with 'lb_stat' / 'lb_pvalue' columns,
# older ones a (statistics, p-values) tuple. Tuple-unpacking a DataFrame would yield
# its column labels instead of the values, so select the columns by name in that case.
if isinstance(test_results, pd.DataFrame):
    ibvalue, p_val = test_results['lb_stat'], test_results['lb_pvalue']
else:
    ibvalue, p_val = test_results

# print the results of the test; the decision is based on the smallest p-value over all lags
lowest_p = min(p_val)
print("The lowest p-value found was {:.4}".format(lowest_p))
if lowest_p > 0.05:
    print("We fail to reject the null hypthoesis, so there is no autocorrelation.")
else:
    print("We reject the null hypthoesis, so there is autocorrelation.")
print('-'*100)

# plot autocorrelation
sm.graphics.tsa.plot_acf(est.resid)
plt.show()

### Testing the Mean of the Residuals Equals 0

In [None]:
# Q-Q plot of the residuals from the fit held in `results`, as a visual normality check
# (line='s' adds a standardized reference line)
sm.qqplot(results.resid, line='s')
pylab.show()

# Mean of the residuals, computed as sum / count; it should be approximately 0.
# NOTE(review): that expectation relies on the model behind `results` having an
# intercept term — confirm against the cell that created `results`.
mean_residuals = sum(results.resid)/ len(results.resid)
print("The mean of the residuals is {:.4}".format(mean_residuals))

### Testing for Heteroskedasticity

In [None]:
# Predictor frame and one-column target frame.
# The Breusch-Pagan cell that follows reads from `results`, not from these two names.
x = data_filtered_by_coe.drop(columns='PARTS_EQ_0_12M')
y = data_filtered_by_coe[['PARTS_EQ_0_12M']]

In [None]:
# Breusch-Pagan test on the residuals of `results` against that model's regressors.
# Only the two p-values are kept; the two test statistics are discarded.
_, pval, __, f_pval = diag.het_breuschpagan(results.resid, results.model.exog)
print(pval, f_pval)
print('-'*100)

# Both outcomes share the same two leading lines; only the verdict differs
print("For the Breusch-Pagan's Test")
print("The p-value was {:.4}".format(pval))
if pval > 0.05:
    print("We fail to reject the null hypthoesis, so there is no heterosecdasticity.")
else:
    print("We reject the null hypthoesis, so there is heterosecdasticity.")




### Re-Test After Adjustment

In [None]:
# define our input variable (X) & output variable
# NOTE(review): this uses data_filtered (all columns), not a frame with the
# low-correlation or collinear columns removed — confirm this is the intended
# "adjusted" data set.
X = data_filtered.drop('PARTS_EQ_0_12M', axis = 1)
Y = data_filtered[['PARTS_EQ_0_12M']]

#Summary
# statsmodels' OLS does not add an intercept on its own; without the constant column
# the fit is forced through the origin and the reported R-squared is the uncentered one.
model = sm.OLS(Y, sm.add_constant(X))
results = model.fit()
print(results.summary())