# Table of contents
  Each section has its own libraries and data, so the sections do not need to be run all together.
 #1. Exploration: 
    #1. Use scatter for SN2 vs Age to see trend
    #2. Select and apply interp1d model (one serial at a time)
 #2. Testing with split for DBS and Delivery:
    #1. Split population before entering loop
    #2. Split by SN after entering loop
    #3. Test smoothing kind linear vs nearest
 #3. Loop for batch estimate and check output
 #4. Export

#  Exploration

In [None]:
import pandas as pd
import seaborn as sns

In [None]:
# Read the SQL export workbook, indexed by EquipmentCode, and show its dimensions.
df = pd.read_excel(
    io='..\\data\\Equipment_Age_SQL_Export.xlsx',
    index_col='EquipmentCode',
)
df.shape

In [None]:
# Distinct SN1 values in first-seen order (dict keys preserve insertion order).
Prefix = list(dict.fromkeys(df['SN1']))
len(Prefix)

In [None]:
# Start with Age_From_DBS. SN2 is needed for matching; the descriptive columns
# are dropped to keep the plots simple.
plt = df.drop(columns=['SerialNumber', 'Territory', 'Model', 'SISModel'])
plt.columns

In [None]:
# Keep only rows whose SN1 is 'BZY', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('BZY')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

In [None]:
# Keep only rows whose SN1 is 'RKB', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('RKB')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

In [None]:
# Keep only rows whose SN1 is 'FJH', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('FJH')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

In [None]:
# Keep only rows whose SN1 is 'RZA', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('RZA')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

In [None]:
# Keep only rows whose SN1 is 'TZA', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('TZA')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

In [None]:
# Keep only rows whose SN1 is 'DFY', then draw pairwise regression plots.
plot = plt.drop(index=plt.index[plt['SN1'].ne('DFY')])

sns.pairplot(
    plot,
    kind='reg',
    diag_kind='hist',
    height=4,
    aspect=1,
    plot_kws={'scatter_kws': {'alpha': 0.7}},
)

# Model

In [None]:
import math
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy import interpolate
from scipy.interpolate import *

# Limit how many rows and columns pandas prints for a frame.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 10)

## DBS

In [None]:
# Since we are filling missing data along a sequential curve, an interpolation
# function is used to estimate the cat production rate.
DBS = df.drop(['SerialNumber','Territory','Model','SISModel','Age_In_Months_From_Delivery_DBS'],axis=1)
BZY_DBS = DBS.drop(DBS.index[DBS.SN1 != 'BZY']).dropna()
# Split: 10% of the BZY rows are held out for scoring the models.
Train_DBS, Test_DBS = train_test_split(BZY_DBS, test_size=0.1,random_state=50)
# Transform: interp1d takes 1-D x and y.
X1 = Train_DBS[['SN2']].squeeze().astype(int)
Y1 = Train_DBS[['Age_In_Months_From_DBS']].squeeze().astype(int)
# scipy interp1d, one model per kind. All three extrapolate outside the training
# SN2 range so that every held-out row gets a prediction. fill_value must be the
# literal string "extrapolate"; any other string leaves bounds_error at its
# default, which raises ValueError for out-of-range x.
model_DBS_linear = interp1d(X1, Y1, kind='linear', bounds_error=0, fill_value="extrapolate")
model_DBS_nearest = interp1d(X1, Y1, kind='nearest', fill_value="extrapolate")
# NOTE(review): kind='cubic' rejects duplicate x values; assumes SN2 is unique
# within the BZY training rows - confirm.
model_DBS_cubic = interp1d(X1, Y1, kind='cubic', fill_value="extrapolate")

In [None]:
# Use the fitted models to predict the held-out test rows.
# interp1d accepts a whole array, so each model is evaluated once on the SN2
# column instead of once per row. assign() returns a new frame, so nothing is
# written into the slice handed back by train_test_split.
Test_DBS = Test_DBS.assign(
    Predicted_Age_By_DBS_Linear=model_DBS_linear(Test_DBS['SN2']),
    Predicted_Age_By_DBS_Nearest=model_DBS_nearest(Test_DBS['SN2']),
    Predicted_Age_By_DBS_Cubic=model_DBS_cubic(Test_DBS['SN2']),
)

In [None]:
# Check DBS age prediction errors on the held-out rows, one group of
# MSE / MAE / RMSE per interpolation kind. Each RMSE is the square root of that
# kind's own MSE.
Y = Test_DBS[['Age_In_Months_From_DBS']]
X_linear = Test_DBS[['Predicted_Age_By_DBS_Linear']]

model_mse = mean_squared_error(Y, X_linear)
model_mae = mean_absolute_error(Y, X_linear)
model_rmse =  math.sqrt(model_mse)

print("MSE for Linear {:3}".format(model_mse))
print("MAE for Linear {:3}".format(model_mae))
print("RMSE for Linear {:3}".format(model_rmse))

X_nearest = Test_DBS[['Predicted_Age_By_DBS_Nearest']]
model_mse2 = mean_squared_error(Y, X_nearest)
model_mae2 = mean_absolute_error(Y, X_nearest)
model_rmse2 =  math.sqrt(model_mse2)

print("MSE for Nearest {:3}".format(model_mse2))
print("MAE for Nearest  {:3}".format(model_mae2))
print("RMSE for Nearest  {:3}".format(model_rmse2))

X_cubic = Test_DBS[['Predicted_Age_By_DBS_Cubic']]
model_mse3 = mean_squared_error(Y, X_cubic)
model_mae3 = mean_absolute_error(Y, X_cubic)
model_rmse3 =  math.sqrt(model_mse3)

print("MSE for Cubic {:3}".format(model_mse3))
print("MAE for Cubic  {:3}".format(model_mae3))
print("RMSE for Cubic  {:3}".format(model_rmse3))

In [None]:
# Actual vs. predicted DBS age on the test rows (linear interpolation).
sns.regplot(
    data=Test_DBS,
    x="Predicted_Age_By_DBS_Linear",
    y="Age_In_Months_From_DBS",
)

In [None]:
# Actual vs. predicted DBS age on the test rows (nearest interpolation).
sns.regplot(
    data=Test_DBS,
    x="Predicted_Age_By_DBS_Nearest",
    y="Age_In_Months_From_DBS",
)

In [None]:
# Actual vs. predicted DBS age on the test rows (cubic interpolation).
sns.regplot(
    data=Test_DBS,
    x="Predicted_Age_By_DBS_Cubic",
    y="Age_In_Months_From_DBS",
)

## Delivery_DBS

In [None]:
# Same experiment as the DBS section, with the delivery-based age as the target.
Delivery_DBS = df.drop(['SerialNumber','Territory','Model','SISModel','Age_In_Months_From_DBS'],axis=1)
BZY_Delivery_DBS = Delivery_DBS.drop(Delivery_DBS.index[Delivery_DBS.SN1 != 'BZY']).dropna()

# Hold out 10% of the BZY rows for scoring.
Train_Delivery, Test_Delivery = train_test_split(BZY_Delivery_DBS, test_size=0.1,random_state=50)

X2 = Train_Delivery[['SN2']].squeeze().astype(int)
Y2 = Train_Delivery[['Age_In_Months_From_Delivery_DBS']].squeeze().astype(int)

# All three kinds are fitted on the delivery training data (X2, Y2) and
# extrapolate outside the training SN2 range. fill_value must be the literal
# string "extrapolate"; any other string leaves bounds_error at its default,
# which raises ValueError for out-of-range x.
model_Delivery_linear = interp1d(X2, Y2, kind='linear', fill_value="extrapolate")
model_Delivery_nearest = interp1d(X2, Y2, kind='nearest', fill_value="extrapolate")
# NOTE(review): kind='cubic' rejects duplicate x values; assumes SN2 is unique
# within the BZY training rows - confirm.
model_Delivery_cubic = interp1d(X2, Y2, kind='cubic', fill_value="extrapolate")

In [None]:
# Predict the held-out delivery rows with each model.
# interp1d accepts a whole array, so each model is evaluated once on the SN2
# column instead of once per row. assign() returns a new frame, so nothing is
# written into the slice handed back by train_test_split.
Test_Delivery = Test_Delivery.assign(
    Predicted_Age_By_Delivery_Linear=model_Delivery_linear(Test_Delivery['SN2']),
    Predicted_Age_By_Delivery_Nearest=model_Delivery_nearest(Test_Delivery['SN2']),
    Predicted_Age_By_Delivery_Cubic=model_Delivery_cubic(Test_Delivery['SN2']),
)

In [None]:
# Check delivery age prediction errors on the held-out rows, one group of
# MSE / MAE / RMSE per interpolation kind. Each group is scored against its own
# prediction column, and each RMSE is the square root of that kind's own MSE.
Y = Test_Delivery[['Age_In_Months_From_Delivery_DBS']]
X_linear = Test_Delivery[['Predicted_Age_By_Delivery_Linear']]

model_mse = mean_squared_error(Y, X_linear)
model_mae = mean_absolute_error(Y, X_linear)
model_rmse =  math.sqrt(model_mse)

print("MSE for Linear {:3}".format(model_mse))
print("MAE for Linear {:3}".format(model_mae))
print("RMSE for Linear {:3}".format(model_rmse))

X_nearest = Test_Delivery[['Predicted_Age_By_Delivery_Nearest']]
model_mse2 = mean_squared_error(Y, X_nearest)
model_mae2 = mean_absolute_error(Y, X_nearest)
model_rmse2 =  math.sqrt(model_mse2)

print("MSE for Nearest {:3}".format(model_mse2))
print("MAE for Nearest  {:3}".format(model_mae2))
print("RMSE for Nearest  {:3}".format(model_rmse2))

X_cubic = Test_Delivery[['Predicted_Age_By_Delivery_Cubic']]
model_mse3 = mean_squared_error(Y, X_cubic)
model_mae3 = mean_absolute_error(Y, X_cubic)
model_rmse3 =  math.sqrt(model_mse3)

print("MSE for Cubic {:3}".format(model_mse3))
print("MAE for Cubic  {:3}".format(model_mae3))
print("RMSE for Cubic  {:3}".format(model_rmse3))

In [None]:
# Actual vs. predicted delivery age on the test rows (linear interpolation).
sns.regplot(
    data=Test_Delivery,
    x="Predicted_Age_By_Delivery_Linear",
    y="Age_In_Months_From_Delivery_DBS",
)

In [None]:
# Actual vs. predicted delivery age on the test rows (nearest interpolation).
sns.regplot(
    data=Test_Delivery,
    x="Predicted_Age_By_Delivery_Nearest",
    y="Age_In_Months_From_Delivery_DBS",
)

In [None]:
# Actual vs. predicted delivery age on the test rows (cubic interpolation).
sns.regplot(
    data=Test_Delivery,
    x="Predicted_Age_By_Delivery_Cubic",
    y="Age_In_Months_From_Delivery_DBS",
)

# Testing

In [None]:
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy import interpolate
from scipy.interpolate import *

# Reload the export so this section can run on its own, then collect the
# distinct SN1 values in first-seen order.
df = pd.read_excel(
    io='..\\data\\Equipment_Age_SQL_Export.xlsx',
    index_col='EquipmentCode',
)
Prefix = list(dict.fromkeys(df['SN1']))

In [None]:
# For the testing step, machines without an age are not needed because they
# cannot be scored.

# One frame per age definition: drop the descriptive columns and the other age.
data_DBS = df.drop(
    columns=['SerialNumber', 'Territory', 'Model', 'SISModel', 'Age_In_Months_From_Delivery_DBS']
)
data_Delivery = df.drop(
    columns=['SerialNumber', 'Territory', 'Model', 'SISModel', 'Age_In_Months_From_DBS']
)

# The actual train / test split is created inside the loop; here only rows with
# any missing value are removed.
data_DBS_Train = data_DBS.dropna()
data_Delivery_Train = data_Delivery.dropna()

print('data_DBS_Train.shape', data_DBS_Train.shape)
print('data_Delivery_Train', data_Delivery_Train.shape)

### DBS

In [None]:
# Evaluate linear interpolation per serial-number prefix: for every prefix with
# more than two rows of known DBS age, hold out 20% of the rows, fit age against
# SN2 on the rest and predict the held-out rows.
DBS_Test_Frames = []

for serialnumberbeinglooped in Prefix:
    Current_Train = data_DBS_Train.drop(data_DBS_Train.index[data_DBS_Train.SN1 != serialnumberbeinglooped])
    # Too few rows to split and interpolate. Skip the prefix so that no test set
    # left over from an earlier prefix is collected (and so the first prefix
    # cannot reference an undefined DBS_Test).
    if len(Current_Train) <= 2:
        continue

    DBS_Train, DBS_Test = train_test_split(Current_Train, test_size=0.2, random_state=4)

    X = DBS_Train['SN2'].astype(int)
    Y = DBS_Train['Age_In_Months_From_DBS'].astype(int)
    # bounds_error=0 with the default fill_value: SN2 outside the training range
    # yields NaN instead of raising.
    model = interp1d(X, Y, kind='linear', bounds_error=0)

    # Evaluate the whole held-out column at once; assign() returns a new frame
    # instead of writing into the slice returned by train_test_split.
    DBS_Test = DBS_Test.assign(Predicted_Age_By_DBS=model(DBS_Test['SN2']))
    DBS_Test_Frames.append(DBS_Test)

# DataFrame.append was removed in pandas 2.0: concatenate once and de-duplicate
# once, rather than on every iteration.
if DBS_Test_Frames:
    DBS_Results = pd.concat(DBS_Test_Frames).drop_duplicates()
else:
    DBS_Results = pd.DataFrame()

In [None]:
# Summary statistics of the held-out DBS rows and their predictions.
DBS_Results.describe()

In [None]:
# Score the DBS predictions on the rows that have no missing value
# (predictions outside the interpolation range are NaN and are excluded).
error_check = DBS_Results.dropna()
X = error_check[['Predicted_Age_By_DBS']]
Y = error_check[['Age_In_Months_From_DBS']]

model_mse = mean_squared_error(y_true=Y, y_pred=X)
model_mae = mean_absolute_error(y_true=Y, y_pred=X)
model_rmse = math.sqrt(model_mse)

for template, score in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(score))

In [None]:
# Check prediction plot "DBS / linear": actual vs. predicted age on the scored rows.
# x and y are passed by keyword with column names; seaborn 0.12+ no longer
# accepts them positionally.
sns.regplot(data=error_check, x='Predicted_Age_By_DBS', y='Age_In_Months_From_DBS')

In [None]:
# Least-squares fit with the predicted age (X) as the dependent variable and the
# actual age (Y) as the only regressor; no constant term is added.
ols = sm.OLS(endog=X, exog=Y)
est = ols.fit()
print(est.summary())

### Delivery

In [None]:
# Evaluate linear interpolation per serial-number prefix: for every prefix with
# more than two rows of known delivery age, hold out 20% of the rows, fit age
# against SN2 on the rest and predict the held-out rows.
Delivery_Test_Frames = []

for serialnumberbeinglooped in Prefix:
    Current_Train = data_Delivery_Train.drop(data_Delivery_Train.index[data_Delivery_Train.SN1 != serialnumberbeinglooped])
    # Too few rows to split and interpolate. Skip the prefix so that no test set
    # left over from an earlier prefix is collected (and so the first prefix
    # cannot reference an undefined Delivery_Test).
    if len(Current_Train) <= 2:
        continue

    Delivery_Train, Delivery_Test = train_test_split(Current_Train, test_size=0.2, random_state=4)

    X = Delivery_Train['SN2'].astype(int)
    Y = Delivery_Train['Age_In_Months_From_Delivery_DBS'].astype(int)
    # bounds_error=0 with the default fill_value: SN2 outside the training range
    # yields NaN instead of raising.
    model = interp1d(X, Y, kind='linear', bounds_error=0)

    # Evaluate the whole held-out column at once; assign() returns a new frame
    # instead of writing into the slice returned by train_test_split.
    Delivery_Test = Delivery_Test.assign(Predicted_Age_By_Delivery=model(Delivery_Test['SN2']))
    Delivery_Test_Frames.append(Delivery_Test)

# DataFrame.append was removed in pandas 2.0: concatenate once and de-duplicate
# once, rather than on every iteration.
if Delivery_Test_Frames:
    Delivery_Results = pd.concat(Delivery_Test_Frames).drop_duplicates()
else:
    Delivery_Results = pd.DataFrame()

In [None]:
# Summary statistics of the held-out delivery rows and their predictions.
Delivery_Results.describe()

In [None]:
# Score the delivery predictions on the rows that have no missing value
# (predictions outside the interpolation range are NaN and are excluded).
error_check = Delivery_Results.dropna()
X = error_check[['Predicted_Age_By_Delivery']]
Y = error_check[['Age_In_Months_From_Delivery_DBS']]

model_mse = mean_squared_error(y_true=Y, y_pred=X)
model_mae = mean_absolute_error(y_true=Y, y_pred=X)
model_rmse = math.sqrt(model_mse)

for template, score in (
    ("MSE {:3}", model_mse),
    ("MAE {:3}", model_mae),
    ("RMSE {:3}", model_rmse),
):
    print(template.format(score))

In [None]:
# Check prediction plot "Delivery / linear": actual vs. predicted age on the scored rows.
# x and y are passed by keyword with column names; seaborn 0.12+ no longer
# accepts them positionally.
sns.regplot(data=error_check, x='Predicted_Age_By_Delivery', y='Age_In_Months_From_Delivery_DBS')

In [None]:
# Least-squares fit with the predicted age (X) as the dependent variable and the
# actual age (Y) as the only regressor; no constant term is added.
ols = sm.OLS(endog=X, exog=Y)
est = ols.fit()
print(est.summary())

# Loop

In [1]:
import pandas as pd
import seaborn as sns
import math
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from scipy import interpolate
from scipy.interpolate import *

In [11]:
# Reload the export for the batch run, collect the distinct SN1 values in
# first-seen order, and show the available columns.
df = pd.read_excel(
    io='..\\data\\Equipment_Age_SQL_Export.xlsx',
    index_col='EquipmentCode',
)
Prefix = list(dict.fromkeys(df['SN1']))
df.columns

Index(['SN1', 'SN2', 'Age_In_Months_From_DBS',
       'Age_In_Months_From_Delivery_DBS'],
      dtype='object')

In [9]:
# Separate the data by DBS and Delivery age. The two ages have different null
# fields, so the row counts after dropna may differ.
data_DBS = df.drop(['Age_In_Months_From_Delivery_DBS'],axis=1)
data_Delivery = df.drop(['Age_In_Months_From_DBS'],axis=1)
print(data_DBS.shape)
print(data_Delivery.shape)
# Training set should not contain nulls
DBS_Train = data_DBS.dropna()
Delivery_Train = data_Delivery.dropna()
print(DBS_Train.shape)
print(Delivery_Train.shape)
# Rows to estimate: those whose age is missing. Each frame is filtered with its
# own mask so that Delivery_Predict carries the delivery columns.
DBS_Predict = data_DBS[pd.isnull(data_DBS['Age_In_Months_From_DBS'])]
Delivery_Predict = data_Delivery[pd.isnull(data_Delivery['Age_In_Months_From_Delivery_DBS'])]
print(DBS_Predict.shape)
print(Delivery_Predict.shape)
# Starting frame that the per-prefix results are appended to.
# NOTE(review): this is the full input frame, not an empty one, so the original
# rows stay in Results alongside the appended rows - confirm this is intended.
Results = df
print(Results.shape)

(93289, 3)
(93289, 3)
(52129, 3)
(52133, 3)
(40637, 3)
(40633, 3)
(93289, 4)


In [13]:
# First pass over every serial-number prefix: fit a linear interpolation of age
# against SN2 on the rows with a known age, use it to fill the rows whose age is
# missing, and copy the known age into the prediction column of the training
# rows. Prefixes with two or fewer known rows are passed through unchanged.
#
# The per-prefix frames are collected in a list and concatenated once at the
# end. Appending to a growing frame and de-duplicating it on every iteration is
# quadratic in the row count, and DataFrame.append was removed in pandas 2.0.
Result_Frames = [Results]

for x in Prefix:
    Current_DBS_Train = DBS_Train.drop(DBS_Train.index[DBS_Train.SN1 != x])
    Current_DBS_Train = Current_DBS_Train.dropna()
    Current_DBS_Predict = DBS_Predict.drop(DBS_Predict.index[DBS_Predict.SN1 != x])
    if len(Current_DBS_Train) > 2:
        X = Current_DBS_Train['SN2'].astype(int)
        Y = Current_DBS_Train['Age_In_Months_From_DBS'].astype(int)
        # bounds_error=0 with the default fill_value: SN2 outside the training
        # range yields NaN instead of raising.
        model_DBS = interp1d(X, Y, kind='linear', bounds_error=0)
        # interp1d evaluates a whole column at once; assign() returns a new
        # frame rather than writing into a slice of DBS_Predict / DBS_Train.
        Current_DBS_Predict = Current_DBS_Predict.assign(
            Predicted_Age_By_DBS=model_DBS(Current_DBS_Predict['SN2'])
        )
        Current_DBS_Train = Current_DBS_Train.assign(
            Predicted_Age_By_DBS=Current_DBS_Train['Age_In_Months_From_DBS']
        )

    Current_Delivery_Train = Delivery_Train.drop(Delivery_Train.index[Delivery_Train.SN1 != x])
    Current_Delivery_Train = Current_Delivery_Train.dropna()
    Current_Delivery_Predict = Delivery_Predict.drop(Delivery_Predict.index[Delivery_Predict.SN1 != x])
    if len(Current_Delivery_Train) > 2:
        X2 = Current_Delivery_Train['SN2'].astype(int)
        Y2 = Current_Delivery_Train['Age_In_Months_From_Delivery_DBS'].astype(int)
        model_Delivery = interp1d(X2, Y2, kind='linear', bounds_error=0)
        Current_Delivery_Predict = Current_Delivery_Predict.assign(
            Predicted_Age_By_Delivery=model_Delivery(Current_Delivery_Predict['SN2'])
        )
        Current_Delivery_Train = Current_Delivery_Train.assign(
            Predicted_Age_By_Delivery=Current_Delivery_Train['Age_In_Months_From_Delivery_DBS']
        )

    # Same order the frames were appended in before.
    Result_Frames.extend([
        Current_DBS_Predict,
        Current_Delivery_Predict,
        Current_DBS_Train,
        Current_Delivery_Train,
    ])

Results = pd.concat(Result_Frames).drop_duplicates()
print('Done')

KeyboardInterrupt: 

In [60]:
# Summary statistics of the combined results frame.
Results.describe()

Unnamed: 0,Age_In_Months_From_DBS,Predicted_Age_By_DBS_Nearest,Predicted_Age_By_Delivery_Nearest,SN2,Territory
count,0.0,34350.0,34356.0,74405.0,74993.0
mean,,209.664203,209.043744,2807.465802,0.882842
std,,155.084358,155.41362,5440.654581,0.32161
min,,0.0,0.0,0.0,0.0
25%,,96.0,91.078416,559.0,1.0
50%,,168.0,168.0,1249.0,1.0
75%,,300.0,300.0,2864.0,1.0
max,,912.369657,912.369657,99999.0,1.0


In [None]:
# Count the equipment that still has no estimate after the batch loop.
# Results can hold several rows per EquipmentCode (the original row plus the
# appended ones), so the count is taken over distinct index values: all
# equipment in df minus the equipment with at least one non-null prediction.
check_for_null_dbs_prediction = Results[Results['Predicted_Age_By_DBS'].notna()]
print('Could not estimate DBS Age:',df.index.nunique() - check_for_null_dbs_prediction.index.nunique())

check_for_null_delivery_prediction = Results[Results['Predicted_Age_By_Delivery'].notna()]
print('Could not estimate Delivery Age:',df.index.nunique() - check_for_null_delivery_prediction.index.nunique())

In [None]:
# Second pass setup: separate the first-pass results by DBS and Delivery
# prediction. They have different null fields, so the lengths may differ.
# Both frames are taken from Results, which holds the prediction columns
# (df has neither of them, so dropping them from df raises KeyError).
data_DBS = Results.drop(['Predicted_Age_By_Delivery','Age_In_Months_From_DBS','Age_In_Months_From_Delivery_DBS'],axis=1)
data_Delivery = Results.drop(['Predicted_Age_By_DBS','Age_In_Months_From_DBS','Age_In_Months_From_Delivery_DBS'],axis=1)
print(data_DBS.shape)
print(data_Delivery.shape)
# Training set should not contain nulls
DBS_Train = data_DBS.dropna()
Delivery_Train = data_Delivery.dropna()
print(DBS_Train.shape)
print(Delivery_Train.shape)
# Rows still to estimate: those the first pass left without a prediction. Each
# frame is filtered with its own mask.
DBS_Predict = data_DBS[pd.isnull(data_DBS['Predicted_Age_By_DBS'])]
Delivery_Predict = data_Delivery[pd.isnull(data_Delivery['Predicted_Age_By_Delivery'])]
print(DBS_Predict.shape)
print(Delivery_Predict.shape)
# Starting frame that the second-pass results are appended to.
# NOTE(review): resetting Results to df discards the first-pass prediction
# columns from Results - confirm this is intended.
Results = df
print(Results.shape)

In [None]:
# Second pass over every serial-number prefix: fill the predictions that are
# still missing using nearest-neighbour interpolation with extrapolation,
# trained on the rows that already have a predicted age.
#
# The per-prefix frames are collected in a list and concatenated once at the
# end; DataFrame.append was removed in pandas 2.0.
Second_Pass_Frames = [Results]

for x in Prefix:
    Current_DBS_Train = DBS_Train.drop(DBS_Train.index[DBS_Train.SN1 != x])
    Current_DBS_Train = Current_DBS_Train.dropna()
    Current_DBS_Predict = DBS_Predict.drop(DBS_Predict.index[DBS_Predict.SN1 != x])
    if len(Current_DBS_Train) > 2:
        X = Current_DBS_Train['SN2'].astype(int)
        Y = Current_DBS_Train['Predicted_Age_By_DBS'].astype(int)
        # The interp1d keyword is fill_value; 'extrapolate' makes SN2 outside the
        # training range take the nearest known age instead of NaN.
        model_DBS = interp1d(X, Y, kind='nearest', bounds_error=0, fill_value='extrapolate')
        Current_DBS_Predict = Current_DBS_Predict.assign(
            Predicted_Age_By_DBS=model_DBS(Current_DBS_Predict['SN2'])
        )

    Current_Delivery_Train = Delivery_Train.drop(Delivery_Train.index[Delivery_Train.SN1 != x])
    Current_Delivery_Train = Current_Delivery_Train.dropna()
    Current_Delivery_Predict = Delivery_Predict.drop(Delivery_Predict.index[Delivery_Predict.SN1 != x])
    if len(Current_Delivery_Train) > 2:
        X2 = Current_Delivery_Train['SN2'].astype(int)
        # Mirrors the DBS branch: the training target is the first-pass
        # prediction column, the only age column left in the delivery frames.
        Y2 = Current_Delivery_Train['Predicted_Age_By_Delivery'].astype(int)
        model_Delivery = interp1d(X2, Y2, kind='nearest', bounds_error=0, fill_value='extrapolate')
        # Apply the model to the rows that need an estimate, as the DBS branch does.
        Current_Delivery_Predict = Current_Delivery_Predict.assign(
            Predicted_Age_By_Delivery=model_Delivery(Current_Delivery_Predict['SN2'])
        )

    Second_Pass_Frames.extend([Current_DBS_Predict, Current_Delivery_Predict])

Results = pd.concat(Second_Pass_Frames).drop_duplicates()
print('Done')

# Export

In [None]:
# Start the export from DBS_Results and show its dimensions.
# NOTE(review): DBS_Results is built in the Testing section from held-out rows,
# while the following cells filter on Predicted_Age_By_DBS_Nearest and
# Territory - confirm this is the intended source frame.
Bruce_Export = DBS_Results
Bruce_Export.shape

In [None]:
# Remove equipment older than 120 months, by actual age and then by predicted age.
Bruce_Export = Bruce_Export.drop(
    index=Bruce_Export.index[Bruce_Export['Age_In_Months_From_DBS'].gt(120)]
)
Bruce_Export = Bruce_Export.drop(
    index=Bruce_Export.index[Bruce_Export['Predicted_Age_By_DBS_Nearest'].gt(120)]
)
Bruce_Export.shape

In [None]:
# Remove rows whose Territory is not 1.
Bruce_Export = Bruce_Export.drop(
    index=Bruce_Export.index[Bruce_Export['Territory'].ne(1)]
)
Bruce_Export.shape

In [None]:
# Keep only rows that have a predicted age.
Bruce_Export = Bruce_Export.loc[Bruce_Export['Predicted_Age_By_DBS_Nearest'].notna()]

Bruce_Export.shape

In [None]:
# List the columns currently in the export frame.
Bruce_Export.columns

In [None]:
# Drop the helper columns that are not part of the export.
Bruce_Export = Bruce_Export.drop(columns=['SN1', 'SN2', 'Territory'])
Bruce_Export.columns

In [None]:
# Write the filtered export to CSV.
Bruce_Export.to_csv(
    path_or_buf='..\\data\\Output\\ECAN_Equipment_Under_10_Years.csv',
)