# **Introduction to ML Pipelines**

---



## **Data Ingestion**

In [1]:
def get_data(url, drop=[]):
  import pandas as pd
    
  df = pd.read_csv(url)
  if len(drop) > 0:
    for col in drop:
      # make sure the column exists. otherwise, skip it
      try:
        df.drop(columns=col, inplace=True)
      except:
        pass
  
  return df
  
# Call the funtion here to test it; however, we will use it again later
import pandas as pd
pd.set_option('display.max_columns', 10)
df = get_data('https://www.ishelp.info/data/housing_full.csv', drop=['Id'])
df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,...,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,...,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,...,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,...,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,...,12,2008,WD,Normal,250000


## **Data Cleaning**

## Binning Categorial Features

In [2]:
def bin_groups(df, percent=.05):
  import pandas as pd

  for col in df:
    if not pd.api.types.is_numeric_dtype(df[col]):
      for group, count in df[col].value_counts().iteritems():
        if count / len(df) < percent:
          df.loc[df[col] == group, col] = 'Other'
  return df

# Call the function to update the DataFrame. Pass in the Dataframe and the cutoff percent
df = bin_groups(df, 0.05)

# Check the value_counts() to see if it worked
for col in df:
  if not pd.api.types.is_numeric_dtype(df[col]):
    print(df[col].value_counts())
    print()

RL       1151
RM        218
Other      91
Name: MSZoning, dtype: int64

Pave     1454
Other       6
Name: Street, dtype: int64

Other    91
Name: Alley, dtype: int64

Reg      925
IR1      484
Other     51
Name: LotShape, dtype: int64

Lvl      1311
Other     149
Name: LandContour, dtype: int64

AllPub    1459
Other        1
Name: Utilities, dtype: int64

Inside     1052
Corner      263
CulDSac      94
Other        51
Name: LotConfig, dtype: int64

Gtl      1382
Other      78
Name: LandSlope, dtype: int64

Other      483
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
Name: Neighborhood, dtype: int64

Norm     1260
Other     119
Feedr      81
Name: Condition1, dtype: int64

Norm     1445
Other      15
Name: Condition2, dtype: int64

1Fam      1220
Other      126
TwnhsE     114
Name: BldgType, dtype: int64

1Story    726
2Story    445
1.5Fin    154
Other     135
Name: HouseStyle, dtype: int64

Gable  

## Missing Values

In [3]:
print("Feature\t", "Percent Missing\n")

for col in df:
  if df[col].isnull().sum() / len(df) > .5:
    print(f'{col}\t{round(df[col].isnull().sum() / len(df), 4) * 100}%')

Feature	 Percent Missing

Alley	93.77%
PoolQC	99.52%
Fence	80.75%
MiscFeature	96.3%


In [4]:
pd.set_option('display.max_rows', 500)

def drop_columns_missing_data(df, cutoff=.5):
  import pandas as pd
  for col in df:
    if df[col].isna().sum() / len(df) > cutoff:
      df.drop(columns=[col], inplace=True)
  return df

df = drop_columns_missing_data(df)

for col in df:
  if df[col].isna().sum() / len(df) > 0.001:
    print(col, '\t', round(df[col].isnull().sum() / len(df), 3) * 100)

LotFrontage 	 17.7
MasVnrType 	 0.5
MasVnrArea 	 0.5
BsmtQual 	 2.5
BsmtCond 	 2.5
BsmtExposure 	 2.6
BsmtFinType1 	 2.5
BsmtFinType2 	 2.6
FireplaceQu 	 47.3
GarageType 	 5.5
GarageYrBlt 	 5.5
GarageFinish 	 5.5
GarageQual 	 5.5
GarageCond 	 5.5


### Missing Value Imputation: Univariate

Replace with Mean, Median, or Mode: SimpleImputer

In [5]:
def impute_mean(df):
  from sklearn.impute import SimpleImputer
  import pandas as pd, numpy as np

  # Dummy code first; categorical features not allowed
  for col in df:
    if not pd.api.types.is_numeric_dtype(df[col]):
      df = pd.get_dummies(df, columns=[col], drop_first=True)

  # Change the strategy to mean, median, or mode
  imp = SimpleImputer(missing_values=np.nan, strategy='mean')
  df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

  return df

pd.set_option('display.max_columns', 6)
df_si = impute_mean(df)
df_si.head(8)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,...,SaleCondition_Normal,SaleCondition_Other,SaleCondition_Partial
0,60.0,65.0,8450.0,...,1.0,0.0,0.0
1,20.0,80.0,9600.0,...,1.0,0.0,0.0
2,60.0,68.0,11250.0,...,1.0,0.0,0.0
3,70.0,60.0,9550.0,...,0.0,0.0,0.0
4,60.0,84.0,14260.0,...,1.0,0.0,0.0
5,50.0,85.0,14115.0,...,1.0,0.0,0.0
6,20.0,75.0,10084.0,...,1.0,0.0,0.0
7,60.0,70.049958,10382.0,...,1.0,0.0,0.0


### Missing Value Imputation: Multivariate

KNN Clustering-Based Imputation: KNNImputer

In [6]:
def impute_KNN(df):
  from sklearn.impute import KNNImputer
  from sklearn.preprocessing import MinMaxScaler
  import pandas as pd

  # Dummy code first; categorical features not allowed
  for col in df:
    if not pd.api.types.is_numeric_dtype(df[col]):
      df = pd.get_dummies(df, columns=[col], drop_first=True)

  # Clustering is biased by unstandardized data; so MinMax scale it
  df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns = df.columns)

  imp = KNNImputer(n_neighbors=5, weights="uniform")
  df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

  return df

df_knn = impute_KNN(df)
df_knn.head(8)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,...,SaleCondition_Normal,SaleCondition_Other,SaleCondition_Partial
0,0.235294,0.150685,0.03342,...,1.0,0.0,0.0
1,0.0,0.202055,0.038795,...,1.0,0.0,0.0
2,0.235294,0.160959,0.046507,...,1.0,0.0,0.0
3,0.294118,0.133562,0.038561,...,0.0,0.0,0.0
4,0.235294,0.215753,0.060576,...,1.0,0.0,0.0
5,0.176471,0.219178,0.059899,...,1.0,0.0,0.0
6,0.0,0.184932,0.041057,...,1.0,0.0,0.0
7,0.235294,0.228767,0.04245,...,1.0,0.0,0.0


Regression-Based Imputation: IterativeImputer

In [7]:
def impute_reg(df):
  from sklearn.experimental import enable_iterative_imputer
  from sklearn.impute import IterativeImputer
  import pandas as pd

  # Dummy code first; categorical features not allowed
  for col in df:
    if not pd.api.types.is_numeric_dtype(df[col]):
      df = pd.get_dummies(df, columns=[col], drop_first=True)

  # Scaling is unnecessary for regression-based imputation

  imp = IterativeImputer(max_iter=10, random_state=12345)
  df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)

  return df

df_reg = impute_reg(df)
df_reg.head(8)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,...,SaleCondition_Normal,SaleCondition_Other,SaleCondition_Partial
0,60.0,65.0,8450.0,...,1.0,0.0,0.0
1,20.0,80.0,9600.0,...,1.0,0.0,0.0
2,60.0,68.0,11250.0,...,1.0,0.0,0.0
3,70.0,60.0,9550.0,...,0.0,0.0,0.0
4,60.0,84.0,14260.0,...,1.0,0.0,0.0
5,50.0,85.0,14115.0,...,1.0,0.0,0.0
6,20.0,75.0,10084.0,...,1.0,0.0,0.0
7,60.0,90.998208,10382.0,...,1.0,0.0,0.0


### Compare Impute Methods

In [8]:
# Uncomment this block to compare the speed of the imputation method
# Mean is the fastest but is artificially inflated

# %timeit impute_mean(df)
# %timeit impute_KNN(df)
# %timeit impute_reg(df)

## **Model Fitting**

In [9]:
def fit_mlr(df, test_size=.2, random_state=12345, label=''):
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import train_test_split
  import pandas as pd
  
  X = df.drop(label,axis=1)
  y = df[label]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

  model = LinearRegression().fit(X_train, y_train)
  print(f'R-squared (mlr): \t{model.score(X_test, y_test)}')

  return model

#change this to change the impute method you used
df_impute_method = df_reg

model = fit_mlr(df_impute_method, .3, 12345, 'SalePrice')

R-squared (mlr): 	0.8176633275142347


In [10]:
def fit_crossvalidate_mlr(df, k, label, repeat=True):
  from sklearn.linear_model import LinearRegression
  from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
  import pandas as pd
  from numpy import mean, std
  X = df.drop(label,axis=1)
  y = df[label]

  # prepare the cross-validation procedure
  if repeat:
    cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=12345)
  else:
    cv = KFold(n_splits=k, random_state=12345, shuffle=True)
  

  # evaluate model
  scores = cross_val_score(LinearRegression(), X, y, scoring='r2', cv=cv, n_jobs=-1)

  # report performance
  print(f'Average R-squared:\t{mean(scores)}')

  return LinearRegression().fit(X, y)

model = fit_crossvalidate_mlr(df_si, 10, 'SalePrice', True)

Average R-squared:	0.7975108601968875


## **Model Deployment**

In [11]:
# Save models using two different packages to compare speed

def dump_pickle(model, file_name):
  import pickle
  pickle.dump(model, open(file_name, "wb"))

def dump_joblib(model, file_name):
  import joblib
  joblib.dump(model, file_name)

In [12]:
# Load models using two different packages to compare speed

def load_pickle(file_name):
  import pickle
  model = pickle.load(open(file_name, "rb"))
  return model

def load_joblib(file_name):
  import joblib
  model = joblib.load(file_name)
  return model

In [13]:
# Compare the packages for speed

%timeit dump_pickle(model, "model_pickle.sav")
%timeit dump_joblib(model, "model_joblib.sav")
print()
%timeit model = load_pickle("model_pickle.sav")
%timeit model = load_joblib("model_joblib.sav")

619 µs ± 40.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.84 ms ± 307 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)

310 µs ± 55.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
1.02 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## **Create Pipelines** (these are the full pipelines for each method; only use one at a time)

KNN imputed pipeline

In [14]:
# Ingest data
df = get_data('https://www.ishelp.info/data/housing_full.csv', drop=['Id'])

# Clean data
df = bin_groups(df, percent=0.05)
df = drop_columns_missing_data(df, .5)
df = impute_KNN(df)

# Model data
model_mlr = fit_crossvalidate_mlr(df, 10, "SalePrice", True)

# Deploy model
dump_pickle(model_mlr, "model_pickle.sav")

Average R-squared:	-6.30012315768053e+18


Mean replacement pipeline

In [15]:
df = get_data('https://www.ishelp.info/data/housing_full.csv', drop=['Id'])
df = bin_groups(df, percent=0.05)
df = drop_columns_missing_data(df, .5)
df = impute_mean(df)
model_mlr = fit_crossvalidate_mlr(df, 10, "SalePrice", True)
dump_pickle(model_mlr, "model_pickle.sav")

Average R-squared:	0.7975108601968875


Regression replacement pipeline

In [16]:
df = get_data('https://www.ishelp.info/data/housing_full.csv', drop=['Id'])
df = bin_groups(df, percent=0.05)
df = drop_columns_missing_data(df, .5)
df = impute_reg(df)
model_mlr = fit_crossvalidate_mlr(df, 10, "SalePrice", True)
dump_pickle(model_mlr, "model_pickle.sav")

Average R-squared:	0.7983762915366868


## **Load Model for Later Use**

Single Prediction

In [21]:
# Load the stored model
model = load_pickle("model_pickle.sav")
model.predict(df.drop(columns=['SalePrice']))
# round(model.predict([[20,80,9600,6,8,1976,1976,0,978,0,284,1262,1262,0,0,1262,
#                     0,1,2,0,3,1,6,1,1976,2,460,298,0,0,0,0,0,0,5,2007,0,0,1,
#                     0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#                     0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,
#                     0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,
#                     1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,
#                     0,0,0,1,1,0,0,0,0,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,
#                     1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,1,
#                     0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0]])[0], 2)

Feature names unseen at fit time:
- BldgType_2fmCon
- BldgType_Duplex
- BldgType_Twnhs
- BsmtCond_Gd
- BsmtCond_Po
- ...
Feature names seen at fit time, yet now missing:
- BldgType_Other
- BsmtQual_Other
- Condition1_Other
- Condition2_Other
- Electrical_Other
- ...



ValueError: X has 236 features, but LinearRegression is expecting 127 features as input.

Batch of Predictions

In [22]:
pd.set_option('display.max_columns', 8)

# Load the stored model
model = load_pickle("model_pickle.sav")

# Get the data you want to predict and take it through the same steps as the training data
df = get_data('https://www.ishelp.info/data/housing_full.csv', drop=['Id'])
df = drop_columns_missing_data(df, .5)
df = impute_mean(df)

# Generate the list of predictions
predicted = model.predict(df.drop(columns=['SalePrice']))

# Remove the dummy codes just to simplify the results; not necessary
df.drop(df.iloc[:, 37:], inplace=True, axis=1)

# Add the predictions to the DataFrame
df['Predicted'] = predicted
df.head()

Feature names unseen at fit time:
- BldgType_2fmCon
- BldgType_Duplex
- BldgType_Twnhs
- BsmtCond_Gd
- BsmtCond_Po
- ...
Feature names seen at fit time, yet now missing:
- BldgType_Other
- BsmtQual_Other
- Condition1_Other
- Condition2_Other
- Electrical_Other
- ...



ValueError: X has 236 features, but LinearRegression is expecting 127 features as input.