In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedShuffleSplit




In [2]:
# Path (relative to the working directory) where the trained model is saved
# with joblib and later loaded from for inference.
MODEL_FILE = "model.pkl"
# Path where the fitted preprocessing pipeline is saved alongside the model.
PIPELINE_FILE = "pipeline.pkl"

In [4]:
def build_pipeline(num_attribs, cat_attribs):
    """Build the (unfitted) preprocessing transformer for the feature columns.

    Numerical columns are imputed with the column median and then
    standardized; categorical columns are one-hot encoded with
    ``handle_unknown="ignore"``.

    Parameters
    ----------
    num_attribs
        Column selector for the numerical branch (the caller in this
        notebook passes a list of column names).
    cat_attribs
        Column selector for the categorical branch.

    Returns
    -------
    ColumnTransformer
        Transformer with a ``"num"`` and a ``"cat"`` branch, not yet fitted.
    """
    # Steps applied, in order, to the numerical columns.
    numeric_steps = [
        ("imputer_data", SimpleImputer(strategy="median")),
        ("scale_data", StandardScaler()),
    ]

    # Single encoding step applied to the categorical columns.
    categorical_steps = [
        ("hot_encode_data", OneHotEncoder(handle_unknown="ignore")),
    ]

    return ColumnTransformer([
        ("num", Pipeline(numeric_steps), num_attribs),
        ("cat", Pipeline(categorical_steps), cat_attribs),
    ])

# Train-or-infer switch.
# If either saved artifact is missing (e.g. on the very first run), train the
# model and persist it together with its preprocessing pipeline. Both files are
# checked because inference needs both: a model without its fitted pipeline
# cannot be used.

if not (os.path.exists(MODEL_FILE) and os.path.exists(PIPELINE_FILE)):
    # Read the raw dataset.
    housing = pd.read_csv("housing.csv")

    # Add an income category; it is used only to stratify the split below.
    housing["income_cat"] = pd.cut(housing["median_income"],
                                   bins=[0.0, 1.5, 3.0, 4.5, 6.0, np.inf],
                                   labels=[1, 2, 3, 4, 5])

    # Stratified shuffle split so that every income category is represented
    # in the training set.
    mysplit = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

    # n_splits=1, so take the single split directly. split() yields positional
    # row indices, hence .iloc (not label-based .loc) to select the rows.
    train_index, test_index = next(mysplit.split(housing, housing["income_cat"]))

    # The held-out rows become the inference input file.
    housing.iloc[test_index].drop("income_cat", axis=1).to_csv("input.csv", index=False)
    housing = housing.iloc[train_index]

    housing = housing.drop(["income_cat"], axis=1)

    # Separate the training data into labels and features.
    housing_labels = housing["median_house_value"].copy()
    housing_features = housing.drop(["median_house_value"], axis=1)

    # Separate the categorical and numerical attributes.
    numerical_attributes = housing_features.drop(["ocean_proximity"], axis=1).columns.tolist()
    categorical_attributes = ["ocean_proximity"]

    # Build the complete preprocessing pipeline and fit it on the training features.
    mypipeline = build_pipeline(numerical_attributes, categorical_attributes)
    housing_prepared = mypipeline.fit_transform(housing_features)

    # Train the model on the prepared training data.
    model = RandomForestRegressor(random_state=42)
    model.fit(housing_prepared, housing_labels)

    # Store the model and the fitted pipeline using joblib.
    joblib.dump(model, MODEL_FILE)
    joblib.dump(mypipeline, PIPELINE_FILE)

    print("Pipeline and model created and saved SUCCESSFULLY !!")

# Both the model and the pipeline files already exist: run inference on the
# input data.
else:
    # NOTE: joblib.load unpickles the files; only load artifacts produced by
    # this notebook (or another trusted source).
    model = joblib.load(MODEL_FILE)
    pipeline = joblib.load(PIPELINE_FILE)

    input_data = pd.read_csv("input.csv")
    transformed_input = pipeline.transform(input_data)
    predictions = model.predict(transformed_input)
    # NOTE: input.csv as written by the training branch still contains the true
    # median_house_value column; the assignment below replaces it with the
    # model's predictions in the output file.
    input_data["median_house_value"] = predictions

    input_data.to_csv("housemodel_output_file.csv", index=False)
    print("Inference is completed, Please see the saved result in housemodel_output_file.csv file ")

Inference is completed, Please see the saved result in housemodel_output_file.csv file 
