# Project Overview

### Topics
- Saving and loading model
- Saving and loading data from database
- Deployment

# Saving and loading model

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from joblib import dump, load

In [2]:
# Load the training CSV, using its 'Id' column as the row index
df = pd.read_csv(filepath_or_buffer='data/train.csv', index_col='Id')
# Bare last expression -> rich display of the frame
df

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [3]:
# Read the data (re-read here so this cell gives the same result on every run)
df = pd.read_csv('data/train.csv', index_col='Id')

# Clean up without in-place mutation:
#   1. remove rows with a missing target,
#   2. reset the index to 0..n-1 (the 'Id' index is discarded by drop=True),
#   3. replace (possible) None values with np.nan.
# Use np.nan directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0, so `pd.np.nan` fails on current pandas versions.
df = (
    df.dropna(axis=0, subset=['SalePrice'])
      .reset_index(drop=True)
      .fillna(value=np.nan)
)

# Separate target from predictors
y = df.SalePrice
X = df.drop(['SalePrice'], axis=1)

# Hold out 20% of the rows as a test set; random_state makes the split repeatable
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# "Cardinality" means the number of unique values in a column.
# Select categorical (object-dtype) columns with fewer than 10 unique values
# (a convenient but arbitrary threshold).
categorical_cols = [
    str(cname)
    for cname in X_train_full.columns
    if X_train_full[cname].nunique() < 10
    and X_train_full[cname].dtype == "object"
]

# Select numeric columns
numerical_cols = [
    str(cname)
    for cname in X_train_full.columns
    if X_train_full[cname].dtype in ['int64', 'float64']
]

# Keep selected columns only; copy so later edits do not touch the full frames
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Preprocessing for numerical data: fill missing values with a constant
# (no fill_value is passed, so scikit-learn's default constant is used)
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data: impute the most frequent category, then
# one-hot encode; categories unseen during fit are ignored at predict time
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

# Define model (random_state fixed for reproducible forests)
model = RandomForestRegressor(n_estimators=100, random_state=0)

# Bundle preprocessing and modeling code in a single pipeline so the saved
# artifact contains everything needed to predict from a raw DataFrame
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model),
])

# Preprocess the training data and fit the model
clf.fit(X_train, y_train)

# Persist the fitted pipeline with joblib; the call is left as the last
# expression so the list of written file names is shown as the cell output
model_filepath = "data/regression.pkl"
dump(clf, model_filepath)

['data/regression.pkl']

In [4]:
# Restore the fitted pipeline from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
load_clf = load(filename=model_filepath)

In [5]:
# Renumber the test rows from 0 so they can be addressed by position
X_test = X_test.reset_index(drop=True)
# Show the second row as a one-row DataFrame (a slice keeps the 2-D shape)
X_test.iloc[1:2]

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Condition1,Condition2,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Artery,Norm,...,240,0,0,32,0,0,0,0,8,2006


In [6]:
# The pipeline was fitted on a DataFrame (columns are selected by name),
# so predictions must also be made from a DataFrame
prediction = load_clf.predict(X_test[1:2])
print(f'House Value Prediction: {prediction}')

House Value Prediction: [153548.75]


# Saving and loading data from database

In [7]:
from sqlalchemy import create_engine

In [8]:
# Open (or create) the SQLite database file
database_filename = "data/house.db"
engine = create_engine('sqlite:///' + database_filename)

# Load the CSV with 'Id' as the index
df = pd.read_csv('data/train.csv', index_col='Id')

# Write the frame to the "house" table, replacing any existing table.
# index=False means the 'Id' index is NOT stored in the database.
df.to_sql(name="house", con=engine, if_exists="replace", index=False)

In [9]:
# Connect to the SQLite file and read the whole "house" table into a DataFrame
database_filepath = "data/house.db"
engine = create_engine('sqlite:///' + database_filepath)
df = pd.read_sql_table(table_name='house', con=engine)

In [10]:
# Preview the first five rows read back from the database
df.head(n=5)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


# Deployment

### From Terminal / Command Prompt

First in your terminal, change directories to the project folder
```
cd 1_deployment_example
```
The next step is to install the heroku command line tools:
```
curl https://cli-assets.heroku.com/install.sh | sh
```
The above command is not Windows compatible. For Windows, follow the installation instructions at: https://devcenter.heroku.com/articles/heroku-cli#standalone-installation

Once installed, verify the installation by checking the version:
```
heroku --version
```
And then log into heroku with the following command (assuming you have an account)
```
heroku login
```
By default, this command opens a browser window to log in. To log in from the terminal instead, use
```
heroku login -i
```
Heroku asks for your account email address and password, which you type into the terminal and press enter.

Then create a Procfile, which tells Heroku what to do when starting your web app:
```
touch Procfile
```
Then open the Procfile and type:
```
web: gunicorn run:app
```
Next, create a requirements file, which lists all of the Python libraries that your app depends on:
```
pip freeze > requirements.txt
```
And initialize a git repository and make a commit:
```
git init
git add .
git commit -m "first commit"
```
Now, create a heroku app:
```
heroku create my-app-name
```
where my-app-name is a unique name that nobody else on Heroku has already used.

The `heroku create` command should create a git repository on Heroku and a web address for accessing your web app. You can check that a remote repository was added to your git repository with the following terminal command:
```
git remote -v
```

If you have already created your Heroku app, you can easily add a remote to your local repository with the heroku git:remote command. All you need is your Heroku app’s name:
```
heroku git:remote -a app-name
```
Next, you need to push your git repository to the remote heroku repository with this command:
```
git push heroku master
```
Now, you can type your web app's address in the browser to see the results.

### From GitHub
1. [Create a new repository](https://help.github.com/en/articles/creating-a-new-repository) on GitHub. To avoid errors, do not initialize the new repository with README, license, or `gitignore` files. You can add these files after your project has been pushed to GitHub.

2. Open Terminal.

3. Change the current working directory to your local project.

4. Initialize the local directory as a Git repository.
```
git init
```

5. Add the files in your new local repository. This stages them for the first commit.
```
git add .
```

6. Commit the files that you've staged in your local repository.
```
git commit -m "First commit"
```

7. At the top of your GitHub repository's Quick Setup page, click the copy button to copy the remote repository URL.
![copy](https://help.github.com/assets/images/help/repository/copy-remote-repository-url-quick-setup.png)

8. In Terminal, add the URL for the remote repository where your local repository will be pushed.
```
git remote add origin https://remote-repository-url.com
# Sets the new remote
git remote -v
# Verifies the new remote URL
```

9. Push the changes in your local repository to GitHub.
```
git push -u origin master
```

10. Navigate to https://dashboard.heroku.com/apps and create a new app.
![create](imgs/create.png)

11. After naming and creating the app, click 'Connect to GitHub' as the deployment option, connect to GitHub, search for the repository, and click 'Connect'.
![deploy](imgs/deploy.png)

12. Scroll down and click 'Enable Automatic Deploys' and also click 'Deploy Branch' at the bottom