In [1]:
import yaml, logging, boto3, os, shutil
import pandas as pd
import numpy as np

In [2]:
KEY_FILE_NAME = 'account-info.yaml'
BUCKET_NAME = 'picpay-cleaned-bucket'
LOCAL_DATA_PATH = "data"

In [3]:
logging.basicConfig(level=logging.INFO)

## Load AWS keys

In [4]:
try:
    with open(KEY_FILE_NAME) as f:
        credentials = yaml.load(f, Loader=yaml.FullLoader)
except FileNotFoundError as e:
    logging.error("AWS Keys file not found. Please define a YAML file called '{file_name}' containing 'access_key', 'secret_key' and 'region' keys.".format(file_name=KEY_FILE_NAME))

## Helper Functions Definition

### Data Aquisition

> This function was created for helping aquiring data and avoid un-necessary costs running the pipeline. Although, a version that donwloads data from S3 is still available on `get_data_from_s3()` function

In [5]:
def get_data_from_api(pages=10, items_per_page=80):
    import requests as r
    extract_cols = lambda data, cols: {key: data[key] for key in cols}
    cols = ['id','name','abv','ibu','target_fg','target_og','ebc','srm','ph']
    cleaned = []
    for x in range(1,pages):
        response = r.get("https://api.punkapi.com/v2/beers?page={page}&per_page={items}".format(page=x, items=items_per_page))
        records = response.json()
        cleaned = [*cleaned, *[extract_cols(x, cols) for x in records]]
    return pd.DataFrame(cleaned, columns=cols)

In [6]:
def get_data_from_s3(bucket, temp_dir, credentials):
    client = boto3.resource(
        service_name = 's3',
        region_name = credentials["region"],
        aws_access_key_id = credentials["access_key"],
        aws_secret_access_key = credentials["secret_key"]
    )
    
    os.makedirs(temp_dir)
    
    files = []
    target_bucket = client.Bucket(bucket)
    
    for o in target_bucket.objects.all():
        path, filename = os.path.split(o.key)
        target_bucket.download_file(o.key, os.path.join(temp_dir,filename))
        files.append(os.path.join(temp_dir,filename))

    data = []
    for file in files:
        with open(file, "r") as f:
            data = [*data, *f.read().split("\n")[1:-1]]

    shutil.rmtree(temp_dir)
    return pd.DataFrame([x.split(",") for x in data], columns=["id","name","abv","ibu","target_fg","target_og","ebc","srm","ph"])

### Model Training

In [7]:
def model_train(estimator, param_grid, cv, train_df, y, test_size=0.3, random_state=42):
    from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
    from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
    
    logging.info("Estimator: {}".format(estimator))
    logging.info("test_size: {}".format(test_size))
    logging.info("Number of folds: {}".format(cv))
    logging.info("Hyper parameter: {}".format(param_grid))
    logging.info("random_state: {}".format(random_state))
    logging.info("Dataset shape: {}".format(train_df.shape))
    
    x_train, x_test, y_train, y_test = train_test_split(train_df, y, test_size=0.3, random_state=42)
    
    grid_cv = GridSearchCV(estimator=estimator, param_grid=param_grid, cv= 5)
    grid_cv.fit(x_train, y_train)
    
    logging.info("Finish training model")
    logging.info("Predicting data")
    
    y_pred = grid_cv.predict(x_test)
    
    logging.info("Computing metrics")
    
    metrics = dict()
    
    metrics['r2'] = r2_score(y_test, y_pred)
    metrics['mae'] = mean_absolute_error(y_test, y_pred)
    metrics['mse'] = mean_squared_error(y_test, y_pred)
    
    logging.info("Output information:")
    logging.info("- Best params: {}".format(grid_cv.best_params_))
    logging.info("- Metrics: {}".format(metrics))
    
    return (grid_cv.best_estimator_, grid_cv.best_params_, metrics)

## Aquiring Data

**S3 file downloading**

In [8]:
# df = get_data_from_s3(BUCKET_NAME, LOCAL_DATA_PATH, credentials)

**API call**

In [9]:
df = get_data_from_api(pages=5, items_per_page=80)

## Data Cleaning and Preparing

In [10]:
df['id'] = pd.to_numeric(df["id"])
df['abv'] = pd.to_numeric(df["abv"])
df['target_fg'] = pd.to_numeric(df["target_fg"])
df['target_og'] = pd.to_numeric(df["target_og"])
df['ebc'] = pd.to_numeric(df["ebc"])
df['srm'] = pd.to_numeric(df["srm"])
df['ph'] = pd.to_numeric(df["ph"])
df = df.dropna()

## Model Selection and Training

In [11]:
x_vars = ["abv","target_fg","target_og","ebc","srm","ph"]
y_var = ["ibu"]

### 1. DecisionTreeRegressor

In [12]:
from sklearn.tree import DecisionTreeRegressor

In [13]:
param_grid = { 
    'criterion': ["mse", "mae"],
    'splitter': ['best', 'random'],
    'max_depth' : [4,6,9]
}

In [14]:
dec_model, dec_params, dec_metrics = model_train(
    estimator = DecisionTreeRegressor(random_state=42),
    param_grid = param_grid,
    cv = 5,
    train_df = df[x_vars],
    y = df[y_var])

INFO:root:Estimator: DecisionTreeRegressor(random_state=42)
INFO:root:test_size: 0.3
INFO:root:Number of folds: 5
INFO:root:Hyper parameter: {'criterion': ['mse', 'mae'], 'splitter': ['best', 'random'], 'max_depth': [4, 6, 9]}
INFO:root:random_state: 42
INFO:root:Dataset shape: (310, 6)
INFO:root:Finish training model
INFO:root:Predicting data
INFO:root:Computing metrics
INFO:root:Output information:
INFO:root:- Best params: {'criterion': 'mae', 'max_depth': 4, 'splitter': 'random'}
INFO:root:- Metrics: {'r2': 0.15200451869250942, 'mae': 16.150537634408604, 'mse': 722.7956989247311}


### 2. LinearRegression

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
param_grid = {}

In [17]:
lin_model, lin_params, lin_metrics = model_train(
    estimator = LinearRegression(),
    param_grid = param_grid,
    cv = 5,
    train_df = df[x_vars],
    y = df[y_var])

INFO:root:Estimator: LinearRegression()
INFO:root:test_size: 0.3
INFO:root:Number of folds: 5
INFO:root:Hyper parameter: {}
INFO:root:random_state: 42
INFO:root:Dataset shape: (310, 6)
INFO:root:Finish training model
INFO:root:Predicting data
INFO:root:Computing metrics
INFO:root:Output information:
INFO:root:- Best params: {}
INFO:root:- Metrics: {'r2': -2.388200983661166, 'mae': 29.73237279124114, 'mse': 2887.960080054735}


### 3. RandomForestRegressor

In [18]:
from sklearn.ensemble import RandomForestRegressor

In [19]:
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,6,9],
    'criterion': ["mse", "mae"]
}

In [20]:
raf_model, raf_params, raf_metrics = model_train(
    estimator = RandomForestRegressor(),
    param_grid = param_grid,
    cv = 5,
    train_df = df[x_vars],
    y = df[y_var].values.ravel())

INFO:root:Estimator: RandomForestRegressor()
INFO:root:test_size: 0.3
INFO:root:Number of folds: 5
INFO:root:Hyper parameter: {'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [4, 6, 9], 'criterion': ['mse', 'mae']}
INFO:root:random_state: 42
INFO:root:Dataset shape: (310, 6)
INFO:root:Finish training model
INFO:root:Predicting data
INFO:root:Computing metrics
INFO:root:Output information:
INFO:root:- Best params: {'criterion': 'mse', 'max_depth': 9, 'max_features': 'auto', 'n_estimators': 500}
INFO:root:- Metrics: {'r2': 0.4901436929171501, 'mae': 15.395093542943979, 'mse': 434.58008203171255}


**Comparing metrics**

In [21]:
logging.info("Decision Tree: {metrics}".format(metrics=dec_metrics))
logging.info("Linear Regression: {metrics}".format(metrics=lin_metrics))
logging.info("Random Forest: {metrics}".format(metrics=raf_metrics))

INFO:root:Decision Tree: {'r2': 0.15200451869250942, 'mae': 16.150537634408604, 'mse': 722.7956989247311}
INFO:root:Linear Regression: {'r2': -2.388200983661166, 'mae': 29.73237279124114, 'mse': 2887.960080054735}
INFO:root:Random Forest: {'r2': 0.4901436929171501, 'mae': 15.395093542943979, 'mse': 434.58008203171255}


## Conclusion

Three techiniques were chosen for this study: `RandomForestRegressor`, `LinearRegression` and `DecisionTreeRegressor`. But the results showed that `RandomForestRegressor` performed better, so this model would be selected for deploying.