# Example usage

## Setup model class

In [100]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from mlmodels import (
    BaseModel,
    DataFrameModelMixin,
    infer_from_fit,
    ModelMethodColumnInfo,
)


# Create data frame model class where the feature and target schema are inferred when the model is fitted.
@infer_from_fit(
    feature_df_schema=True,
    target_df_schema=True,
    methods_with_features_as_input=['predict'],
    validate_input_output_method_list=['predict']
)
class RandomForestClassifierModel(BaseModel, DataFrameModelMixin):
    """Random forest classifier that takes and returns pandas data frames.

    The ``infer_from_fit`` decorator is configured so that the feature and
    target data frame schemas are inferred when the model is fitted, and so
    that ``predict`` is listed for input/output validation.

    Parameters
    ----------
    features : list-like of str
        Column labels selected from ``X`` in both ``fit`` and ``predict``.
    random_forest_params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    """

    def __init__(
            self,
            features,
            random_forest_params={'n_estimators': 100, 'max_depth': 30},
    ):
        super().__init__()
        self.features = features
        # Plain None until fit() runs (the original trailing comma stored the
        # tuple ``(None,)`` here instead).
        self.target_columns = None
        # Copy the mapping so the instance never aliases the shared default
        # dict (or the caller's dict); later mutation stays local.
        self.random_forest_params = dict(random_forest_params)
        self.model = RandomForestClassifier(**self.random_forest_params)

    def fit(self, X, y):
        """Fit the forest on ``X[self.features]`` and remember ``y``'s columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column listed in ``self.features``.
        y : pandas.DataFrame
            Target frame; its column labels are reused for the predictions.

        Returns
        -------
        RandomForestClassifierModel
            ``self``, so calls can be chained.
        """
        self.model.fit(X[self.features], y)
        self.target_columns = y.columns
        return self

    def predict(self, X):
        """Predict targets for ``X[self.features]``.

        Returns
        -------
        pandas.DataFrame
            Predictions labelled with the target columns seen in ``fit``.
            No index is passed, so the frame gets a default index rather
            than the index of ``X``.
        """
        predictions_array = self.model.predict(X[self.features])
        predictions_df = pd.DataFrame(
            data=predictions_array,
            columns=self.target_columns
        )
        return predictions_df

## Prepare data

In [101]:
# Seed every source of randomness so that a fresh "Restart & Run All"
# reproduces the same synthetic columns and the same train/test split.
SEED = 42
rng = np.random.RandomState(SEED)

# Read data
csv_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
data = pd.read_csv(csv_url, sep=';')

# Create some categorical features
data['group1'] = rng.choice(3, len(data))
data['group2'] = rng.choice([3, 7], len(data))
# Pin the dtype explicitly: the default integer width of the random draw
# can differ between platforms, and the schema examples rely on int64.
data['group1'] = data['group1'].astype('int64')
data['group2'] = data['group2'].astype('int64')

# Split the data into training and test sets. (0.75, 0.25) split.
train, test = train_test_split(data, random_state=SEED)

# The predicted column is "quality"; everything else is a feature.
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
# Double brackets keep the targets as single-column data frames.
train_y = train[["quality"]]
test_y = test[["quality"]]

## Initialize a model object and set up column information before fitting

In [102]:
# Build the model on every feature column of the training frame.
model = RandomForestClassifierModel(
    random_forest_params={'n_estimators': 100, 'max_depth': 15},
    features=train_x.columns,
)

# Describe how the columns of `predict` should be treated when the schema
# is inferred: which inputs/outputs are enums and which inputs are intervals.
# NOTE(review): input_interval_percent_buffer presumably widens the inferred
# interval by that percentage -- confirm against the mlmodels documentation.
predict_column_info = ModelMethodColumnInfo(
    'predict',
    input_enum_columns=['group1', 'group2'],
    output_enum_columns=['quality'],
    input_interval_columns=['chlorides', 'free sulfur dioxide'],
    input_interval_percent_buffer=30,
)
model.set_model_method_column_info(predict_column_info)

## Fit and predict

In [103]:
# Fit the model on the training split, then predict on the held-out test split.
# (No evaluation metric is computed in this cell.)
model.fit(train_x, train_y)
predicted_qualities = model.predict(test_x)



## Examples of schema validation

### Model input schema validation
If the input dataframe does not match the data frame schema, you will get an error.

In [104]:
# Example of missing features: only three of the model's feature columns
# are passed, so input validation is expected to reject the frame.
subset_columns = ["density", "chlorides", "alcohol"]
model.predict(test_x[subset_columns])

SchemaError: column 'fixed acidity' not in dataframe
      density  chlorides  alcohol
1200  0.99458      0.069      9.8
801   0.99735      0.068     10.0
887   0.99577      0.066     12.1
1189  0.99616      0.075      9.6
633   1.00100      0.105     11.3

In [105]:
# Example of wrong dtype: cast one float feature to int64 on a copy of the
# test features, leaving test_x itself untouched.
test_x_copy = test_x.copy()
test_x_copy['density'] = test_x_copy['density'].astype('int64')
model.predict(test_x_copy)

SchemaError: expected series 'density' to have type float64, got int64

In [106]:
# Example of wrong categorical value/enum: overwrite group1 with a value
# that was never drawn when the column was created.
test_x_copy = test_x.copy()
test_x_copy['group1'] = 100
model.predict(test_x_copy)

SchemaError: <Schema Column: 'group1' type=int64> failed element-wise validator 0:
<Check _isin: isin(frozenset({0, 1, 2}))>
failure cases:
                                                          index  count
failure_case                                                          
100           [1200, 801, 887, 1189, 633, 662, 1414, 547, 14...    400

In [107]:
# Example of value outside of accepted interval: set chlorides to a constant
# far above anything the interval check allows.
test_x_copy = test_x.copy()
test_x_copy['chlorides'] = 100.0
model.predict(test_x_copy)

SchemaError: <Schema Column: 'chlorides' type=float64> failed element-wise validator 0:
<Check _in_range: in_range(-0.16739999999999997, 0.7894)>
failure cases:
                                                          index  count
failure_case                                                          
100.0         [1200, 801, 887, 1189, 633, 662, 1414, 547, 14...    400

## You can specify the schema instead of inferring it.

In [108]:
from mlmodels import (
    Interval,
    Column,
    DataFrameSchema,
    validate_method_input_and_output
)

In [109]:
class RandomForestClassifierModel(BaseModel, DataFrameModelMixin):
    """Random forest classifier whose ``predict`` schemas are set explicitly.

    Nothing is inferred at fit time here; ``predict`` is wrapped with
    ``validate_method_input_and_output`` and the schemas are registered on
    the instance by the caller.

    Parameters
    ----------
    features : list-like of str
        Column labels selected from ``X`` in both ``fit`` and ``predict``.
    random_forest_params : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    """

    def __init__(
            self,
            features,
            random_forest_params={'n_estimators': 100, 'max_depth': 30},
    ):
        super().__init__()
        self.features = features
        # Plain None until fit() runs (the original trailing comma stored the
        # tuple ``(None,)`` here instead).
        self.target_columns = None
        # Copy the mapping so the instance never aliases the shared default
        # dict (or the caller's dict); later mutation stays local.
        self.random_forest_params = dict(random_forest_params)
        self.model = RandomForestClassifier(**self.random_forest_params)

    def fit(self, X, y):
        """Fit the forest on ``X[self.features]`` and remember ``y``'s columns.

        Returns ``self`` so calls can be chained.
        """
        self.model.fit(X[self.features], y)
        self.target_columns = y.columns
        return self

    @validate_method_input_and_output
    def predict(self, X):
        """Predict targets for ``X[self.features]``.

        Returns
        -------
        pandas.DataFrame
            Predictions labelled with the target columns seen in ``fit``.
            No index is passed, so the frame gets a default index rather
            than the index of ``X``.
        """
        predictions_array = self.model.predict(X[self.features])
        predictions_df = pd.DataFrame(
            data=predictions_array,
            columns=self.target_columns
        )
        return predictions_df

In [110]:
# Build a second model restricted to the two columns that the explicit
# schema will describe.
model2_features = ['chlorides', 'group1']
model2 = RandomForestClassifierModel(
    random_forest_params={'n_estimators': 100, 'max_depth': 15},
    features=model2_features,
)

In [111]:
# Input schema for `predict`: one interval-checked float column and one
# enum-checked integer column.
chlorides_column = Column('chlorides', dtype='float64', interval=Interval(-10, 10))
group1_column = Column('group1', dtype='int64', enum=[10, 11])
input_columns = [chlorides_column, group1_column]

predict_input_schema = DataFrameSchema(input_columns)
model2.set_model_method_input_schema('predict', predict_input_schema)

# Output schema for `predict`: a single enum-checked integer column.
quality_column = Column('quality', dtype='int64', enum=[6, 4, 5, 7, 8, 3])
output_columns = [quality_column]

predict_output_schema = DataFrameSchema(output_columns)
model2.set_model_method_output_schema('predict', predict_output_schema)

In [112]:
# Fit model2 -- the model this section configures and later calls predict on.
# The original cell refitted `model`, leaving model2 unfitted.
model2.fit(train_x, train_y)



<__main__.RandomForestClassifierModel at 0x1e903185c50>

In [114]:
# Display the schemas registered on model2, keyed by method name.
model2.model_method_schema_dict

{'predict': ModelMethodSchema{method_name: predict,
 input_schema: DataFrameSchema{columns: {'chlorides': Column{name: chlorides, dtype: float64, enum: None, interval: Interval{start_value: -10, end_value: 10}}, 'group1': Column{name: group1, dtype: int64, enum: [10, 11], interval: None}}},
 output_schema: DataFrameSchema{columns: {'quality': Column{name: quality, dtype: int64, enum: [6, 4, 5, 7, 8, 3], interval: None}}}}}

In [113]:
# Expected to fail input validation: group1 in test_x was drawn from {0, 1, 2},
# while the explicit input schema only allows the enum values 10 and 11.
model2.predict(test_x)

SchemaError: <Schema Column: 'group1' type=int64> failed element-wise validator 0:
<Check _isin: isin(frozenset({10, 11}))>
failure cases:
                                                          index  count
failure_case                                                          
1             [633, 1414, 547, 395, 885, 929, 978, 935, 408,...    145
2             [801, 887, 662, 938, 610, 899, 1360, 1562, 233...    143
0             [1200, 1189, 1467, 481, 235, 945, 587, 687, 59...    112