# NLP sample: sentiment classifier experimentation notebook

##### Jupyter helpers:

In [None]:
# Make sure the autoreload extension is loaded in this kernel.
%reload_ext autoreload
# NOTE(review): with no argument, `%autoreload` appears to trigger a single
# reload of modules rather than enabling automatic reloading; `%autoreload 2`
# is the usual setting for that -- confirm which is intended.
%autoreload

## Define imports

In [None]:

from sentiment_analysis.data import NLPSampleDataLoader
from sentiment_analysis.models import SentimentClassifier
from sentiment_analysis.data_processing.text import SpacyTextProcessor
from sentiment_analysis.experimentation import MlflowExperimentation
from sentiment_analysis.evaluation import EvaluationMetrics, Evaluator
from sentiment_analysis import ExperimentRunner
from sklearn.metrics import accuracy_score


## Load data
*The `NLPSampleDataLoader` class implements the `DataLoader` API.*

In [None]:
# Download the "imdb" dataset and obtain its train/test frames.
data_loader = NLPSampleDataLoader("imdb", 1.0)
data_loader.download_dataset()
imdb_df_train, imdb_df_test = data_loader.get_dataset()

# The 'text' column holds the model inputs, 'label' the targets.
X_train = imdb_df_train['text']
y_train = imdb_df_train['label']
X_test = imdb_df_test['text']
y_test = imdb_df_test['label']

## Create Model

*1. Create or reuse a preprocessor to handle data preprocessing, feature engineering, etc.*

In [None]:

# Text preprocessor handed to the classifier in the model-creation cell.
preprocessor = SpacyTextProcessor()

*2. Create model/logic:*

In [None]:
# Build the classifier around the preprocessor created above.
my_model = SentimentClassifier(preprocessor=preprocessor)

## Define evaluation

In [None]:

class NLPSampleEvaluationMetrics(EvaluationMetrics):
    """Container for the metric values produced by an evaluation run.

    Only a single value, the validation score, is stored.
    """

    def __init__(self, validation_score):
        """Store ``validation_score`` and then initialise the base class."""
        self.validation_score = validation_score
        super().__init__()

    def get_metrics(self):
        """Return the stored metrics as a ``{name: value}`` dictionary."""
        metrics = {"validation_score": self.validation_score}
        return metrics

    def __repr__(self):
        """Return a short text form showing the validation score."""
        score = self.validation_score
        return f"validation_score: {score}"


class NLPSampleEvaluator(Evaluator):
    """Evaluation logic for the model: scores predictions by accuracy."""

    def __init__(self):
        """Initialise the base evaluator; no extra state is kept."""
        super().__init__()

    def evaluate(self, predicted, actual) -> NLPSampleEvaluationMetrics:
        """Compare predictions with the true labels.

        Args:
            predicted: Labels produced by the model.
            actual: Ground-truth labels.

        Returns:
            Metrics object whose validation score is the accuracy.
        """
        # accuracy_score is called with the true labels first, predictions second.
        return NLPSampleEvaluationMetrics(
            validation_score=accuracy_score(actual, predicted),
        )


# Evaluator instance passed to the experiment runner below.
evaluator = NLPSampleEvaluator()


## Define experimentation
Define the experimentation object, which will be used for logging the experiment's parameters, metrics, and artifacts.
*Replace `MlflowExperimentation` if you use a different experimentation system.*

In [None]:
# NOTE(review): "databricks" presumably points MLflow at a Databricks-hosted
# tracking server -- confirm credentials/config are available where this runs.
mlflow_experimentation = MlflowExperimentation(tracking_uri="databricks")

## Run experiment

In [None]:
# Assemble the experiment from the pieces defined in the cells above.
experiment_runner = ExperimentRunner(
    # Model and data
    model=my_model,
    data_loader=data_loader,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    # Evaluation
    evaluator=evaluator,
    # Experiment tracking
    experiment_name="/NewExpr",
    log_experiment=True,
    experiment_logger=mlflow_experimentation,
)

# Execute the experiment and show whatever the runner returns.
results = experiment_runner.run()
print(results)
