# Test integration of Flora of Manitoba data into the model (performance testing)
## Import requisite code

In [1]:

%load_ext autoreload
%autoreload 2

import os
import sys
import numpy as np
import pandas as pd

# Make the custom modelling code importable: resolve the directory two levels
# above the current working directory and register it on the module search
# path, unless an identical entry is already present.
module_path = os.path.abspath('../../')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import make_dataset
import make_features
import make_model
import make_predict
# from src.models.run_model import *
# from visualization.visualize import *
# import features as features
from sklearn import metrics


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Build the dataset, features and model in two different ways
### With Flora of Manitoba (FM) data

In [2]:
# Run the dataset step with all four raw source files, Flora of Manitoba (FM) included.
# NOTE(review): the first positional argument is presumably the CSV that
# make_dataset.main writes; the feature and model cells read this same path
# as their training file. Confirm against make_dataset.
with_fm_sources = {
    "fna_filepath": "../../data/raw/fna_with_habitat.csv",
    "bc_filepath": "../../data/raw/eflora_bc.csv",
    "budds_file_path": "../../data/raw/buddsfloraofcana00otta_djvu.xml",
    "fm_file_path": "../../data/raw/fm.csv",
}
flora_data_frame = make_dataset.main(
    "../../data/processed/test_fm_integration/flora_data_frame_fm_nb.csv",
    **with_fm_sources
)

In [3]:
# Run the feature step on the with-FM training file. Both reclassification
# options are passed as "no", and the results are saved under the with_fm
# model directory.
with_fm_feature_args = dict(
    train_file_path="../../data/processed/test_fm_integration/flora_data_frame_fm_nb.csv",
    features_save_path="../../models/test_fm_integration/with_fm/",
    custom_stop_words_path="../../models/stop_words.txt",
    reclassify_keys="no",
    reclassify_habitat="no",
)
custom_vec, dtm_text_counts = make_features.main(**with_fm_feature_args)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Run the model step for the with-FM data set. The arguments stay positional,
# in the original order: training CSV, saved dtm_text_counts, with_fm model directory.
# The step prints the MultinomialNB accuracy and a per-class report.
with_fm_model_args = (
    "../../data/processed/test_fm_integration/flora_data_frame_fm_nb.csv",
    "../../models/test_fm_integration/with_fm/dtm_text_counts",
    "../../models/test_fm_integration/with_fm/",
)
clf, dtm_y_test, dtm_predictions = make_model.main(*with_fm_model_args)


MultinomialNB Accuracy: 0.9842182693593687
                      precision    recall  f1-score   support

        distribution       0.99      1.00      0.99      1264
             habitat       0.99      1.00      0.99      1457
                 key       0.97      0.96      0.97      1232
          morphology       0.96      0.98      0.97      1349
taxon_identification       1.00      0.98      0.99      2175

            accuracy                           0.98      7477
           macro avg       0.98      0.98      0.98      7477
        weighted avg       0.98      0.98      0.98      7477



### Without FM data


In [5]:
# Run the dataset step again as the baseline: the same three raw source files
# as the with-FM run, but no fm_file_path, and a separate "no_fm" CSV path.
# NOTE(review): this rebinds flora_data_frame, so the with-FM frame is no
# longer reachable under that name after this cell runs.
without_fm_sources = {
    "fna_filepath": "../../data/raw/fna_with_habitat.csv",
    "bc_filepath": "../../data/raw/eflora_bc.csv",
    "budds_file_path": "../../data/raw/buddsfloraofcana00otta_djvu.xml",
}
flora_data_frame = make_dataset.main(
    "../../data/processed/test_fm_integration/flora_data_frame_no_fm_nb.csv",
    **without_fm_sources
)

In [6]:
# Run the feature step on the without-FM training file. Both reclassification
# options are passed as "no", and the results are saved under the without_fm
# model directory.
without_fm_feature_args = dict(
    train_file_path="../../data/processed/test_fm_integration/flora_data_frame_no_fm_nb.csv",
    features_save_path="../../models/test_fm_integration/without_fm/",
    custom_stop_words_path="../../models/stop_words.txt",
    reclassify_keys="no",
    reclassify_habitat="no",
)
custom_vec, dtm_text_counts = make_features.main(**without_fm_feature_args)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Run the model step for the without-FM data set. The arguments stay positional,
# in the original order: training CSV, saved dtm_text_counts, without_fm model directory.
# The step prints the MultinomialNB accuracy and a per-class report.
without_fm_model_args = (
    "../../data/processed/test_fm_integration/flora_data_frame_no_fm_nb.csv",
    "../../models/test_fm_integration/without_fm/dtm_text_counts",
    "../../models/test_fm_integration/without_fm/",
)
clf, dtm_y_test, dtm_predictions = make_model.main(*without_fm_model_args)

MultinomialNB Accuracy: 0.9787267820288935
                      precision    recall  f1-score   support

        distribution       0.99      0.99      0.99       940
             habitat       0.97      1.00      0.98      1510
                 key       0.98      0.95      0.96      1100
          morphology       0.96      0.98      0.97      1347
taxon_identification       0.99      0.97      0.98      1402

            accuracy                           0.98      6299
           macro avg       0.98      0.98      0.98      6299
        weighted avg       0.98      0.98      0.98      6299



In [5]:
# Recompute overall accuracy from whichever make_model.main results are
# currently bound to dtm_y_test / dtm_predictions.
# NOTE(review): this cell's saved execution count is out of sequence and its
# saved output differs from the accuracy printed by the preceding
# make_model.main cell, so it appears to come from an earlier run.
# Restart the kernel and run all cells before relying on the number.
nb_accuracy = metrics.accuracy_score(dtm_y_test, dtm_predictions)
print("MultinomialNB Accuracy:", nb_accuracy)

MultinomialNB Accuracy: 0.9789414834140887


## Conclusion
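It appears as though adding FM data gives a marginal overall improvement (accuracy 0.984 with FM vs. 0.979 without). 
In the classification reports above, the F1 scores for habitat, key and taxon identification strings each rise by about 0.01 with FM data, 
while distribution and morphology F1 are unchanged; the only metric that drops is key precision (0.98 to 0.97). 
No decline in habitat accuracy is visible in these outputs (habitat precision rises from 0.97 to 0.99). 
Note that the two test sets differ in size (7477 vs. 6299 strings), so differences this small should be interpreted with caution.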