In [1]:
# Auto update of packages within the notebook
%load_ext autoreload
%autoreload 2

# Packages required for model building and analysis
import os
import sys
import numpy as np
import pandas as pd

# Import custom modelling code
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.models.run_model import *
from src.visualization.visualize import *
import src.features as features

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Import model training data
flora_data_frame = pd.read_csv("../data/processed/flora_data_frame_full.csv", index_col=0)
train_indices = list(range(0, flora_data_frame.shape[0]))

In [3]:
# Import of Wikipedia dataset
wiki = pd.read_csv("../data/raw/cirsium_arvense_wikipedia.csv", index_col=None)
test_indices = list(range(flora_data_frame.shape[0] + 1, flora_data_frame.shape[0] + wiki.shape[0]))
flora_data_frame = pd.concat([flora_data_frame, wiki], ignore_index=True)
flora_data_frame

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  after removing the cwd from sys.path.


Unnamed: 0,classification,dataset_name,length,row,row_id,species,text
0,habitat,bc,267.0,,5721.0,Festuca saximontana var. saximontana,"Mesic to dry meadows, grasslands, rocky slopes..."
1,habitat,fna,78.0,,9819.0,,"Rock, outcrops, calcareous and volcanic boulde..."
2,morphology,bc,27.0,,2305.0,Epilobium glaberrimum,Blooming Period Mid Summer
3,taxon_identification,fna,251.0,,6636.0,,asteraceae family Martinov unknown senecioneae...
4,habitat,bc,194.0,,7239.0,Aquilegia formosa,"Mesic to moist meadows, rocky slopes, thickets..."
...,...,...,...,...,...,...,...
19262,morphology,wikipedia,,,,,Cirsium arvense is a perennial species of flow...
19263,morphology,wikipedia,,,,,The standard English name in its native area i...
19264,morphology,wikipedia,,,,,It is also commonly known as Canada thistle an...
19265,morphology,wikipedia,,,,,The plant is beneficial for pollinators that r...


In [4]:
# Customize stop words for model
tokenized_stop_words = features.prepare_stop_words(custom_stop_words=["unknown", "accepted", "synonym",
                                                             "basionym", "source",
                                                             "note", "notes", "morphology", "fna_id"])
# Build DTM
custom_vec, dtm_text_counts = build_dtm_text_counts(features.flora_tokenizer, tokenized_stop_words, flora_data_frame)
dtm_text_counts.toarray()
print(dtm_text_counts.shape)
print(flora_data_frame.shape)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(19267, 40548)
(19267, 7)


In [5]:
# Prepare data for the model
X_train = dtm_text_counts[train_indices]
y_train = flora_data_frame.iloc[train_indices].classification
X_test = dtm_text_counts[test_indices]
y_test = flora_data_frame.iloc[test_indices].classification

In [6]:
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
dtm_y_test_df = pd.DataFrame(y_test).reset_index()
dtm_predictions_series = pd.Series(predicted)
results = pd.concat([dtm_y_test_df, dtm_predictions_series], axis=1)
results.rename(columns={0: 'predictions'}, inplace=True)
results = results.set_index('index')
results_flora_data_frame = pd.concat([results, flora_data_frame], axis=1, join='inner')
results_flora_data_frame

Unnamed: 0,classification,predictions,classification.1,dataset_name,length,row,row_id,species,text
19263,morphology,taxon_identification,morphology,wikipedia,,,,,The standard English name in its native area i...
19264,morphology,habitat,morphology,wikipedia,,,,,It is also commonly known as Canada thistle an...
19265,morphology,taxon_identification,morphology,wikipedia,,,,,The plant is beneficial for pollinators that r...
19266,morphology,morphology,morphology,wikipedia,,,,,It also was a top producer of nectar sugar in ...
