In [1]:
# Auto update of packages within the notebook
%load_ext autoreload
%autoreload 2

# Packages required for model building and analysis
import os
import sys
import numpy as np
import pandas as pd
import wikipedia

# Import custom modelling code
module_path = os.path.abspath(os.path.join('../'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.models.run_model import *
from src.visualization.visualize import *
import src.features as features

# pip install git+https://github.com/lucasdnd/Wikipedia.git --upgrade


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Import model training data
flora_data_frame = pd.read_csv("../data/processed/flora_data_frame_full.csv", index_col=0)
train_indices = list(range(0, flora_data_frame.shape[0]))

In [3]:
page = wikipedia.page("Cirsium arvense")
page_sections = page.sections
parsed_page = [(page_section_name, page.section(page_section_name)) for page_section_name in page_sections]
wiki_data = pd.DataFrame(parsed_page, columns = ['classification', 'text'])
wiki_data

Unnamed: 0,classification,text
0,Alternative names,A number of other names are used in other area...
1,Description,Cirsium arvense is a C3 carbon fixation plant....
2,Underground network,Its underground structure consists of four typ...
3,Shoots and leaves,"Stems are 30–150 cm, slender green, and freely..."
4,Flowers and seeds,The inflorescence is 10–22 mm (0.39–0.87 in) i...
5,Varieties,"Variation in leaf characters (texture, vestitu..."
6,Ecology,The seeds are an important food for the goldfi...
7,Status as a weed,The species is widely considered a weed even w...
8,Control,
9,Organic,Control methods include cutting at flower stem...


In [4]:
# Import of Wikipedia dataset
# wiki = pd.read_csv("../data/raw/cirsium_arvense_wikipedia.csv", index_col=None)
wiki = wiki_data
test_indices = list(range(flora_data_frame.shape[0] + 1, flora_data_frame.shape[0] + wiki.shape[0]))
flora_data_frame = pd.concat([flora_data_frame, wiki], ignore_index=True)
flora_data_frame

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


Unnamed: 0,classification,dataset_name,length,row,row_id,species,text
0,habitat,bc,267.0,,5721.0,Festuca saximontana var. saximontana,"Mesic to dry meadows, grasslands, rocky slopes..."
1,habitat,fna,78.0,,9819.0,,"Rock, outcrops, calcareous and volcanic boulde..."
2,morphology,bc,27.0,,2305.0,Epilobium glaberrimum,Blooming Period Mid Summer
3,taxon_identification,fna,251.0,,6636.0,,asteraceae family Martinov unknown senecioneae...
4,habitat,bc,194.0,,7239.0,Aquilegia formosa,"Mesic to moist meadows, rocky slopes, thickets..."
...,...,...,...,...,...,...,...
19271,Organic,,,,,,Control methods include cutting at flower stem...
19272,Chemical,,,,,,Applying herbicide: Herbicides dominated by ph...
19273,Uses,,,,,,"Like other Cirsium species, the roots are edib..."
19274,References,,,,,,


In [5]:
# Customize stop words for model
tokenized_stop_words = features.prepare_stop_words(custom_stop_words=["unknown", "accepted", "synonym",
                                                             "basionym", "source",
                                                             "note", "notes", "morphology", "fna_id"])
# Build DTM
custom_vec, dtm_text_counts = build_dtm_text_counts(features.flora_tokenizer, tokenized_stop_words, flora_data_frame)
dtm_text_counts.toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jocelynpender/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


(19276, 40712)
(19276, 7)


In [6]:
# Prepare data for the model
X_train = dtm_text_counts[train_indices]
y_train = flora_data_frame.iloc[train_indices].classification
X_test = dtm_text_counts[test_indices]
y_test = flora_data_frame.iloc[test_indices].classification

In [7]:
clf = MultinomialNB().fit(X_train, y_train)
predicted = clf.predict(X_test)
dtm_y_test_df = pd.DataFrame(y_test).reset_index()
dtm_predictions_series = pd.Series(predicted)
results = pd.concat([dtm_y_test_df, dtm_predictions_series], axis=1)
results.rename(columns={0: 'predictions'}, inplace=True)
results = results.set_index('index')
results_flora_data_frame = pd.concat([results, flora_data_frame], axis=1, join='inner')
results_flora_data_frame

Unnamed: 0,classification,predictions,classification.1,dataset_name,length,row,row_id,species,text
19263,Description,key,Description,,,,,,Cirsium arvense is a C3 carbon fixation plant....
19264,Underground network,morphology,Underground network,,,,,,Its underground structure consists of four typ...
19265,Shoots and leaves,morphology,Shoots and leaves,,,,,,"Stems are 30–150 cm, slender green, and freely..."
19266,Flowers and seeds,morphology,Flowers and seeds,,,,,,The inflorescence is 10–22 mm (0.39–0.87 in) i...
19267,Varieties,key,Varieties,,,,,,"Variation in leaf characters (texture, vestitu..."
19268,Ecology,taxon_identification,Ecology,,,,,,The seeds are an important food for the goldfi...
19269,Status as a weed,habitat,Status as a weed,,,,,,The species is widely considered a weed even w...
19270,Control,morphology,Control,,,,,,
19271,Organic,habitat,Organic,,,,,,Control methods include cutting at flower stem...
19272,Chemical,habitat,Chemical,,,,,,Applying herbicide: Herbicides dominated by ph...
