##Exploratory Data Analysis for Med Cabinet Prediction Model

In [1]:
# EDA and visualization libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('cannabis_.csv')

In [3]:
df.head()

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."


In [4]:
df.shape

(2351, 6)

In [5]:
df = df.replace('None', np.nan)
df = df.dropna().reset_index()

In [6]:
df['Flavor']= df['Flavor'].str.replace(',',' ')
df['Effects'] = df['Effects'].str.replace(',', ' ')

In [7]:
print(df.shape)
df

(2163, 7)


Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description
0,0,100-Og,hybrid,4.0,Creative Energetic Tingly Euphoric Relaxed,Earthy Sweet Citrus,$100 OG is a 50/50 hybrid strain that packs a ...
1,1,98-White-Widow,hybrid,4.7,Relaxed Aroused Creative Happy Energetic,Flowery Violet Diesel,The ‘98 Aloha White Widow is an especially pot...
2,2,1024,sativa,4.4,Uplifted Happy Relaxed Energetic Creative,Spicy/Herbal Sage Woody,1024 is a sativa-dominant hybrid bred in Spain...
3,3,13-Dawgs,hybrid,4.2,Tingly Creative Hungry Relaxed Uplifted,Apricot Citrus Grapefruit,13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,4,24K-Gold,hybrid,4.6,Happy Relaxed Euphoric Uplifted Talkative,Citrus Earthy Orange,"Also known as Kosher Tangie, 24k Gold is a 60%..."
...,...,...,...,...,...,...,...
2158,2346,Zeus-Og,hybrid,4.7,Happy Uplifted Relaxed Euphoric Energetic,Earthy Woody Pine,Zeus OG is a hybrid cross between Pineapple OG...
2159,2347,Zkittlez,indica,4.6,Relaxed Happy Euphoric Uplifted Sleepy,Sweet Berry Grape,Zkittlez is an indica-dominant mix of Grape Ap...
2160,2348,Zombie-Kush,indica,5.0,Relaxed Sleepy Talkative Euphoric Happy,Earthy Sweet Spicy/Herbal,Zombie Kush by Ripper Seeds comes from two dif...
2161,2349,Zombie-Og,indica,4.4,Relaxed Sleepy Euphoric Happy Hungry,Sweet Earthy Pungent,If you’re looking to transform into a flesh-ea...


In [8]:
df['combined'] = df["Effects"] +  df["Flavor"] + df['Description']
df["combined"][0]

'Creative Energetic Tingly Euphoric RelaxedEarthy Sweet Citrus$100 OG is a 50/50 hybrid strain that packs a strong punch. The name supposedly refers to both its strength and high price when it first started showing up in Hollywood. As a plant, $100 OG tends to produce large dark green buds with few stems. Users report a strong body effect of an indica for pain relief with the more alert, cerebral feeling thanks to its sativa side.'

In [9]:
df.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description,combined
0,0,100-Og,hybrid,4.0,Creative Energetic Tingly Euphoric Relaxed,Earthy Sweet Citrus,$100 OG is a 50/50 hybrid strain that packs a ...,Creative Energetic Tingly Euphoric RelaxedEart...
1,1,98-White-Widow,hybrid,4.7,Relaxed Aroused Creative Happy Energetic,Flowery Violet Diesel,The ‘98 Aloha White Widow is an especially pot...,Relaxed Aroused Creative Happy EnergeticFlower...
2,2,1024,sativa,4.4,Uplifted Happy Relaxed Energetic Creative,Spicy/Herbal Sage Woody,1024 is a sativa-dominant hybrid bred in Spain...,Uplifted Happy Relaxed Energetic CreativeSpicy...
3,3,13-Dawgs,hybrid,4.2,Tingly Creative Hungry Relaxed Uplifted,Apricot Citrus Grapefruit,13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly Creative Hungry Relaxed UpliftedApricot...
4,4,24K-Gold,hybrid,4.6,Happy Relaxed Euphoric Uplifted Talkative,Citrus Earthy Orange,"Also known as Kosher Tangie, 24k Gold is a 60%...",Happy Relaxed Euphoric Uplifted TalkativeCitru...


In [10]:
!python -m spacy download en_core_web_lg

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [11]:
# modeling, nlp, and ml libraries
import re
import string
import spacy
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.tokenizer import Tokenizer
from sklearn.neighbors import NearestNeighbors
import en_core_web_lg

In [12]:
nlp = spacy.load("en_core_web_lg")
df.head()

# The Tokenizer
tokenizer = Tokenizer(nlp.vocab)

# Make the tokens for description
combined_tokens = []
for txt in tokenizer.pipe(df['combined'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    combined_tokens.append(txt_tokens)
df['combined_tokens'] = combined_tokens
print(df['combined_tokens'].head())

0    [Creative, Energetic, Tingly, Euphoric, Relaxe...
1    [Relaxed, Aroused, Creative, Happy, EnergeticF...
2    [Uplifted, Happy, Relaxed, Energetic, Creative...
3    [Tingly, Creative, Hungry, Relaxed, UpliftedAp...
4    [Happy, Relaxed, Euphoric, Uplifted, Talkative...
Name: combined_tokens, dtype: object


In [13]:
df.head()

Unnamed: 0,index,Strain,Type,Rating,Effects,Flavor,Description,combined,combined_tokens
0,0,100-Og,hybrid,4.0,Creative Energetic Tingly Euphoric Relaxed,Earthy Sweet Citrus,$100 OG is a 50/50 hybrid strain that packs a ...,Creative Energetic Tingly Euphoric RelaxedEart...,"[Creative, Energetic, Tingly, Euphoric, Relaxe..."
1,1,98-White-Widow,hybrid,4.7,Relaxed Aroused Creative Happy Energetic,Flowery Violet Diesel,The ‘98 Aloha White Widow is an especially pot...,Relaxed Aroused Creative Happy EnergeticFlower...,"[Relaxed, Aroused, Creative, Happy, EnergeticF..."
2,2,1024,sativa,4.4,Uplifted Happy Relaxed Energetic Creative,Spicy/Herbal Sage Woody,1024 is a sativa-dominant hybrid bred in Spain...,Uplifted Happy Relaxed Energetic CreativeSpicy...,"[Uplifted, Happy, Relaxed, Energetic, Creative..."
3,3,13-Dawgs,hybrid,4.2,Tingly Creative Hungry Relaxed Uplifted,Apricot Citrus Grapefruit,13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly Creative Hungry Relaxed UpliftedApricot...,"[Tingly, Creative, Hungry, Relaxed, UpliftedAp..."
4,4,24K-Gold,hybrid,4.6,Happy Relaxed Euphoric Uplifted Talkative,Citrus Earthy Orange,"Also known as Kosher Tangie, 24k Gold is a 60%...",Happy Relaxed Euphoric Uplifted TalkativeCitru...,"[Happy, Relaxed, Euphoric, Uplifted, Talkative..."


In [14]:
df['combined_tokens'][0]

['Creative',
 'Energetic',
 'Tingly',
 'Euphoric',
 'RelaxedEarthy',
 'Sweet',
 'Citrus$100',
 'OG',
 'is',
 'a',
 '50/50',
 'hybrid',
 'strain',
 'that',
 'packs',
 'a',
 'strong',
 'punch.',
 'The',
 'name',
 'supposedly',
 'refers',
 'to',
 'both',
 'its',
 'strength',
 'and',
 'high',
 'price',
 'when',
 'it',
 'first',
 'started',
 'showing',
 'up',
 'in',
 'Hollywood.',
 'As',
 'a',
 'plant,',
 '$100',
 'OG',
 'tends',
 'to',
 'produce',
 'large',
 'dark',
 'green',
 'buds',
 'with',
 'few',
 'stems.',
 'Users',
 'report',
 'a',
 'strong',
 'body',
 'effect',
 'of',
 'an',
 'indica',
 'for',
 'pain',
 'relief',
 'with',
 'the',
 'more',
 'alert,',
 'cerebral',
 'feeling',
 'thanks',
 'to',
 'its',
 'sativa',
 'side.']

In [15]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [18]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words = 'english',
                       ngram_range = (1,2),
                       max_features = 2000)

# Create a vocabulary and tf-idf score per document
dtm = tfidf.fit_transform(df['combined'])
                         

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
nn = NearestNeighbors(n_neighbors=4, algorithm='kd_tree')
nn.fit(dtm)


# View Feature Matrix as DataFrame
print(dtm.shape)
dtm.head()

(2163, 2000)


Unnamed: 0,10,10 weeks,11,11 weeks,12,13,14,15,18,1980s,1st,1st hawaiian,1st place,20,20 indica,20 thc,2012,2014,2014 cannabis,2015,2016,22,23,24,25,2nd,2nd place,30,30 indica,3rd,3rd place,40,40 indica,40 sativa,45,47,50,50 50,50 hybrid,55,...,walker,want,warm,washington,washington state,way,way cross,week,week flowering,weeks,weight,weighted,west,west coast,whimsical,white,white alien,white og,white widow,wide,widow,winner,winning,won,won 1st,wonder,woody,woody spicy,work,works,world,worth,wowie,wrapped,wreck,years,yield,yielding,yields,zesty
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.286626,0.15769,0.162149,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.101565,0.132918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.291018,0.0,0.0,0.371532,0.0,0.369437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138621,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.35166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.133172,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.131099,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
ideal = ["""
Creative,Uplifted,Tingly,Euphoric,Relaxed, Giggly
"""]

In [20]:
# Query the ideal descprition
new = tfidf.transform(ideal)
new

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [21]:
nn.kneighbors(new.todense())

(array([[1.19045604, 1.26028088, 1.26830873, 1.27166822]]),
 array([[1881, 1586,  373,  317]]))

In [22]:
# most ideal description,strain and flavor
df['Description'][1681]

'Saturn OG is a hybrid strain with mysterious beginnings, but its earthy citrus and diesel flavors confirms its close relationship to OG Kush. Its undocumented origins result in this strain being labeled as indica, sativa, and everything in between, but its effects are most commonly described as hybrid-like in its balanced calm and moderate cerebral effects. Saturn OG’s forest green buds are lit by a constellation of crystal trichomes, and this OG Kush relative is often lumped into a “planetary strain” series that includes Earth OG and Jupiter OG. The psychoactive onset of Saturn OG begins with an intense burst of euphoria that fades to smooth relaxation perfect for relieving stress and muscle tension.\xa0'

In [23]:
df['Strain'][1681]

'Saturn-Og'

In [24]:
df['Flavor'][1681]

'Earthy Sweet Tree Fruit'

In [25]:
df['Effects'][1681]

'Happy Relaxed Euphoric Uplifted Sleepy'

In [26]:
import pickle
# Dump the trained classifier (nn)  with Pickle
pickle_filename = 'model.pkl2'
pickled_model = open(pickle_filename, 'wb')  # Open the file to save as pkl file
pickle.dump(nn, pickled_model)
pickled_model.close() # Close the pickle instances

In [27]:
# Loading the saved model
model_pkl2 = open(pickle_filename, 'rb')
NN_model2 = pickle.load(model_pkl2)
print ("Loaded model :: ", NN_model2)  # print to verify

Loaded model ::  NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=4, p=2,
                 radius=1.0)


In [28]:
# Dump the trained classifier (tfidf)  with Pickle
pickle_filename_1 = 'tfidf.pkl2'
pickled_model_1 = open(pickle_filename_1, 'wb')  # Open the file to save as pkl file
pickle.dump(tfidf, pickled_model_1)
pickled_model_1.close() # Close the pickle instances

In [29]:
# Loading the saved model
model_pkl_1 = open(pickle_filename_1, 'rb')
tfidf_model2 = pickle.load(model_pkl_1)
print ("Loaded model :: ", tfidf_model2)  # print to verify

Loaded model ::  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=2000,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)


In [30]:
ideal2 = ["happy,Sleepy,Apricot,Citrus,Grapefruit"]

In [31]:
new2 = tfidf_model2.transform(ideal2)
new2

<1x2000 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [32]:
NN_model2.kneighbors(new2.todense())

(array([[1.09504845, 1.14089606, 1.15844355, 1.18178543]]),
 array([[1911,  398,  863, 1497]]))

In [33]:
df['Strain'][2013]

'Triangle-Kush'

In [34]:
import json
def recommend(user_input):
    temp_df = NN_model2.kneighbors(tfidf_model2.transform([user_input]).todense())[1]
    

    #print(temp_df)
    
    for i in range(4):
        info = df.loc[temp_df[0][i]]['Strain']
        info_effects = df.loc[temp_df[0][i]]['Effects']
        info_flavor = df.loc[temp_df[0][i]]['Flavor']
        info_description = df.loc[temp_df[0][i]]['Description']
        info_rating = df.loc[temp_df[0][i]]['Rating']
        
        print(json.dumps(info))
        print(json.dumps(info_effects))
        print(json.dumps(info_flavor))
        print(json.dumps(info_description))
        print(json.dumps(info_rating))
        
        #return json.dumps(info)  #for engineeers, the return does not work in jupyter lab.  Should work in vsCode.
        #return json.dumps(info_effects)
        #return json.dumps(info_flavor)
        #return json.dump(info_description)
        #return json.dumps(info_rating)

In [35]:
recommend('hybrid-like in its balanced calm and moderate cerebral effects')

"Saturn-Og"
"Happy Relaxed Euphoric Uplifted Sleepy"
"Earthy Sweet Tree Fruit"
"Saturn OG is a hybrid strain with mysterious beginnings, but its earthy citrus and diesel flavors confirms its close relationship to OG Kush. Its undocumented origins result in this strain being labeled as indica, sativa, and everything in between, but its effects are most commonly described as hybrid-like in its balanced calm and moderate cerebral effects. Saturn OG\u2019s forest green buds are lit by a constellation of crystal trichomes, and this OG Kush relative is often lumped into a \u201cplanetary strain\u201d series that includes Earth OG and Jupiter OG. The psychoactive onset of Saturn OG begins with an intense burst of euphoria that fades to smooth relaxation perfect for relieving stress and muscle tension.\u00a0"
4.5
"Moonwalker-Kush"
"Relaxed Euphoric Happy Focused Giggly"
"Earthy Pine Minty"
"Moonwalker Kush is a balanced indica-dominant hybrid that is the genetic cross of Triple OG and Tahoe Al