In [None]:
#references
#https://towardsdatascience.com/topic-model-visualization-using-pyldavis-fecd7c18fbf6


In [28]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
import en_core_web_sm

import spacy
from spacy.tokenizer import Tokenizer
from nltk.stem import PorterStemmer
nlp = spacy.load("en_core_web_sm")

In [29]:
df = pd.read_csv('combined text data')
df.head()

Unnamed: 0.1,Unnamed: 0,name,strain_type,effect,may_relieve,aromas,flavors,rating,description,fav_count,sativa_comp,indica_comp,cbd,cbn,avg_thc,combined_text
0,0,s a g e sativa afghani genetic equilibrium ...,hybrid,energizing euphoria happy relaxi...,add adhd anxiety chronic pain de...,earthy herbal sage spicy woo...,sage spicy,4.8,s a g e is a popular marijuana strain that won...,621.0,0.5,0.5,0.0,0.0,0.19,s a g e sativa afghani genetic equilibrium ...
1,1,cannatonic marijuana strain,hybrid,calming happy mellow relaxing ...,anxiety chronic pain cramps head...,citrus earthy herbal lemon s...,citrus herbal sour spicy swe...,4.7,cannatonic is an evenly balanced hybrid strain...,1535.0,0.5,0.5,0.14,0.01,0.07,cannatonic marijuana strain calming happ...
2,2,blue dream marijuana strain,sativa dominant hybrid,creative energizing euphoria hap...,add adhd anxiety autism bipolar ...,earthy fruity sweet vanilla,berry blueberry fruity herbal ...,4.5,blue dream is a slightly sativa dominant hybri...,15731.0,0.6,0.4,0.02,0.01,0.17,blue dream marijuana strain creative ene...
3,3,goji og marijuana strain,sativa dominant hybrid,creative energizing euphoria hap...,chronic pain depression gastrointest...,cherry pungent sweet,berry cherry strawberry sweet ...,4.5,named after the bright red himalayan berry th...,301.0,0.8,0.2,0.01,0.0,0.22,goji og marijuana strain creative energi...
4,4,green love potion marijuana strain,indica dominant hybrid,aroused happy relaxing sleepy,anxiety chronic pain cramps depr...,earthy floral lavender pungent ...,chemical herbal lavender menthol...,4.5,being an indica dominant hybrid strain green ...,60.0,0.8,0.2,0.0,0.0,0.11,green love potion marijuana strain aroused ...


In [30]:
import re 
import nltk

#removing numbers and punctuation
def remove_numbers_punctuation(text):
    return re.sub(r'[^\w\s]', '', text)

#tokenization
def tokenize_words(text):
    return nltk.word_tokenize(text)

#lemmatization
def lemmatize_words_spacy(text):
    text = nlp(str(text))  #create a doc object 
    return " ".join([token.lemma_ for token in text])

#removing stopwords
def remove_stopwords(text):
    return [word for word in text if word not in stopwords]

In [33]:

import gensim
from gensim import corpora

#Settings, formatting, stopwords, and Spacy model
from tqdm.auto import tqdm
tqdm.pandas()
sns.set_palette("Accent")
pd.options.display.float_format = "{:,.3f}".format

nlp = spacy.load("en_core_web_sm")
stopwords = gensim.parsing.preprocessing.STOPWORDS

In [34]:
df['clean_text'] = df['combined_text'].progress_map(remove_numbers_punctuation)

df['clean_text'] = df['clean_text'].progress_map(lemmatize_words_spacy)

df['clean_text'] = df['clean_text'].str.lower()

df['clean_text'] = df['clean_text'].progress_map(tokenize_words)

100%|██████████| 8924/8924 [00:00<00:00, 28127.90it/s]
100%|██████████| 8924/8924 [05:34<00:00, 26.70it/s]
100%|██████████| 8924/8924 [00:08<00:00, 1065.86it/s]


In [35]:

df['stopwords_clean_text'] = df['clean_text'].progress_map(remove_stopwords)
df['stopwords_clean_text'] = df['stopwords_clean_text'].apply(', '.join)
df['stopwords_clean_text'] = df['stopwords_clean_text'].apply(lambda x: ' '.join(word for word in x.split() if len(word)>3))

100%|██████████| 8924/8924 [00:00<00:00, 17583.71it/s]


In [36]:
#tokenize combined text data 
tokenizer = Tokenizer(nlp.vocab)

In [37]:
tokens = []

for doc in tokenizer.pipe(df['stopwords_clean_text'], batch_size = 500):
    doc_tokens = [token.text for token in doc]
    tokens.append(doc_tokens)

df['tokens'] = tokens


In [38]:
df['tokens'].head()

0    [sativa,, afghani,, genetic,, equilibrium,, ma...
1    [cannatonic,, marijuana,, strain,, calm,, happ...
2    [blue,, dream,, marijuana,, strain,, creative,...
3    [goji,, marijuana,, strain,, creative,, energi...
4    [green,, love,, potion,, marijuana,, strain,, ...
Name: tokens, dtype: object

In [39]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words= 'english')

dtm = tfidf.fit_transform(df['stopwords_clean_text'])
dtm = pd.DataFrame(dtm.todense(), columns = tfidf.get_feature_names_out())
dtm.head()

Unnamed: 0,001,007,020,074,098,100,1000,101,1024,106,...,zour,zoz,zprite,zsweet,zum,zurcule,zurple,zushi,zuyaqui,zwarte
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

#Fit knn on on tf-ifdf vectors 

knn = NearestNeighbors(n_neighbors = 5, algorithm= 'kd_tree')
knn.fit(dtm)


NearestNeighbors(algorithm='kd_tree')

In [41]:
knn.get_params()

{'algorithm': 'kd_tree',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'radius': 1.0}

In [42]:
#Query using the neigbors 
knn.kneighbors([dtm.iloc[456]])



(array([[0.        , 1.04852774, 1.06005286, 1.08018137, 1.09212875]]),
 array([[ 456, 3603, 1090, 8873,  966]]))

In [51]:
df['stopwords_clean_text'][456][:200]

'triangle, kush, marijuana, strain, body, high, cerebral, long, relax, uplift, citrus, lemon, spicy, woody, chronic, pain, eye, pressure, fatigue, glaucoma, migraine, nausea, earthy, fuel, lemon, pine,'

In [52]:
df['stopwords_clean_text'][3603][:200]

'love, triangle, marijuana, strain, body, high, calm, relax, sleepy, uplifting, berry, cherry, mint, peppery, sweet, chronic, pain, cramp, insomnia, stress, berry, earthy, herbal, pepper, spicy, sweet,'

In [53]:
df['stopwords_clean_text'][1090][:200]

'golden, triangle, marijuana, strain, creative, energize, euphoria, happy, relax, uplift, citrus, diesel, pine, spicy, bipolar, disorder, depression, fatigue, mood, swing, stress, citrus, diesel, flora'

In [54]:
df['stopwords_clean_text'][8873][:200]

'perfect, triangle, marijuana, strain, arouse, energize, euphoria, focus, relax, uplift, grassy, herbal, spicy, woody, chronic, pain, depression, fatigue, headache, migraine, mood, swing, earthy, flowe'

In [55]:
df['stopwords_clean_text'][966][:200]

'black, triangle, marijuana, strain, body, high, euphoria, relax, sleepy, uplift, pine, sweet, vanilla, appetite, loss, chronic, pain, depression, insomnia, muscle, spasm, dank, earthy, pine, pungent, '

In [15]:
knn2 = NearestNeighbors(n_neighbors= 7, algorithm='brute')
knn2.fit(dtm)

NearestNeighbors(algorithm='brute', n_neighbors=7)

In [16]:
#Query using the neigbors 
knn2.kneighbors([dtm.iloc[456]])



(array([[0.        , 1.0696755 , 1.09324611, 1.10311469, 1.11961772,
         1.13244949, 1.14015897]]),
 array([[ 456, 3603, 1090, 8873,  966, 1440, 3206]]))

In [56]:
import pickle
pickle.dump(tfidf, open('model.pkl','wb'))

In [57]:
# Loading model to compare the results
vectorizer = pickle.load(open('model.pkl','rb'))

In [58]:
pickle.dump(knn, open('knn_model.pkl','wb'))

In [59]:
knn = pickle.load(open('knn_model.pkl','rb'))

In [60]:
def recommender(text):
    text = pd.Series(text)
    vect = vectorizer.transform(text)

    #send to df 
    vectdf = pd.DataFrame(vect.todense())

    #Return a list of indexes 
    top5 = knn.kneighbors([vectdf][0], n_neighbors= 5)[1][0].tolist()

    #Send recommendations to dataframe 
    recs_df = df.iloc[top5]
    recs_df['index'] = recs_df.index
    

    return recs_df


In [61]:
recommender('I want a bud that taste like herbs and woody that relieves anxiety and nausea and makes me feel happy and creative')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs_df['index'] = recs_df.index


Unnamed: 0.1,Unnamed: 0,name,strain_type,effect,may_relieve,aromas,flavors,rating,description,fav_count,sativa_comp,indica_comp,cbd,cbn,avg_thc,combined_text,clean_text,stopwords_clean_text,tokens,index
2318,2395,woody og marijuana strain,indica,body high happy sleepy,chronic pain eye pressure insomnia ...,dank skunky,pine,4.9,woody og also known to most of the cannabis c...,17.0,0.0,1.0,0.0,0.0,0.21,woody og marijuana strain body high happ...,"[woody, og, marijuana, strain, body, high, hap...","woody, marijuana, strain, body, high, happy, s...","[woody,, marijuana,, strain,, body,, high,, ha...",2318
2513,2646,hawaiian delight marijuana strain,hybrid,aroused euphoria happy hungry ...,anxiety asthma depression migrai...,diesel earthy hash woody,diesel hash sweet woody,4.9,hawaiian delight is primarily an indica strain...,0.0,0.5,0.5,0.0,0.0,0.2,hawaiian delight marijuana strain aroused ...,"[hawaiian, delight, marijuana, strain, arouse,...","hawaiian, delight, marijuana, strain, arouse, ...","[hawaiian,, delight,, marijuana,, strain,, aro...",2513
2626,2782,big bang marijuana strain,indica dominant hybrid,euphoria happy relaxing sleepy,add adhd anorexia bipolar disorder ...,citrus fruity,sweet woody,4.5,being an indica dominant strain with a 60 40 i...,10.0,0.6,0.4,0.0,0.0,0.19,big bang marijuana strain euphoria happy...,"[big, bang, marijuana, strain, euphoria, happy...","big, bang, marijuana, strain, euphoria, happy,...","[big,, bang,, marijuana,, strain,, euphoria,, ...",2626
7890,8193,white 91 marijuana strain,indica dominant hybrid,cerebral creative happy relaxing...,chronic pain depression insomnia ...,chemical earthy herbal peppery ...,herbal peppery pine sour spi...,4.3,white 91 is an indica dominant hybrid strain ...,0.0,0.7,0.3,0.01,0.0,0.2,white 91 marijuana strain cerebral creat...,"[white, 91, marijuana, strain, cerebral, creat...","white, marijuana, strain, cerebral, creative, ...","[white,, marijuana,, strain,, cerebral,, creat...",7890
616,623,c4 marijuana strain,hybrid,cerebral creative energizing eup...,anxiety arthritis chronic pain g...,sweet,cheesy citrus fruity lemon p...,4.8,c4 is a hybrid strain and is a cross between s...,0.0,0.5,0.5,0.01,0.0,0.14,c4 marijuana strain cerebral creative ...,"[c4, marijuana, strain, cerebral, creative, en...","marijuana, strain, cerebral, creative, energiz...","[marijuana,, strain,, cerebral,, creative,, en...",616


In [64]:
knn.kneighbors([dtm.iloc[7890]])



(array([[0.        , 1.05685925, 1.08090772, 1.11190351, 1.11400795]]),
 array([[7890, 5327, 5305,  425, 4876]]))

In [62]:
recommender(' I want a strain that tastes like citrus and makes me feel happy and relieves anxiety')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs_df['index'] = recs_df.index


Unnamed: 0.1,Unnamed: 0,name,strain_type,effect,may_relieve,aromas,flavors,rating,description,fav_count,sativa_comp,indica_comp,cbd,cbn,avg_thc,combined_text,clean_text,stopwords_clean_text,tokens,index
4438,4683,grape goddess marijuana strain,hybrid,happy uplifting,chronic pain depression insomnia ...,fragrant grape musky spicy s...,fruity grape spicy sweet,4.6,grape goddess also known as grape god is a...,10.0,0.5,0.5,0.01,0.0,0.28,grape goddess marijuana strain happy upl...,"[grape, goddess, marijuana, strain, happy, upl...","grape, goddess, marijuana, strain, happy, upli...","[grape,, goddess,, marijuana,, strain,, happy,...",4438
5842,6099,urkle berry marijuana strain,indica dominant hybrid,body high calming hungry relaxin...,chronic pain depression insomnia ...,berry earthy fruity herbal s...,berry blueberry fruity spicy ...,4.5,urkle berry also known as urkleberry or ur...,0.0,0.8,0.2,0.0,0.0,0.19,urkle berry marijuana strain body high c...,"[urkle, berry, marijuana, strain, body, high, ...","urkle, berry, marijuana, strain, body, high, c...","[urkle,, berry,, marijuana,, strain,, body,, h...",5842
2513,2646,hawaiian delight marijuana strain,hybrid,aroused euphoria happy hungry ...,anxiety asthma depression migrai...,diesel earthy hash woody,diesel hash sweet woody,4.9,hawaiian delight is primarily an indica strain...,0.0,0.5,0.5,0.0,0.0,0.2,hawaiian delight marijuana strain aroused ...,"[hawaiian, delight, marijuana, strain, arouse,...","hawaiian, delight, marijuana, strain, arouse, ...","[hawaiian,, delight,, marijuana,, strain,, aro...",2513
4698,4946,citrus fire marijuana strain,sativa dominant hybrid,calming cerebral creative energi...,anxiety bipolar depression depressio...,dank earthy herbal lemon pep...,citrus dank lemon peppery so...,4.6,citrus fire a slightly sativa dominant hybrid ...,0.0,0.6,0.4,0.16,0.0,0.01,citrus fire marijuana strain calming cer...,"[citrus, fire, marijuana, strain, calm, cerebr...","citrus, marijuana, strain, calm, cerebral, cre...","[citrus,, marijuana,, strain,, calm,, cerebral...",4698
616,623,c4 marijuana strain,hybrid,cerebral creative energizing eup...,anxiety arthritis chronic pain g...,sweet,cheesy citrus fruity lemon p...,4.8,c4 is a hybrid strain and is a cross between s...,0.0,0.5,0.5,0.01,0.0,0.14,c4 marijuana strain cerebral creative ...,"[c4, marijuana, strain, cerebral, creative, en...","marijuana, strain, cerebral, creative, energiz...","[marijuana,, strain,, cerebral,, creative,, en...",616


In [27]:
recommender('I want to be energetic and focused but relaxed ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recs_df['index'] = recs_df.index


Unnamed: 0.1,Unnamed: 0,name,strain_type,effect,may_relieve,aromas,flavors,rating,description,fav_count,sativa_comp,indica_comp,cbd,cbn,avg_thc,combined_text,tokens,index
364,370,god s green crack marijuana strain,indica dominant hybrid,body high cerebral happy uplifti...,add adhd depression fatigue migr...,earthy fruity sour tropical,fruity herbal sour,4.7,god s green crack is a super rare slightly ind...,72.0,0.55,0.45,0.0,0.0,0.22,god s green crack marijuana strain body high...,"[god, s, green, crack, marijuana, strain, , ...",364
57,58,green punch marijuana strain,sativa dominant hybrid,happy motivation,anxiety eye pressure inflammation ...,earthy hash herbal,pineapple sweet tropical,4.6,green punch is a super rare sativa dominant hy...,12.0,0.7,0.3,0.13,0.0,0.07,green punch marijuana strain happy motiv...,"[green, punch, marijuana, strain, , happy, ...",57
4628,4875,top gun marijuana strain,indica dominant hybrid,calming energizing focus happy ...,add adhd chronic pain depression ...,citrus diesel earthy orange ...,citrus nutty orange sour sug...,4.6,top gun is an indica dominant hybrid strain 8...,0.0,0.8,0.2,0.0,0.0,0.16,top gun marijuana strain calming energiz...,"[top, gun, marijuana, strain, , calming, ...",4628
4214,4453,g13 diesel marijuana strain,indica dominant hybrid,energizing giggly happy hungry ...,anorexia depression fatigue inso...,citrus dank diesel earthy sp...,citrus diesel pine sweet,4.6,g13 diesel is a slightly indica dominant hybri...,0.0,0.6,0.4,0.0,0.0,0.17,g13 diesel marijuana strain energizing g...,"[g13, diesel, marijuana, strain, , energizin...",4214
8799,9163,eli marijuana strain,sativa dominant hybrid,calming cerebral creative energi...,depression eye pressure fatigue ...,citrus earthy herbal sour sp...,citrus sour spicy sweet vani...,4.5,eli is a sativa dominant hybrid strain 70 sa...,0.0,0.7,0.3,0.0,0.0,0.14,eli marijuana strain calming cerebral ...,"[eli, marijuana, strain, , calming, , ce...",8799
