In [216]:
import pandas as pd
import sklearn.feature_extraction as fe
from sklearn.metrics.pairwise import cosine_similarity

import spacy

import gensim.downloader
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from scipy import spatial

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [191]:
df = pd.read_csv("data/bulbapedia_smogon_data.csv")#.set_index("name")
df.head()

Unnamed: 0,ndex,thumbnail,name,description
0,1,https://archives.bulbagarden.net/media/upload/...,Bulbasaur,weak weak frail frail slow light short Chlorop...
1,2,https://archives.bulbagarden.net/media/upload/...,Ivysaur,weak weak frail frail slow light short Chlorop...
2,3,https://archives.bulbagarden.net/media/upload/...,Venusaur,weak weak frail frail slow light short Chlorop...
3,4,https://archives.bulbagarden.net/media/upload/...,Charmander,weak weak frail frail slow light short Blaze S...
4,5,https://archives.bulbagarden.net/media/upload/...,Charmeleon,weak weak frail frail slow light short Blaze S...


In [231]:
nlp = spacy.load("data/vocabulary/en_core_web_sm-3.5.0")
def clean_query(raw_query):
    #lowercase, stem, remove stopwords
    lowercase = raw_query.lower()
    document = nlp(lowercase)
    tokens = []
    for token in document:
        if token.text not in spacy.lang.en.stop_words.STOP_WORDS:
            tokens.append(token.lemma_)
    cleaned_query = " ".join(tokens)
    return cleaned_query

### IR-TF-IDF

In [192]:
model = fe.text.TfidfVectorizer(input="content")
vector = model.fit_transform(df["description"])

def ir_tf_idf(query):
    res = model.transform([query])
    similarities = cosine_similarity(vector,res)
    return df.assign(sim=similarities.reshape(-1)).sort_values(by="sim",ascending=False)[:5]

### IR-TF-IDF with Word2Vec

In [197]:
# wiki_vectors = gensim.downloader.load('glove-wiki-gigaword-50')

In [165]:
# wiki_vectors.save("glove-wiki-gigaword-50_vectors.bin")

In [199]:
# w2v_model = Word2Vec(sentences=common_texts, sg=1, min_count=1)
# w2v_model = Word2Vec(sentences=data_tokens, sg=1, min_count=1)
# data_tokens = []
# for row in df.iterrows():
#     data_tokens = data_tokens + (row[1]["description"].split())
# df_by_name = df.set_index("name")
w2v_model =  KeyedVectors.load('glove-wiki-gigaword-50_vectors.bin')

def w2v_tf_idf(query):
     # assume data is stemmed and removed of stopwords
    sims = []
    large_query = query
#     for pokemon in df_by_name.index:
#         cosine_sim = wiki_vectors.n_similarity(query_tokens,df_by_name.loc[pokemon]["description"].split())
#         sims.append((cosine_sim,pokemon))
#     return sorted(sims, key=lambda item: item[0], reverse=True)[:5]
    for token in query.split():
        #### STEM WORDS HERE! ###
        additional_words = list(map(lambda item: item[0], w2v_model.most_similar(token)))
        large_query = large_query + " " + " ".join(additional_words)
    return ir_tf_idf(large_query)

### Doc2Vec

In [212]:
documents = []
for row in df.iterrows():
    doc = row[1]["description"]
    tag = row[1]["name"]
    documents.append(TaggedDocument(doc.split(),[tag]))
d2v_model = Doc2Vec(documents, vector_size=5, min_count=1)

def d2v(query):
    query_tokens = query.split()
    sims = []
    for pokemon in df["name"]:
        query_vector = d2v_model.infer_vector(query_tokens)
        cosine_sim = spatial.distance.cosine(query_vector,d2v_model.__getitem__(pokemon))
        sims.append((cosine_sim,pokemon))
    return sorted(sims, key=lambda item: item[0], reverse=True)[:5]

In [215]:
# d2v("plant")

[-0.06054189  0.09077215  0.0104624  -0.11133643 -0.02221121]
[-0.13273159 -0.09584848  0.2560783   0.01671835 -0.05762176]
[-0.03054326  0.09720871  0.05214211 -0.07725228 -0.05481282]
[ 0.23453416 -0.13638905  0.03819108 -0.22231826 -0.14177659]
[-0.00090178  0.1386183   0.02070218 -0.19845162  0.00868185]
[ 0.03562545  0.06390736  0.10500553 -0.14269988 -0.37343746]
[ 0.05136576  0.11863915 -0.00592785 -0.18150519  0.02405389]
[ 0.02706435  0.28092688 -0.01598051 -0.00414842  0.0563367 ]
[ 0.02250708  0.12565209  0.02251228 -0.15862179  0.019298  ]
[-0.11832219  0.28354767  0.34377575 -0.24331377 -0.11500413]
[-0.01968777  0.06253148 -0.01534953 -0.13951313 -0.02935868]
[ 0.19076857  0.1125676  -0.01150167 -0.03484523 -0.0769968 ]
[-0.05563398  0.1432768   0.04545065 -0.12355312 -0.04327392]
[-0.20258525 -0.0927336  -0.03974775 -0.16072433  0.09783924]
[ 0.00721763  0.09035466  0.03079967 -0.14321667 -0.0239322 ]
[ 0.10140789  0.15496874  0.08327086 -0.09283704 -0.11312065]
[-0.0917

[-0.0750689   0.09301946 -0.01151772 -0.13774191  0.0099287 ]
[-0.09528651 -0.0450396  -0.10460604 -0.2879241  -0.05301307]
[-0.00834516  0.08164871 -0.00711128 -0.1294943  -0.01312925]
[ 0.05201099  0.2375277   0.07806306 -0.10170361 -0.12242316]
[-0.02031169  0.08373587 -0.03429004 -0.17465417  0.01316683]
[ 0.02937842  0.01590386  0.17809437 -0.24527161 -0.26922727]
[ 0.00339058  0.12522846  0.04542474 -0.10446882 -0.02192915]
[-0.17610613  0.13381995  0.09358443 -0.27192265 -0.10774127]
[-0.04432364  0.07995826  0.02711401 -0.11710507 -0.04006648]
[ 0.16088887  0.3296445   0.18468976 -0.19296288 -0.08808947]
[-0.00697663  0.1077034   0.02048078 -0.11987191  0.01559271]
[-0.0374317   0.1248813   0.08636533 -0.3284323  -0.08837549]
[-0.06888083  0.04451991 -0.00555396 -0.14681108 -0.07080486]
[ 0.22777443 -0.08599891 -0.1084547  -0.18691006 -0.09506783]
[-0.01047395  0.14217627  0.01057938 -0.132438    0.00823097]
[ 0.25698477  0.23260465  0.02681059 -0.10349661 -0.09077364]
[-0.0854

[ 0.00030313  0.09773117 -0.0015329  -0.12401701 -0.00751715]
[-0.15453398 -0.12372866  0.28058332 -0.18647876 -0.18545508]
[ 0.05903442  0.18832871 -0.00603019 -0.19416556  0.09043204]
[-0.06082824  0.01175895 -0.10210908 -0.17874004 -0.142399  ]
[-0.05379738  0.111991    0.04053729 -0.15706562 -0.01323777]
[-0.16386193 -0.01080463  0.21426219  0.02207991 -0.11731394]
[ 0.00623554  0.09092301  0.02151893 -0.15888967 -0.00484624]
[-0.10561281  0.25874904  0.2431944  -0.20098777 -0.05668011]
[ 0.07695108  0.10871258  0.01671267 -0.17379825 -0.0149446 ]
[-0.09163561 -0.02554027  0.39976633 -0.2580987  -0.21566036]
[-0.06673568  0.08389915 -0.03132491 -0.1291113  -0.05458798]
[-0.03941718  0.22438756  0.10326988 -0.2353981  -0.18901777]
[ 0.00885656  0.07902638  0.01652295 -0.15746854 -0.02360553]
[ 0.01542698  0.06487979  0.20344065 -0.13550289  0.05847527]
[-0.03374371  0.07251981 -0.00645191 -0.13335128 -0.04328258]
[ 0.06445428 -0.14242752  0.17270958 -0.05812553 -0.00938223]
[ 0.0063

[-0.04191     0.09882206  0.01514359 -0.1488433  -0.03241602]
[-0.03513415  0.1605813   0.05103102  0.01769156 -0.1364875 ]
[-0.01823589  0.07766878  0.01452048 -0.16683857 -0.0140237 ]
[ 0.0630224  -0.04450569  0.03706032  0.08083323 -0.08980183]
[-0.05108434  0.05752002 -0.01587727 -0.16164322 -0.01730185]
[ 0.12949234  0.1645474   0.20731325 -0.04443543 -0.23827279]
[ 0.0374145   0.11802427  0.03657654 -0.12353788 -0.03403646]
[-0.14794172 -0.01579148  0.06256206  0.00873016  0.00093736]
[ 0.07375827  0.17924999  0.0099517  -0.18973309  0.04749256]
[-0.10407836 -0.02628927 -0.09382942  0.03944545 -0.1213477 ]
[-0.00309497  0.08998211  0.00509474 -0.14460778 -0.00284674]
[-0.1723453  -0.0737519   0.04752273  0.02106559 -0.10150553]
[-0.04645393  0.10803862 -0.01739973 -0.1220737  -0.03747376]
[ 0.01982521 -0.14548191  0.01565485 -0.031977   -0.064714  ]
[-0.00415634  0.12955002 -0.0444174  -0.13747558  0.04419839]
[-0.17366043 -0.10239033  0.04491527  0.12114326  0.03309697]
[-0.0946

[ 0.01169887  0.09391174 -0.02001182 -0.19784188 -0.00022829]
[-0.15256953 -0.1069089  -0.03609189 -0.2974788  -0.11366557]
[-0.02038533  0.08807527  0.00801994 -0.1549926  -0.04488381]
[ 0.09282998  0.2487732   0.13016467 -0.06736207 -0.22187327]
[ 0.01234806  0.11095475  0.03800569 -0.11570907 -0.04814283]
[-0.21615013 -0.05504     0.38391474 -0.26773426 -0.39013073]
[-0.05544339  0.09688403  0.05078873 -0.09161515 -0.04979139]
[ 0.04102328  0.02756446  0.06687272 -0.13101478  0.10780708]
[-0.05447548  0.13388202  0.02460908 -0.1791087   0.01393915]
[-0.07641258 -0.07542451  0.03130517 -0.00700042 -0.08704775]
[-0.03225544  0.11986359 -0.02499584 -0.19486015  0.01564908]
[ 0.02012068 -0.01285125  0.07893398  0.10877965 -0.1862383 ]
[-0.02448559  0.11523947  0.03977746 -0.11957206 -0.00988405]
[-0.08901518  0.13843758  0.10174681 -0.11824055 -0.10384557]
[-0.0148516   0.10071977 -0.00521376 -0.11479308 -0.00999816]
[ 0.0397659   0.22939605  0.11533131  0.01250322 -0.19759063]
[-0.0123

[-0.03003183  0.07890312 -0.01140016 -0.15373275 -0.0217571 ]
[-0.15212394  0.18188311 -0.05315298 -0.12405407 -0.09796496]
[-0.01873714  0.10127678 -0.01091641 -0.09809753 -0.0202332 ]
[-0.2301116  -0.0841158  -0.0511704  -0.21217519 -0.14989829]
[-0.00409093  0.153364    0.01964389 -0.13045186 -0.01161531]
[-0.03705857  0.15161285  0.14477003  0.00885862 -0.20604822]
[-0.04492114  0.0811498   0.01604919 -0.11481875 -0.0362012 ]
[-0.18800095  0.12081495  0.19063556 -0.10114045 -0.02834219]
[-0.0091142   0.11057454 -0.01719868 -0.16776496  0.00784843]
[-0.10180255  0.08782913 -0.06121143  0.0810389  -0.2410239 ]
[ 0.00279226  0.12128194  0.028262   -0.12323228 -0.02631913]
[-0.01613369  0.21583904 -0.04064878 -0.14452378 -0.24387424]
[-0.04438182  0.10610708 -0.0046464  -0.10870023 -0.03248944]
[-0.04099654  0.07464225  0.32658187 -0.1944517  -0.09105022]
[-0.00659449  0.1186973   0.00430247 -0.19456647 -0.02562561]
[ 0.01701492  0.19765355  0.22334625  0.07969752 -0.14450203]
[-0.0335

[(1.9243059754371643, 'Deoxys-Speed'),
 (1.8019468784332275, 'Suicune'),
 (1.7869475483894348, 'Oricorio-Pom-Pom'),
 (1.7832735776901245, 'Stufful'),
 (1.7775980234146118, 'Registeel')]

In [177]:
# wiki_vectors.most_similar("stone")

[('wood', 0.8393091559410095),
 ('brick', 0.8055505752563477),
 ('walls', 0.7950719594955444),
 ('carved', 0.778467059135437),
 ('stones', 0.7560802698135376),
 ('marble', 0.7539349794387817),
 ('granite', 0.7267210483551025),
 ('glass', 0.7247025966644287),
 ('roof', 0.7223367691040039),
 ('hill', 0.7204645276069641)]

In [148]:
# list(map(lambda item: item[0], wiki_vectors.most_similar(["water", "tail", "luck"])))

In [4]:
# res = model.transform([query])
# similarities = cosine_similarity(vector,res)

In [5]:
# matches = df.assign(sim=similarities.reshape(-1)).sort_values(by="sim",ascending=False)[:5]
# matches.index

In [147]:
# df["description"][0].split()