In [189]:
from collections import defaultdict
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from numpy import dot
from numpy.linalg import norm
from sklearn.metrics.pairwise import cosine_similarity


class RetrievalSystem(object):
    """
    Latent-semantic-indexing (LSI) recommender for villagers.

    Each villager's attribute columns are joined into one lower-cased text
    document. The documents are TF-IDF vectorised and projected with
    TruncatedSVD; a query built from the user's preference list is projected
    the same way and compared with every villager by cosine similarity.
    """

    # Attribute columns that are concatenated into each villager's document.
    FEATURE_COLUMNS = ['Species', 'Personality', 'Hobby', 'Astrology', 'Genre',
                       'Style 1', 'Style 2', 'Color 1', 'Color 2']

    def __init__(self, villagers=None,num_topics =-1, 
                 min_df=1, user_list = None
                 ):
        """
        RetrievalSystem setup for lsi, creation of doc term matrix, and query vectors

        Args:
            villagers: DataFrame holding FEATURE_COLUMNS plus the 'Name' and
                'Overall_Popularity' columns read by retrieve_n_rank_docs.
                Required, even though the signature defaults it to None.
            num_topics: passed to TruncatedSVD as n_components. The -1 default
                is only a placeholder; callers are expected to pass a positive
                value (the notebook uses 9).
            min_df: passed through to TfidfVectorizer.
            user_list: iterable of attribute strings describing the user's
                preferences. Defaults to an empty list.

        Raises:
            ValueError: if `villagers` is None.
        """
        if villagers is None:
            raise ValueError("RetrievalSystem requires a villagers DataFrame")

        # drop() without inplace returns a new frame, so the caller's DataFrame
        # is left untouched; errors='ignore' tolerates frames that were saved
        # without the stray index column.
        self.villagers = villagers.drop(columns=['Unnamed: 0'], errors='ignore')
        # A fresh list per instance avoids the shared-mutable-default pitfall.
        self.user_list = list(user_list) if user_list is not None else []

        villagers_lines = self.villagers[self.FEATURE_COLUMNS].copy()
        villagers_lines = villagers_lines.apply(lambda x: x.astype(str).str.lower())
        documents = [" ".join(vil) for vil in villagers_lines.to_numpy()]

        # create a doc-term matrix out of our doc collection
        self.vec = TfidfVectorizer(tokenizer=str.split, min_df=min_df)
        doc_term_mat = self.vec.fit_transform(documents)
        print(doc_term_mat.shape)

        #lsi
        self.lsi = TruncatedSVD(n_components=num_topics,random_state=42,algorithm='randomized')

        # Fit SVD model on data
        self.doc_vecs = self.lsi.fit_transform(doc_term_mat) # document vectors in a matrix

        #create query vectors
        self.q_vecs = dict()

    def retrieve_n_rank_docs(self):
        """
        Retrieve and rank documents in the latent semantic (concept) space

        Villagers are ordered by cosine similarity to the user's query
        (rounded to two decimals), then by Overall_Popularity, then by Name,
        all descending.

        Returns:
            tuple: names of the two best-matching villagers, best first.
        """
        # The query depends only on user_list, so project it once rather than
        # once per villager.
        query_tfidf = self.vec.transform([" ".join(self.user_list)])
        query_vec = self.lsi.transform(query_tfidf)

        # One vectorised call scores every villager; row 0 is the single query.
        sims = np.asarray(cosine_similarity(query_vec, self.doc_vecs)[0], dtype=float)
        # Treat an undefined similarity as "no similarity".
        sims = np.where(np.isnan(sims), 0.0, sims)

        # Rows of doc_vecs line up positionally with rows of self.villagers, so
        # pair them by position instead of relying on index labels.
        scores = pd.DataFrame({
            'Name': self.villagers['Name'].to_numpy(),
            'cosine_similarity': np.round(sims, 2),
            'Overall_Popularity': self.villagers['Overall_Popularity'].to_numpy(),
        })
        ranked = scores.sort_values(
            by=['cosine_similarity', 'Overall_Popularity', 'Name'],
            ascending=False,
        ).drop_duplicates(subset='Name')  # keep each name's best-ranked row

        top_names = ranked['Name'].tolist()
        vil_1 = top_names[0]
        vil_2 = top_names[1]

        return vil_1, vil_2

    def villagers_id(self, vil_1, vil_2,villagers_id):
        """ 
        Retrieve the villagers id from the villagers_id dataframe to use to bring back images for the webpage
        results.

        Args:
            vil_1: name of the first villager.
            vil_2: name of the second villager.
            villagers_id: DataFrame with 'Name' and 'Filename' columns.

        Returns:
            tuple: ((vil_1, filename_1), (vil_2, filename_2)).

        Raises:
            IndexError: if either name has no row in `villagers_id`.
        """
        def lookup(name):
            matches = villagers_id.loc[villagers_id['Name'] == name, 'Filename'].values
            if len(matches) == 0:
                # Same exception type as indexing an empty result, but it says
                # which villager was missing.
                raise IndexError("No Filename found for villager %r" % (name,))
            return matches[0]

        vil_1_tup = (vil_1, lookup(vil_1))
        vil_2_tup = (vil_2, lookup(vil_2))

        return vil_1_tup, vil_2_tup
             


In [190]:
# Build the recommender for one example set of user preferences.
user_sim_cl = RetrievalSystem(
    villagers=pd.read_csv("villagers_final.csv"),
    num_topics=9,
    user_list=[
        'Frog', 'Big Sister', 'Fitness', 'Gemini', 'Electronic',
        'Active', 'Gorgeous', 'Green', 'Light Blue',
    ],
)

# Top two matches, best first.
villager_1, villager_2 = user_sim_cl.retrieve_n_rank_docs()
print(villager_1, villager_2)

# Pair each matched name with its image filename.
v_id1, v_id2 = user_sim_cl.villagers_id(
    vil_1=villager_1,
    vil_2=villager_2,
    villagers_id=pd.read_csv("villagers_id.csv"),
)


(391, 99)
Shari Canberra
Shari


In [191]:
v_id1

('Shari', 'mnk07')

In [192]:
v_id2

('Canberra', 'kal08')

In [None]:
import spacy
import pandas as pd
# Load spaCy's small English pipeline; a later cell calls `nlp(...)` on each
# villager's joined attribute string and the resulting `.vector` is compared.
# NOTE(review): the "sm" pipelines are documented as shipping without static word
# vectors — confirm that `.vector` from this model is meaningful for similarity ranking.
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the villager table (per the preview below: Name, nine attribute columns, popularity figures).
villagers_lines = pd.read_csv('villagers_final.csv')

In [None]:
# Drop the stray index column written by an earlier to_csv. Rebinding with
# errors='ignore' (instead of inplace=True) keeps this cell safe to re-run.
villagers_lines = villagers_lines.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
villagers_lines.head()

Unnamed: 0,Name,Species,Personality,Hobby,Astrology,Genre,Style 1,Style 2,Color 1,Color 2,Total_Google_Searches,Poll_Results,Overall_Popularity
0,Admiral,Bird,Cranky,Nature,Aquarius,Pop,Cool,Cool,Black,Blue,2417200.0,632.0,2417832.0
1,Agent S,Squirrel,Peppy,Fitness,Cancer,Pop,Active,Simple,Blue,Black,2664010.0,1225.0,2665235.0
2,Agnes,Pig,Big Sister,Play,Taurus,Electronic,Simple,Elegant,Pink,White,4731450.0,2535.0,4733985.0
3,Al,Gorilla,Lazy,Fitness,Libra,Pop,Active,Active,Red,White,1618750.0,600.0,1619350.0
4,Alfonso,Alligator,Lazy,Play,Gemini,Folk,Simple,Simple,Red,Blue,913600.0,1230.0,914830.0


In [None]:
# Keep only the nine attribute columns; copy so later edits don't touch villagers_lines.
villagers = villagers_lines.loc[
    :,
    ['Species', 'Personality', 'Hobby', 'Astrology', 'Genre',
     'Style 1', 'Style 2', 'Color 1', 'Color 2'],
].copy()

In [None]:
# Example user preferences, one value per attribute column (same order as the columns).
user_list = [
    'Frog',
    'Big Sister',
    'Fitness',
    'Gemini',
    'Electronic',
    'Active',
    'Gorgeous',
    'Green',
    'Light Blue',
]

In [None]:
# Add the user's preferences as one extra row after the real villagers.
# The label comes from the untouched source table, so re-running this cell
# overwrites that row instead of appending a duplicate.
villagers.loc[len(villagers_lines)] = user_list

In [None]:

# Normalise every attribute to a lower-case string.
villagers = villagers.astype(str).apply(lambda column: column.str.lower())

In [None]:
villagers.shape

(392, 9)

In [None]:
# Convert to a 2-D object array so each row can be joined into one text string.
villagers_arr = villagers.to_numpy()

In [None]:
villagers_arr

array([['bird', 'cranky', 'nature', ..., 'cool', 'black', 'blue'],
       ['squirrel', 'peppy', 'fitness', ..., 'simple', 'blue', 'black'],
       ['pig', 'big sister', 'play', ..., 'elegant', 'pink', 'white'],
       ...,
       ['deer', 'smug', 'music', ..., 'gorgeous', 'purple', 'gray'],
       ['octopus', 'lazy', 'nature', ..., 'cute', 'blue', 'yellow'],
       ['frog', 'big sister', 'fitness', ..., 'gorgeous', 'green',
        'light blue']], dtype=object)

In [None]:
# One spaCy Doc per row: join the row's attributes with spaces, then run the pipeline.
doc = list(map(nlp, map(" ".join, villagers_arr)))

In [None]:
doc[391].vector

array([ 7.15376914e-01,  2.57801443e-01,  2.37910077e-01,  2.41983291e-02,
       -1.68698952e-01, -3.13314199e-01, -6.07793212e-01,  5.88031895e-02,
        2.51864612e-01, -1.49235427e-02, -5.81968307e-01, -5.69856390e-02,
       -2.81857044e-01, -4.87884611e-01, -3.07918817e-01,  1.31606698e+00,
       -7.19539523e-02, -3.86790663e-01, -3.24366599e-01,  2.23372713e-01,
       -6.07005553e-04,  6.24713749e-02, -9.13527980e-02, -1.46563187e-01,
       -7.01500654e-01, -5.72738290e-01, -3.54152620e-01, -7.85638869e-01,
       -1.19634673e-01,  1.76422104e-01, -4.88285422e-01, -6.62242055e-01,
        1.21949959e+00,  7.88092837e-02,  4.25458461e-01,  2.44229406e-01,
        3.51742744e-01, -3.51985335e-01,  3.44748080e-01,  7.60566533e-01,
       -2.09170833e-01, -5.20090520e-01, -2.87748545e-01,  8.85290146e-01,
       -9.40537035e-01, -2.81647682e-01,  4.15146828e-01,  1.79919437e-01,
        5.58689296e-01,  1.33468524e-01,  7.03292608e-01, -4.57553297e-01,
        2.19251681e-02, -

In [None]:
# Axis labels for the similarity heatmap: one per row of villagers_arr, i.e. each
# villager's name followed by the appended user row. The previous `vil[:391]`
# sliced each 9-element attribute row and produced attribute arrays, not names.
labels = villagers_lines['Name'].tolist() + ['user']

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
def create_heatmap(similarity_vil, cmap = "YlGnBu", axis_labels=None):
    """
    Draw a square heatmap of a pairwise similarity matrix.

    Args:
        similarity_vil: n x n similarities, given either as a 2-D array or as
            the nested list of 1x1 arrays built by the pairwise loop.
        cmap: colour map name passed to seaborn.
        axis_labels: labels for both axes; when omitted, the notebook-level
            `labels` variable is used, as before.
    """
    if axis_labels is None:
        axis_labels = labels

    # Flatten any 1x1 cell arrays into plain floats; an object-dtype frame
    # cannot be rendered as a heatmap.
    size = len(similarity_vil)
    matrix = np.asarray(similarity_vil, dtype=float).reshape(size, size)

    df = pd.DataFrame(matrix, index=axis_labels, columns=axis_labels)
    fig, ax = plt.subplots(figsize=(15,15))
    sns.heatmap(df, cmap=cmap, ax=ax)


In [None]:
doc[1].vector

array([ 0.977058  ,  0.44990715,  0.03975629, -0.1329225 ,  0.11022175,
        0.02271874, -1.0912694 ,  0.1780339 ,  0.43552783,  0.25493795,
       -0.37409708, -0.09070725, -0.33887768, -0.6667877 , -0.07816624,
        1.2173713 ,  0.06755082, -0.04178786, -0.45575216,  0.19684765,
       -0.31484476,  0.4008498 , -0.18486498, -0.19857822, -0.84646374,
       -0.4785579 , -0.2540438 , -0.6399956 ,  0.17926341,  0.29807445,
       -0.37065473, -0.50494176,  0.6327365 , -0.04655737,  0.47187984,
        0.27250165, -0.08714153, -0.22548789,  0.27012858,  0.72291464,
       -0.11250769, -0.57227206, -0.15490495,  0.62693715, -0.6392302 ,
       -0.31190008,  0.30594134,  0.3343445 ,  0.85370857,  0.03809138,
        0.6911551 , -0.24886128, -0.24231485,  0.05493042,  0.28836942,
       -0.23083533, -0.01396599,  0.11122958,  0.15488791,  0.43094122,
       -0.70595634, -0.60723734, -0.07571481, -0.09186713,  0.8348802 ,
       -0.58291173, -0.15016101, -0.0256102 ,  0.12206352, -0.37

In [None]:
doc[2].vector

array([ 0.74621296,  0.33424112, -0.06209763, -0.06210431,  0.09150366,
        0.13659564, -0.5878782 ,  0.20205793,  0.38968155,  0.05289168,
       -0.5273622 , -0.0874822 , -0.12080572, -0.6878077 , -0.4630174 ,
        1.4330633 ,  0.04969016, -0.4582161 , -0.4514861 ,  0.06306006,
       -0.17239633,  0.44066024, -0.29291087, -0.13384835, -0.6130446 ,
       -0.5479286 , -0.3054121 , -0.72505313, -0.18396553,  0.4309306 ,
       -0.5247887 , -0.7143429 ,  0.7814385 ,  0.15229616,  0.08893213,
        0.3317351 ,  0.11206786, -0.22164838,  0.15202303,  0.89929533,
       -0.01827448, -0.31984404, -0.3959321 ,  0.7741375 , -0.80828685,
       -0.10459197,  0.35422993,  0.08982602,  0.5920146 , -0.03894204,
        0.90532225, -0.10233466, -0.34260663, -0.3314807 ,  0.4401104 ,
       -0.23661952, -0.3448888 ,  0.2712795 ,  0.09987442,  0.65707004,
       -0.5125593 , -0.8043088 , -0.44945422,  0.2821085 ,  0.5925754 ,
       -0.5819853 , -0.15844491,  0.16271092,  0.4055273 , -0.21

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of document vectors.
# Each entry is the 1x1 array returned by cosine_similarity; later cells rely on that shape.
similarity_vil = [
    [
        cosine_similarity(left.vector.reshape(1, -1), right.vector.reshape(1, -1))
        for right in doc
    ]
    for left in doc
]
# create_heatmap(similarity_vil)

In [None]:
similarity_vil

[[array([[0.99999994]], dtype=float32),
  array([[0.8585658]], dtype=float32),
  array([[0.875821]], dtype=float32),
  array([[0.81510365]], dtype=float32),
  array([[0.868052]], dtype=float32),
  array([[0.8943597]], dtype=float32),
  array([[0.9142455]], dtype=float32),
  array([[0.9015499]], dtype=float32),
  array([[0.90633905]], dtype=float32),
  array([[0.8228353]], dtype=float32),
  array([[0.9323071]], dtype=float32),
  array([[0.8525325]], dtype=float32),
  array([[0.886546]], dtype=float32),
  array([[0.917117]], dtype=float32),
  array([[0.8755621]], dtype=float32),
  array([[0.8285066]], dtype=float32),
  array([[0.9091485]], dtype=float32),
  array([[0.779156]], dtype=float32),
  array([[0.8661715]], dtype=float32),
  array([[0.8375193]], dtype=float32),
  array([[0.8634145]], dtype=float32),
  array([[0.8929763]], dtype=float32),
  array([[0.8916546]], dtype=float32),
  array([[0.8649292]], dtype=float32),
  array([[0.82242304]], dtype=float32),
  array([[0.8281456]], dty

In [None]:
similarity_vil[-1]

[array([[0.9132124]], dtype=float32),
 array([[0.89654136]], dtype=float32),
 array([[0.91434747]], dtype=float32),
 array([[0.8883427]], dtype=float32),
 array([[0.9096771]], dtype=float32),
 array([[0.89380634]], dtype=float32),
 array([[0.9024841]], dtype=float32),
 array([[0.9111577]], dtype=float32),
 array([[0.891306]], dtype=float32),
 array([[0.8482353]], dtype=float32),
 array([[0.88382655]], dtype=float32),
 array([[0.8440897]], dtype=float32),
 array([[0.9259982]], dtype=float32),
 array([[0.9108801]], dtype=float32),
 array([[0.8871995]], dtype=float32),
 array([[0.8313035]], dtype=float32),
 array([[0.8652753]], dtype=float32),
 array([[0.7691159]], dtype=float32),
 array([[0.88619024]], dtype=float32),
 array([[0.88273084]], dtype=float32),
 array([[0.82437456]], dtype=float32),
 array([[0.86789846]], dtype=float32),
 array([[0.916168]], dtype=float32),
 array([[0.8999959]], dtype=float32),
 array([[0.8491949]], dtype=float32),
 array([[0.8617533]], dtype=float32),
 array

In [None]:
# Villager names in table order; these become the similarity matrix labels.
vil_user = villagers_lines['Name'].tolist()

In [None]:
# Label for the extra user row. Guarded so that re-running this cell does not
# append a second 'user' entry.
if vil_user[-1:] != ['user']:
    vil_user.append('user')

In [None]:
# Label rows and columns with the villager names plus the trailing 'user' entry.
# Each cell still holds the 1x1 array produced by cosine_similarity.
similarity_vil_df = pd.DataFrame(similarity_vil,columns = vil_user, index = vil_user)

In [None]:
# The last row is the one labelled 'user': its similarity to every column (including itself).
user_row = similarity_vil_df.tail(1).values.tolist()[0]

In [None]:
# Unwrap each 1x1 similarity array into a plain float. Going through the scalar
# element avoids calling float() on an ndarray, which recent NumPy deprecates.
user_row_flt = [float(np.ravel(user)[0]) for user in user_row]

In [None]:
# Column labels, in the same order as the values in user_row.
names_vil_vol = similarity_vil_df.columns.tolist()

In [None]:
# (name, similarity-to-user) pairs.
zip_val_name = list(zip(names_vil_vol,user_row_flt))

In [None]:
# Tabulate the pairs so they can be merged with the villager id table on 'Name'.
df_zip_val_name = pd.DataFrame(zip_val_name,columns = ['Name','Similarity'])


In [None]:
# Load the name -> image filename lookup and discard the stray saved index column.
vil_id = pd.read_csv('villagers_id.csv').drop(columns=['Unnamed: 0'])

In [None]:
# Attach id columns by name. This is an inner join (the default), so names that
# are missing from vil_id are dropped.
df_val_name_id = df_zip_val_name.merge(vil_id, on='Name')

In [None]:
# Show villagers ordered from most to least similar to the user (display only; not stored).
df_val_name_id.sort_values(by = 'Similarity',ascending = False)

Unnamed: 0,Name,Similarity,Filename,Unique Entry ID
349,Sylvia,0.955598,kgr06,PZsBTkeYBg5ACE5xB
137,Flo,0.952080,pgn13,6cB5T5yZtevY6d2jp
245,Muffy,0.952037,shp12,BMbfDZTTqMr8T4Jvt
303,Ribbot,0.951906,flg01,vg2BFTmhyyhbiHtbC
387,Wolfgang,0.950480,wol02,RbF2wcn6jRxtgLDRd
...,...,...,...,...
352,Tad,0.764335,flg09,v74Hs9XT2zRC6gDjE
30,Beau,0.752997,der07,zbkkW8wWbmyGAZauv
51,Boone,0.751461,gor02,ZgTh9dkb3k36EPZ4o
104,Cyrano,0.743705,ant00,JaNDeYdAuiJCYZbAx


In [None]:
# max_list = max(user_row)

In [None]:
# max_list

In [None]:
# user_row.index(sorted(user_row)[-3])

In [None]:
# villagers_lines.loc[349,:]

In [None]:
# villagers_lines.loc[137,:]

In [None]:
# villagers_lines.loc[326,:]