In [1]:
import pandas as pd
from gensim.models import Word2Vec
from annoy import AnnoyIndex
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw book catalogue from disk.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact location exists.
books_csv_path = 'D:/2_course/recomendational_system/new_term_new_life_new_project/data/books_data.csv'
books_data = pd.read_csv(books_csv_path)
books_data.head()

Unnamed: 0,title,Book-Author,Year-Of-Publication,categories
0,themartianchronicles,raybradbury,1984.0,fiction
1,nonoboy,johnokada,1978.0,japanese
2,fahrenheit451,raybradbury,1976.0,bookburning
3,1sttodieanovel,jamespatterson,2001.0,fiction
4,theno1ladiesdetectiveagency,alexandermccallsmith,2002.0,botswana


In [3]:
# Keep an independent snapshot of the frame as loaded, before later cells
# overwrite columns of `books_data`.
old_books_data = books_data.copy(deep=True)

In [4]:
# Quartile boundaries of the publication year, used later to bucket books
# into 'old' / 'medium' / 'new'.
publication_year = books_data['Year-Of-Publication']
old_books = publication_year.quantile(0.25)  # lower quartile
new_books = publication_year.quantile(0.75)  # upper quartile

In [5]:
# Separate working copy used to build the Word2Vec training corpus, so that
# the edits below do not touch `books_data`.
word2vec_data = books_data.copy(deep=True)

In [6]:
# Turn the numeric publication year into a categorical 'Age' token.
# Rows whose year satisfies none of the comparisons (e.g. a missing year)
# receive no label.
year_column = word2vec_data['Year-Of-Publication']
is_old = year_column <= old_books
is_new = year_column >= new_books
is_medium = (year_column > old_books) & (year_column < new_books)

# Assignment order matters when both quartiles coincide: 'new' is written
# after 'old', so it wins for rows that match both masks.
word2vec_data.loc[is_old, 'Age'] = 'old'
word2vec_data.loc[is_new, 'Age'] = 'new'
word2vec_data.loc[is_medium, 'Age'] = 'medium'

In [7]:
# The numeric year is now represented by the 'Age' token, so remove it from
# the corpus frame. (`columns=` already selects the axis.)
word2vec_data = word2vec_data.drop(columns=['Year-Of-Publication'])
word2vec_data.head()

Unnamed: 0,title,Book-Author,categories,Age
0,themartianchronicles,raybradbury,fiction,old
1,nonoboy,johnokada,japanese,old
2,fahrenheit451,raybradbury,bookburning,old
3,1sttodieanovel,jamespatterson,fiction,new
4,theno1ladiesdetectiveagency,alexandermccallsmith,botswana,new


In [8]:
# One "sentence" per book: the list of that row's cell values, in column order.
tokens = word2vec_data.to_numpy().tolist()

In [9]:
# Dimensionality of every learned token embedding; reused by later cells.
vectorSize = 5

# Train the embeddings on the per-book token lists. `min_count=1` keeps tokens
# that occur only once.
# NOTE(review): with several worker threads the resulting vectors are
# presumably not bit-for-bit reproducible between runs - confirm against the
# gensim documentation if reproducibility matters.
embedding_model = Word2Vec(
    sentences=tokens,
    vector_size=vectorSize,
    min_count=1,
    workers=6,
)

In [10]:
# Replace each text column of `books_data` with the Word2Vec vector of its token.
#
# The tokens are read from `old_books_data` (the untouched snapshot taken right
# after loading) rather than from `books_data` itself. This keeps the cell
# idempotent: re-running it no longer tries to look up already-embedded
# vectors as if they were tokens.
for text_column in ('categories', 'Book-Author', 'title'):
    books_data[text_column] = old_books_data[text_column].apply(
        lambda token: embedding_model.wv[token]
    )

In [11]:
# Min-max scale the publication year; the fitted scaler stays available as
# `scaler`.
scaler = MinMaxScaler()
year_frame = books_data[['Year-Of-Publication']]
books_data['Year-Of-Publication'] = scaler.fit_transform(year_frame)
books_data.head()

Unnamed: 0,title,Book-Author,Year-Of-Publication,categories
0,"[-0.1589609, -0.12047824, 0.105400525, 0.20482...","[0.20753618, 0.15359612, 0.15577872, 0.1535045...",0.634921,"[0.365427, 1.7963643, 1.9628439, -1.1997832, -..."
1,"[0.15081216, -0.08867208, 0.18637246, 0.039561...","[0.035992485, -0.051977538, -0.0811907, -0.022...",0.587302,"[0.081445545, 0.03007207, -0.08950874, -0.0727..."
2,"[0.0885797, 0.12104141, 0.201732, -0.13435763,...","[0.20753618, 0.15359612, 0.15577872, 0.1535045...",0.571429,"[0.095903516, 0.011547155, 0.14294764, 0.05322..."
3,"[0.04621587, -0.100881785, 0.18313536, 0.02327...","[-0.017149108, -0.09995801, -0.007233374, -0.1...",0.769841,"[0.365427, 1.7963643, 1.9628439, -1.1997832, -..."
4,"[0.17594343, -0.09071953, 0.0020906762, -0.011...","[-0.028140398, 0.09627315, -0.008315163, -0.11...",0.777778,"[-0.012512286, 0.12762418, -0.098618105, 0.064..."


In [12]:
def func(column, vector_size, data=None):
    """Expand a column of fixed-length vectors into one scalar column per component.

    Parameters
    ----------
    column : str
        Name of the column whose cells each hold a sequence of numbers.
    vector_size : int
        Number of components per vector; it determines how many output columns
        are created and must match the actual vector length.
    data : pandas.DataFrame, optional
        Frame to read ``column`` from. Defaults to the notebook-level
        ``books_data`` frame, which preserves the original behaviour.

    Returns
    -------
    pandas.DataFrame
        A frame with the same index as ``data`` and columns named
        ``f"{column}0"`` ... ``f"{column}{vector_size - 1}"``.

    Raises
    ------
    ValueError
        If the vectors do not have exactly ``vector_size`` components.
    """
    source = books_data if data is None else data
    split_columns = [column + str(i) for i in range(vector_size)]
    # Build the frame from the list of vectors in one go; this avoids creating
    # one intermediate Series per row, as `.apply(pd.Series)` does.
    return pd.DataFrame(
        source[column].tolist(),
        index=source.index,
        columns=split_columns,
    )

In [13]:
# Assemble the numeric feature matrix for the nearest-neighbour index:
# the expanded embedding components of each text column, followed by the
# scaled publication year.
embedded_columns = ['title', 'Book-Author', 'categories']
feature_frames = [func(name, vectorSize) for name in embedded_columns]
feature_frames.append(books_data['Year-Of-Publication'])

annoy_data = pd.concat(feature_frames, axis=1)
annoy_data.head()

Unnamed: 0,title0,title1,title2,title3,title4,Book-Author0,Book-Author1,Book-Author2,Book-Author3,Book-Author4,categories0,categories1,categories2,categories3,categories4,Year-Of-Publication
0,-0.158961,-0.120478,0.105401,0.204827,0.078647,0.207536,0.153596,0.155779,0.153505,-0.244842,0.365427,1.796364,1.962844,-1.199783,-1.459167,0.634921
1,0.150812,-0.088672,0.186372,0.039562,-0.11299,0.035992,-0.051978,-0.081191,-0.022568,-0.094587,0.081446,0.030072,-0.089509,-0.07279,-0.209239,0.587302
2,0.08858,0.121041,0.201732,-0.134358,-0.137923,0.207536,0.153596,0.155779,0.153505,-0.244842,0.095904,0.011547,0.142948,0.053226,0.07669,0.571429
3,0.046216,-0.100882,0.183135,0.023275,-0.183189,-0.017149,-0.099958,-0.007233,-0.149294,0.034346,0.365427,1.796364,1.962844,-1.199783,-1.459167,0.769841
4,0.175943,-0.09072,0.002091,-0.011578,0.127756,-0.02814,0.096273,-0.008315,-0.111623,0.059503,-0.012512,0.127624,-0.098618,0.064544,-0.088841,0.777778


In [14]:
# Create the Annoy index and register one item per book.
# The vector dimension is taken from the feature matrix instead of being
# hard-coded, so it stays correct if `vectorSize` or the feature set changes.
annoy_index = AnnoyIndex(annoy_data.shape[1], 'euclidean')

# The row label is used as the Annoy item id, exactly as before; iterating
# over the underlying array avoids building a Series for every row.
for index, row in zip(annoy_data.index, annoy_data.to_numpy()):
    annoy_index.add_item(index, row.tolist())

In [15]:
# Build the search forest for the index.
N_TREES = 30
annoy_index.build(N_TREES)

True

In [16]:
# Recommend the closest books to a given item.
query_item = 2
n_recommendations = 5

# Ask for one extra neighbour because the query item is normally returned as
# its own nearest neighbour. Remove it by id rather than by position: when
# several items share the same vector, the query is not guaranteed to come
# first, and slicing off the first element could drop a genuine neighbour.
candidates = annoy_index.get_nns_by_item(query_item, n_recommendations + 1)
[item for item in candidates if item != query_item][:n_recommendations]

[15152, 8492, 4357, 13425, 13147]