In [345]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

import re

# English stop words as a set; nlp_preprocessing tests each token against it.
stop_words = set(stopwords.words('english'))

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# which also hides pandas warnings about chained assignment — confirm that
# is acceptable before relying on it.
warnings.filterwarnings('ignore')


from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import pairwise_distances

from scipy.sparse import hstack


# Show full cell contents instead of truncating long text columns.
pd.set_option('display.max_colwidth', None)

[nltk_data] Downloading package stopwords to /home/karol/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [452]:
def convert_to_string(row, column):
    """Return the value stored under ``column`` in ``row`` as one string.

    A list value is flattened by joining its items with single spaces.
    Any other value (a string, a missing value, ...) is returned unchanged.
    """
    value = row[column]
    return ' '.join(value) if isinstance(value, list) else value


def nlp_preprocessing(total_text, index, column, dataframe):
    """Normalise one text cell and write the result back into ``dataframe``.

    Every whitespace-separated token of ``total_text`` is stripped of
    non-alphanumeric characters and lower-cased. Tokens that are in the
    module-level ``stop_words`` set, or that are empty after stripping
    (e.g. a lone "/"), are dropped. The kept tokens are stored at
    ``dataframe.loc[index, column]``, each followed by a single space.
    If ``total_text`` is not a string (e.g. a missing value), an empty
    string is stored instead.

    Parameters
    ----------
    total_text : object
        Raw cell content; only ``str`` values are tokenised.
    index : hashable
        Row label of the cell to overwrite.
    column : str
        Column label of the cell to overwrite.
    dataframe : pandas.DataFrame
        Frame that is modified in place.

    Returns
    -------
    None
    """
    if not isinstance(total_text, str):
        dataframe.loc[index, column] = ""
        return

    kept_tokens = []
    for raw_token in total_text.split():
        # Remove special characters such as '"#$@!%^&*()_+-~?>< and lower-case.
        token = "".join(ch for ch in raw_token if ch.isalnum()).lower()
        # Skip stop words, and tokens that consisted only of special characters.
        if token and token not in stop_words:
            kept_tokens.append(token)

    # A single .loc write targets the frame itself. The chained form
    # dataframe[column][index] = ... may assign to a temporary copy instead.
    dataframe.loc[index, column] = "".join(token + " " for token in kept_tokens)
        
        

def contains(sublist, item):
    """Return True if ``item`` occurs inside any element of ``sublist``.

    The comparison is a case-insensitive, literal substring test. ``item`` is
    never interpreted as a regular expression, so characters such as
    ``.``, ``(`` or ``+`` match only themselves.

    Parameters
    ----------
    sublist : iterable of str
        Strings to search in.
    item : str
        Text to look for.

    Returns
    -------
    bool
        True on the first element containing ``item``, otherwise False
        (including when ``sublist`` is empty).
    """
    needle = item.lower()
    return any(needle in entry.lower() for entry in sublist)
        
    
    
# Country name -> names associated with that country (modern and historical).
# NOTE(review): some lists repeat a name (e.g. 'crete' under 'greece',
# 'belgium' under 'belgium'); the repeats survive the join below — confirm
# whether that is intended.
all_names = {
    'iran': ['iran', 'persia', 'persepolis', 'pasargad', 'elam', 'media'],
    'iraq': ['iraq', 'mesopotamia', 'sumer', 'akkad', 'babylon', 'assyr', 'parthi', 'sassanian'],
    'united kingdom': ['england', 'ireland', 'scotland', 'wales', 'london'],
    'pakistan': ['pakistan', 'indus', 'mohendsch', 'mohenj'],
    'greece': [
        'greece', 'achaea', 'aeolis', 'arcadia', 'boeotia', 'chalcidice',
        'crete', 'cyprus', 'cyzicus', 'delphi', 'dodona', 'euboea',
        'epirus', 'etolia', 'heracleia', 'ionia', 'laconia', 'lesbos',
        'lydia', 'macedonia', 'megaris', 'messinia', 'mycenae', 'olbia',
        'peloponnese', 'phocis', 'phoenicia', 'thebes', 'thessaly', 'crete',
    ],
    'albania': ['albania', 'dardania', 'ancient epirus'],
    'algeria': ['algeria', 'numidia', 'roman province of mauretania'],
    'angola': ['angola', 'kingdom of kongo'],
    'armenia': ['armenia', 'urartu', 'arsacid empire'],
    'austria': ['austria', 'ostmark', 'roman province of noricum'],
    'belarus': ['belarus', 'white rus', 'slavic settlements'],
    'belgium': ['belgium', 'belgium', 'roman province of gallia belgica', 'habsburg netherlands'],
    'bosnia and herzegovina': ['bosnia and herzegovina', 'bosna', 'hum'],
    'bulgaria': ['bulgaria', 'thrace', 'odysian kingdom'],
    'croatia': ['croatia', 'panonia', 'illyria'],
    'cyprus': ['cyprus', 'cypriot civilization', 'minoan settlements'],
    'czechia': ['czech republic', 'czech lands', 'great moravian empire', 'bohemian kingdom'],
    'denmark': ['denmark', 'denmark', 'vikings', 'viking age'],
    'finland': ['finland', 'finland', 'samoyede'],
    'france': ['france', 'gaul', 'celtic tribes', 'roman province of gaul'],
    'georgia': ['georgia', 'iberia', 'colchis'],
    'germany': ['germany', 'teutonic tribes', 'holy roman empire'],
    'hungary': ['hungary', 'hungary', 'avar khaganate'],
    'iceland': ['iceland', 'norse settlers', 'viking age'],
    'ireland': ['ireland', 'ireland', 'celtic tribes'],
    'italy': ['italy', 'latium', 'etruria', 'ausonia', 'enotria', 'roma', 'rome'],
    'kazakhstan': ['kazakhstan', 'saka tribes', 'khazar khanate'],
    'kosovo': ['kosovo', 'kosovo', 'serbian empire'],
    'latvia': ['latvia', 'latvia', 'baltic tribes'],
    'lithuania': ['lithuania', 'lithuania', 'baltic tribes'],
    'luxembourg': ['luxembourg', 'grand duchy of luxembourg'],
    'macedonia': ['macedonia', 'aegae', 'eordaia', 'upper macedonia', 'chalcidice'],
    'north macedonia': ['macedonia', 'aegae', 'eordaia', 'upper macedonia', 'chalcidice'],
    'moldova': ['moldova', 'dacia', 'roman province of dacia'],
    'morocco': ['morocco', 'berber kingdoms'],
    'netherlands': ['netherlands', 'low countries', 'frankish empire', 'dutch republic'],
    'poland': ['poland', 'vistula river trade routes', 'polish-lithuanian commonwealth'],
    'portugal': ['portugal', 'lusitanian tribes', 'roman province of lusitania'],
    'romania': ['romania', 'romania', 'dacia', 'roman province of dacia'],
    'russia': ['russia', 'scythia', 'sarmatians', 'khazar khanate'],
    'serbia': ['serbia', 'serbia', 'serbian empire'],
    'slovakia': ['slovakia', 'slovak lands', 'principality of nitra', 'great moravian empire'],
    'slovenia': ['slovenia', 'slovenian lands', 'slovenia'],
    'spain': ['spain', 'hispania', 'iberian peninsula', 'tartessian civilization', 'numidians', 'roman province of hispanial'],
    'switzerland': ['switzerland', 'helvetian confederacy', 'roman province of helvetia'],
    'turkey': ['turkey', 'anatolia', 'hittite empire', 'hattu', 'phrygian kingdom', 'phrygia', 'ancient greek colonies'],
    'ukraine': ['ukraine', 'kievan rus'],
}

# Flatten every alias list into one space-separated string, keeping key order.
all_names = {country: ' '.join(aliases) for country, aliases in all_names.items()}
    
    
def collapse_column(column):
    """Return 1 when at least one element of ``column`` is truthy, else 0."""
    return int(any(column))

In [454]:
# Load the full book list and normalise its three keyword columns.
key_columns = ('subject_key', 'place_key', 'person_key')

df = pd.read_json('data/downloaded.json')

# Flatten list-valued cells into one space-separated string per row.
for key_column in key_columns:
    df[key_column] = df.apply(convert_to_string, args=(key_column,), axis=1)

# Clean each column cell by cell; nlp_preprocessing writes the result into df.
for key_column in key_columns:
    for index, row in df.iterrows():
        nlp_preprocessing(row[key_column], index, key_column, df)

df = df.reset_index(drop=True)

# Spot-check the row at position 208.
df.iloc[[208]]

Unnamed: 0,title,author_name,first_publish_year,last_publish_year,place,place_key,subject,subject_key,person,person_key,language,isbn,key,url
208,Persians,Lloyd Llewellyn-Jones,2022,2022,"[Babylon, Egypt, Pasargadae, Persepolis]",babylon egypt pasargadae persepolis,"[Persia, ancient history, Asian / Middle Eastern history]",ancienthistory asianmiddleeasternhistory persia,"[Alexander, Artaxerxes, Cambyses, Cyrus, Darius, Herodotus, Xerxes]",alexander artaxerxes cambyses cyrus darius herodotus xerxes,[eng],9781541604230,/works/OL25338087W,https://openlibrary.org/works/OL25338087W


In [455]:
#df[df['person'].fillna('').apply(contains, item='alexa')]


In [456]:
#all_subjects=''
#for i in df['subject_key']:
#    all_subjects+=i

In [457]:
# Load the selected favourite books and normalise the same keyword columns.
df_fav = pd.read_csv('data/my_favs.csv').set_index('Unnamed: 0')
df_fav.index.name = None

fav_key_columns = ('subject_key', 'place_key', 'person_key')

# Turn commas in the key columns into spaces so the text splits on whitespace.
for key_column in fav_key_columns:
    df_fav[key_column] = df_fav[key_column].str.replace(",", " ")

# Clean each column cell by cell; nlp_preprocessing writes the result into df_fav.
for key_column in fav_key_columns:
    for index, row in df_fav.iterrows():
        nlp_preprocessing(row[key_column], index, key_column, df_fav)

df_fav.head(1)
    




Unnamed: 0,title,author_name,first_publish_year,last_publish_year,place,place_key,subject,subject_key,person,person_key,language,isbn,key,url,favorite
208,Persians,Lloyd Llewellyn-Jones,2022,2022,"Babylon,Egypt,Pasargadae,Persepolis",babylon egypt pasargadae persepolis,"Persia,ancient history,Asian / Middle Eastern history",ancienthistory asianmiddleeasternhistory persia,"Alexander,Artaxerxes,Cambyses,Cyrus,Darius,Herodotus,Xerxes",alexander artaxerxes cambyses cyrus darius herodotus xerxes,eng,9781541604230,/works/OL25338087W,https://openlibrary.org/works/OL25338087W,True


In [458]:

# Display the row at position 208 of df again, before the favourites are filtered out.
df.iloc[[208]]

Unnamed: 0,title,author_name,first_publish_year,last_publish_year,place,place_key,subject,subject_key,person,person_key,language,isbn,key,url
208,Persians,Lloyd Llewellyn-Jones,2022,2022,"[Babylon, Egypt, Pasargadae, Persepolis]",babylon egypt pasargadae persepolis,"[Persia, ancient history, Asian / Middle Eastern history]",ancienthistory asianmiddleeasternhistory persia,"[Alexander, Artaxerxes, Cambyses, Cyrus, Darius, Herodotus, Xerxes]",alexander artaxerxes cambyses cyrus darius herodotus xerxes,[eng],9781541604230,/works/OL25338087W,https://openlibrary.org/works/OL25338087W


In [459]:
# Keep only candidate books: drop every row whose index label also appears in
# df_fav. The .copy() makes the result an independent frame, so the column
# assignments performed on df afterwards do not act on a slice of the
# unfiltered frame.
df = df[~df.index.isin(df_fav.index)].copy()

In [460]:

# Same positional lookup after filtering: position 208 now holds a different
# row, because iloc counts positions rather than index labels.
df.iloc[[208]]

Unnamed: 0,title,author_name,first_publish_year,last_publish_year,place,place_key,subject,subject_key,person,person_key,language,isbn,key,url
209,Time in Antiquity,Robert Hannah,2000,2009,,,"[Ancient Civilization, History, Social aspects of Time, Time, Time measurements, Classical Civilization, Temps, Histoire, Aspect social, Mesure, Civilisation ancienne, SCIENCE, Social aspects]",ancientcivilization aspectsocial civilisationancienne classicalcivilization histoire history mesure science socialaspects socialaspectsoftime temps time timemeasurements,,,[eng],1134323131,/works/OL8371761W,https://openlibrary.org/works/OL8371761W


In [461]:
# Inspect place_key of the candidate books before the country names are expanded.
df['place_key']

0                       greece 
1                              
2       greece rome romeempire 
3                              
4                       russia 
                 ...           
4896                     texas 
4897                           
4898                           
4899                           
4900                           
Name: place_key, Length: 4894, dtype: object

In [462]:
# Replace each country name found in place_key with that country's
# space-joined string from all_names.
# NOTE(review): with regex=True the dict keys are used as regular-expression
# patterns, so a key can also match inside a longer token — confirm that
# substring matching is intended.
df['place_key']=df['place_key'].replace(all_names, regex=True)

In [463]:
# Inspect place_key of the candidate books after the expansion.
df['place_key']

0                       greece achaea aeolis arcadia boeotia chalcidice crete cyprus cyzicus delphi dodona euboea epirus etolia heracleia ionia laconia lesbos lydia macedonia megaris messinia mycenae olbia peloponnese phocis phoenicia thebes thessaly crete 
1                                                                                                                                                                                                                                                                
2       greece achaea aeolis arcadia boeotia chalcidice crete cyprus cyzicus delphi dodona euboea epirus etolia heracleia ionia laconia lesbos lydia macedonia megaris messinia mycenae olbia peloponnese phocis phoenicia thebes thessaly crete rome romeempire 
3                                                                                                                                                                                                                                 

In [464]:
# Inspect place_key of the favourite books before the country names are expanded.
df_fav['place_key']

208     babylon egypt pasargadae persepolis 
866                                    iraq 
1560                                   iraq 
1673                                   iran 
1689                                   iran 
1967                                   iran 
2086                              iran rome 
Name: place_key, dtype: object

In [465]:
# Apply the same country-name expansion to the favourite books.
# NOTE(review): only the dict keys (country names) are looked for, so a
# place_key that lists just historical names and no country name is left
# unchanged — confirm that is intended.
df_fav['place_key']=df_fav['place_key'].replace(all_names, regex=True)

In [466]:
# Inspect place_key of the favourite books after the expansion.
df_fav['place_key']

208                             babylon egypt pasargadae persepolis 
866     iraq mesopotamia sumer akkad babylon assyr parthi sassanian 
1560    iraq mesopotamia sumer akkad babylon assyr parthi sassanian 
1673                     iran persia persepolis pasargad elam media 
1689                     iran persia persepolis pasargad elam media 
1967                     iran persia persepolis pasargad elam media 
2086                iran persia persepolis pasargad elam media rome 
Name: place_key, dtype: object

# Create Bag of Words

In [467]:
# Fit one bag-of-words vocabulary per keyword column on the candidate books
# and print the shape of each resulting count matrix.
subject_key_vectorizer = CountVectorizer()
subject_key_features = subject_key_vectorizer.fit_transform(df['subject_key'])
print(subject_key_features.shape)

place_key_vectorizer = CountVectorizer()
place_key_features = place_key_vectorizer.fit_transform(df['place_key'])
print(place_key_features.shape)

person_key_vectorizer = CountVectorizer()
person_key_features = person_key_vectorizer.fit_transform(df['person_key'])
print(person_key_features.shape)

# Stack the three matrices side by side (subject, place, person) as CSR.
candidate_blocks = (subject_key_features, place_key_features, person_key_features)
all_features_df = hstack(candidate_blocks).tocsr()
print(all_features_df.shape)

(4894, 9871)
(4894, 624)
(4894, 401)
(4894, 10896)


In [468]:
# Project the favourite books onto the vocabularies fitted on the candidates.
# Note: this rebinds subject_key_features, place_key_features and
# person_key_features, which previously held the candidate matrices.
subject_key_features = subject_key_vectorizer.transform(df_fav['subject_key'])
print(subject_key_features.shape)

place_key_features = place_key_vectorizer.transform(df_fav['place_key'])
print(place_key_features.shape)

person_key_features = person_key_vectorizer.transform(df_fav['person_key'])
print(person_key_features.shape)

# Same column order as all_features_df: subject, place, person.
favourite_blocks = (subject_key_features, place_key_features, person_key_features)
all_features_df_fav = hstack(favourite_blocks).tocsr()
print(all_features_df_fav.shape)

(7, 9871)
(7, 624)
(7, 401)
(7, 10896)


In [469]:
# Row and column positions of the non-zero entries in the favourites matrix.
all_features_df_fav.nonzero()

(array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int32),
 array([  373,   896,  9944, 10052, 10318, 10682,   410,   524,  1103,
         1879,  4202,  4370,  4824,  5973,  8557,  8566,  9886,  9928,
         9944, 10143, 10240, 10312, 10368, 10406,   329,   886,  1879,
         4202,  4820,  4824,  9886,  9928,  9944, 10143, 10240, 10312,
        10368, 10406,   373,  4202, 10054, 10142, 10234, 10313, 10318,
        10319, 10499,  1879,  4134,  4202,  4211, 10054, 10142, 10234,
        10313, 10318, 10319,   320,   749,  1527,  4202,  4812, 10054,
        10142, 10234, 10313, 10318, 10319,  4202,  4217,  4811,  7259,
         8207, 10054, 10142, 10234, 10313, 10318, 10319, 10351, 10551],
       dtype=int32))

In [470]:

# Collapse the favourites matrix into a single presence row: 1 where any
# favourite has a non-zero count in that feature column, 0 otherwise.
# A vectorised reduction replaces the per-column Python loop.
collapsed_matrix = (all_features_df_fav.toarray() != 0).any(axis=0).astype(int)[np.newaxis, :]

# Spot-check a five-column window of the collapsed row.
print(collapsed_matrix[:,371:376])


[[0 0 1 0 0]]


## To use several input books at once, create a new row in df that contains the keywords of all df_fav books together, then build a new bag of words

## Or create a matrix from df_fav and combine all rows together

In [471]:
# Sanity check: index labels that df still shares with df_fav (expected to be empty).
df[df.index.isin(df_fav.index)].index

Index([], dtype='int64')

In [472]:
def bag_of_words_model(selection, num_results, metric='euclidean'):
    """Print the books in ``df`` whose bag-of-words vectors are closest to ``selection``.

    Distances are computed with ``sklearn.metrics.pairwise_distances``
    between every row of the module-level ``all_features_df`` matrix and
    ``selection``. The rows of ``all_features_df`` are taken to be in the
    same order as the rows of ``df``.

    Parameters
    ----------
    selection : array-like of shape (n_queries, n_features)
        Query vector(s) in the same feature space as ``all_features_df``.
        If several rows are passed, each book is scored by its distance to
        the nearest query row.
    num_results : int
        Number of closest books to print.
    metric : str or callable, default 'euclidean'
        Forwarded to ``pairwise_distances``. The default is the metric this
        function has always used; pass ``'cosine'`` for cosine distance.

    Returns
    -------
    None
        The results are printed, smallest distance first.
    """
    # Shape (n_books, n_queries): distance from every book to every query row.
    pairwise_dist = pairwise_distances(all_features_df, selection, metric=metric)

    # One score per book. For a single query row this equals the flattened
    # matrix; for several rows it keeps positions aligned with df, which
    # flattening would not.
    distances = pairwise_dist.min(axis=1)

    # Positions of the num_results smallest distances, and those distances.
    indices = np.argsort(distances)[0:num_results]
    pdists = distances[indices]

    for position, distance in zip(indices, pdists):
        print('TITLE :', df['title'].iloc[position])
        print('AUTHOR:', df['author_name'].iloc[position])
        # The value is a distance (smaller means closer), not a similarity.
        print('Distance to the selection ({}) :'.format(metric), distance)

        print('='*60)

# Print the 10 books closest to the collapsed favourites vector.
bag_of_words_model(collapsed_matrix, 10) # change the second argument to print more or fewer results

TITLE : Buyid Coinage
AUTHOR: Luke Treadwell
Euclidean similarity with the query image : 5.744562646538029
TITLE : Archaeological perspectives on houses and households in third millennium Mesopotamian society
AUTHOR: Alessandra Salvin
Euclidean similarity with the query image : 5.744562646538029
TITLE : Arsacids and Sasanians
AUTHOR: M. Rahim Shayegan
Euclidean similarity with the query image : 5.744562646538029
TITLE : From Mesopotamia to Iraq
AUTHOR: Hans Jörg Nissen
Euclidean similarity with the query image : 5.830951894845301
TITLE : A survey of Persian art from prehistoric times to the present
AUTHOR: Abbas Daneshvari
Euclidean similarity with the query image : 5.916079783099616
TITLE : Engraved on Stone
AUTHOR: Rony Feingold
Euclidean similarity with the query image : 5.916079783099616
TITLE : Iraq
AUTHOR: Michael Wood
Euclidean similarity with the query image : 6.0
TITLE : Rituals of war
AUTHOR: Zainab Bahrani
Euclidean similarity with the query image : 6.0
TITLE : Intangible s

In [473]:
# Display the favourite books that formed the query, for comparison with the results.
df_fav

Unnamed: 0,title,author_name,first_publish_year,last_publish_year,place,place_key,subject,subject_key,person,person_key,language,isbn,key,url,favorite
208,Persians,Lloyd Llewellyn-Jones,2022,2022,"Babylon,Egypt,Pasargadae,Persepolis",babylon egypt pasargadae persepolis,"Persia,ancient history,Asian / Middle Eastern history",ancienthistory asianmiddleeasternhistory persia,"Alexander,Artaxerxes,Cambyses,Cyrus,Darius,Herodotus,Xerxes",alexander artaxerxes cambyses cyrus darius herodotus xerxes,eng,9781541604230,/works/OL25338087W,https://openlibrary.org/works/OL25338087W,True
866,Everyday Life in Ancient Mesopotamia,Jean Bottéro,2001,2001,Iraq,iraq mesopotamia sumer akkad babylon assyr parthi sassanian,"Antiquities,Civilization,History,Ancient Near East,Social history,Mesopotamia,BCE to c 500 CE,History: World,Social life and customs,Iraq, history, to 634",ancientneareast antiquities bcetoc500ce civilization history historyworld iraqhistoryto634 mesopotamia socialhistory sociallifeandcustoms,,,eng,9780801868627,/works/OL8396695W,https://openlibrary.org/works/OL8396695W,True
1560,Mesopotamia,Gwendolyn Leick,2001,2002,Iraq,iraq mesopotamia sumer akkad babylon assyr parthi sassanian,"Civilization,Ancient Cities and towns,Iraq, history, to 634,Asia, history,Iraq, antiquities,History",ancientcitiesandtowns asiahistory civilization history iraqantiquities iraqhistoryto634,,,"eng,spa",9780140265743,/works/OL3961855W,https://openlibrary.org/works/OL3961855W,True
1673,Excavating an empire,Touraj Daryaee,2014,2014,Iran,iran persia persepolis pasargad elam media,"History,Ancient History",ancienthistory history,Achaemenid dynasty (559-330 B.C),achaemeniddynasty559330bc,eng,9781568592985,/works/OL23318927W,https://openlibrary.org/works/OL23318927W,True
1689,The Persian empire,"Allen, Lindsay.",2005,2005,Iran,iran persia persepolis pasargad elam media,"Civilization,History,History, ancient,Perzische rijk,Histoire",civilization histoire history historyancient perzischerijk,,,eng,9780226014470,/works/OL5844839W,https://openlibrary.org/works/OL5844839W,True
1967,Persian art,Vladimir Loukonine,1998,2012,Iran,iran persia persepolis pasargad elam media,"Ancient Art,Iranian Art,History,Catalogs,Art",ancientart art catalogs history iranianart,,,eng,9781859951675,/works/OL20325135W,https://openlibrary.org/works/OL20325135W,True
2086,Arsacids and Sasanians,M. Rahim Shayegan,2011,2011,"Rome,Iran",iran persia persepolis pasargad elam media rome,"History,Sassanids,HISTORY / Ancient / General,Political science,Iran, history, to 640",history historyancientgeneral iranhistoryto640 politicalscience sassanids,Arsacid dynasty (247 B.C.-224 A.D),arsaciddynasty247bc224ad,eng,9780521766418,/works/OL16414417W,https://openlibrary.org/works/OL16414417W,True
