# Предобработка датасета

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv('GoodReads_100k_books.csv')
df.head()

Unnamed: 0,author,bookformat,desc,genre,img,isbn,isbn13,link,pages,rating,reviews,title,totalratings
0,Laurence M. Hauptman,Hardcover,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,002914180X,9780000000000.0,https://goodreads.com/book/show/1001053.Betwee...,0,3.52,5,Between Two Fires: American Indians in the Civ...,33
1,"Charlotte Fiell,Emmanuelle Dirix",Paperback,Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,1906863482,9780000000000.0,https://goodreads.com/book/show/10010552-fashi...,576,4.51,6,Fashion Sourcebook 1920s,41
2,Andy Anderson,Paperback,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,948984147,9780000000000.0,https://goodreads.com/book/show/1001077.Hungar...,124,4.15,2,Hungary 56,26
3,Carlotta R. Anderson,Hardcover,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,814327079,9780000000000.0,https://goodreads.com/book/show/1001079.All_Am...,324,3.83,1,All-American Anarchist: Joseph A. Labadie and ...,6
4,Jean Leveille,,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,2761920813,,https://goodreads.com/book/show/10010880-les-o...,177,4.0,1,Les oiseaux gourmands,1


Удалим ненужные колонки:

In [3]:
df = df.drop(columns=['bookformat', 'isbn', 'isbn13', 'link' ,  'pages', 'reviews', 'totalratings'])
df.head()

Unnamed: 0,author,desc,genre,img,rating,title
0,Laurence M. Hauptman,Reveals that several hundred thousand Indians ...,"History,Military History,Civil War,American Hi...",https://i.gr-assets.com/images/S/compressed.ph...,3.52,Between Two Fires: American Indians in the Civ...
1,"Charlotte Fiell,Emmanuelle Dirix",Fashion Sourcebook - 1920s is the first book i...,"Couture,Fashion,Historical,Art,Nonfiction",https://i.gr-assets.com/images/S/compressed.ph...,4.51,Fashion Sourcebook 1920s
2,Andy Anderson,The seminal history and analysis of the Hungar...,"Politics,History",https://i.gr-assets.com/images/S/compressed.ph...,4.15,Hungary 56
3,Carlotta R. Anderson,"""All-American Anarchist"" chronicles the life a...","Labor,History",https://i.gr-assets.com/images/S/compressed.ph...,3.83,All-American Anarchist: Joseph A. Labadie and ...
4,Jean Leveille,"Aujourdâ€™hui, lâ€™oiseau nous invite Ã sa ta...",,https://i.gr-assets.com/images/S/compressed.ph...,4.0,Les oiseaux gourmands


img я оставил для возможного дальнейшего улучшения (выводить рядом с результатом запроса картинку книги)

Посмотрим, есть ли пропущенные значения:

In [4]:
df.isna().sum()

author        0
desc       6772
genre     10467
img        3045
rating        0
title         1
dtype: int64

Пропущенный жанр - не критично, а вот строки с пропущенным заголовком и описанием нужно удалить.

In [5]:
df = df.dropna(how='any', subset=['desc', 'title'])
df.shape

(93228, 6)

In [6]:
df.isna().sum()

author       0
desc         0
genre     7743
img       1240
rating       0
title        0
dtype: int64

В остальных колонках Nan заменим на пустую строку:

In [7]:
df[['genre', 'img']] = df[['genre','img']].fillna(value='')
df.isna().sum()

author    0
desc      0
genre     0
img       0
rating    0
title     0
dtype: int64

In [8]:
df = df.sort_values('rating', ascending=False)

In [9]:
df_processed = df[['author', 'desc', 'genre', 'title']].copy()

In [10]:
df['title'] = df['author'] + ' "'+ df['title'] + '"'
df.head()

Unnamed: 0,author,desc,genre,img,rating,title
69580,Kwame Nkrumah,Kwame Nkrumah intended to write on the Zimbabw...,Nonfiction,https://i.gr-assets.com/images/S/compressed.ph...,5.0,"Kwame Nkrumah ""Rhodesia File"""
59983,Vic Juris,Take the ultimate jazz master class with Guita...,,https://i.gr-assets.com/images/S/compressed.ph...,5.0,"Vic Juris ""All That Jazz"""
58256,Andrea Kaitany,A collection of over 100 recipes from East Afr...,"Cultural,Africa",https://i.gr-assets.com/images/S/compressed.ph...,5.0,"Andrea Kaitany ""From the Heartland to the Hear..."
95256,Frederic Rzewski,The People United Will Never Be Defeated,,https://i.gr-assets.com/images/S/compressed.ph...,5.0,"Frederic Rzewski ""The People United Will Never..."
64531,"John Bryson,Robert Lewis",Don't face college unprepared. Be ready. Colle...,,https://i.gr-assets.com/images/S/compressed.ph...,5.0,"John Bryson,Robert Lewis ""College Ready Stude..."


В результате запроса будут выводиться два столбца из данного датасета: title и desc.
Но для реализации score я собираюсь использовать и author, и genre.

Для распараллеливания процессов я использовал pandarallel. Правда есть одна сложность, прописанная даже в документации: импорт библиотек должен быть внутри вызываемой функции, иначе не заработает.

In [None]:
!pip install pandarallel

In [11]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

def process(text):
    import string
    import nltk
    from nltk.corpus import stopwords, wordnet
    from nltk.stem.snowball import SnowballStemmer
    import re
    
    sw_eng = set(stopwords.words('english'))
    stemmer = SnowballStemmer(language='english')
    
    #remove punctuation
    for punctuation in string.punctuation:
        text = str(text).replace(punctuation, '')
        
    # tokenize
    expr = r'[^(\w.\w)\w\s]'
    parser=re.compile(expr)
    text = parser.sub(r'', text).split()
    
    # to lower
    text = [word.lower() for word in text]
    
    # delete stop-words
    text = [word for word in text if word not in sw_eng]
    
    # stemming
    return ' '.join([stemmer.stem(word) for word in text])


for column in list(df_processed.columns):
    df_processed[column] = df_processed[column].parallel_apply(process)

df_processed.head()

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23307), Label(value='0 / 23307')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23307), Label(value='0 / 23307')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23307), Label(value='0 / 23307')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=23307), Label(value='0 / 23307')))…

Unnamed: 0,author,desc,genre,title
69580,kwame nkrumah,kwame nkrumah intend write zimbabwean struggl ...,nonfict,rhodesia file
59983,vic juri,take ultim jazz master class guitar world colu...,,jazz
58256,andrea kaitani,collect 100 recip east africa along color phot...,culturalafrica,heartland heart rift east african cookbook
95256,freder rzewski,peopl unit never defeat,,peopl unit never defeat piano zenon piano librari
64531,john brysonrobert lewi,dont face colleg unprepar readi colleg readi y...,,colleg readi student guid make next great adve...


In [14]:
df_processed = df_processed.rename(columns={'author': 'author_proc', 'desc': 'desc_proc', 'genre': 'genre_proc', 'title': 'title_proc'})
df_processed.head()

Unnamed: 0,author_proc,desc_proc,genre_proc,title_proc
69580,kwame nkrumah,kwame nkrumah intend write zimbabwean struggl ...,nonfict,rhodesia file
59983,vic juri,take ultim jazz master class guitar world colu...,,jazz
58256,andrea kaitani,collect 100 recip east africa along color phot...,culturalafrica,heartland heart rift east african cookbook
95256,freder rzewski,peopl unit never defeat,,peopl unit never defeat piano zenon piano librari
64531,john brysonrobert lewi,dont face colleg unprepar readi colleg readi y...,,colleg readi student guid make next great adve...


In [19]:
search_data = pd.concat([df[['title', 'desc']], df_processed], axis=1)
search_data.head()

Unnamed: 0,title,desc,author_proc,desc_proc,genre_proc,title_proc
69580,"Kwame Nkrumah ""Rhodesia File""",Kwame Nkrumah intended to write on the Zimbabw...,kwame nkrumah,kwame nkrumah intend write zimbabwean struggl ...,nonfict,rhodesia file
59983,"Vic Juris ""All That Jazz""",Take the ultimate jazz master class with Guita...,vic juri,take ultim jazz master class guitar world colu...,,jazz
58256,"Andrea Kaitany ""From the Heartland to the Hear...",A collection of over 100 recipes from East Afr...,andrea kaitani,collect 100 recip east africa along color phot...,culturalafrica,heartland heart rift east african cookbook
95256,"Frederic Rzewski ""The People United Will Never...",The People United Will Never Be Defeated,freder rzewski,peopl unit never defeat,,peopl unit never defeat piano zenon piano librari
64531,"John Bryson,Robert Lewis ""College Ready Stude...",Don't face college unprepared. Be ready. Colle...,john brysonrobert lewi,dont face colleg unprepar readi colleg readi y...,,colleg readi student guid make next great adve...


In [20]:
search_data.shape

(93228, 6)

In [21]:
search_data.to_csv('search_data.csv', index=False)