In [1]:
import pandas as pd

<font size="3">Here, we want to generate some string features of a book.
We will later use the string associated with each book to process the 
user query string. In particular, we will measure the similarity between 
the string associated with a book and the user query string. The similarity is based 
on tf-idf in NLP</font>

In [2]:
ratings_df = pd.read_csv('data/ratings.csv')
to_read_df = pd.read_csv('data/to_read.csv')
books_df = pd.read_csv('data/books.csv')
tags_df = pd.read_csv('data/tags.csv')
book_tags_df = pd.read_csv('data/book_tags.csv')

In [3]:
book_tags_df = book_tags_df.merge(tags_df, on = 'tag_id')
book_tags_df = book_tags_df.merge(books_df[['book_id','goodreads_book_id']], on = 'goodreads_book_id')
book_tags_df.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name,book_id
0,1,30574,167697,to-read,27
1,1,11305,37174,fantasy,27
2,1,11557,34173,favorites,27
3,1,8717,12986,currently-reading,27
4,1,33114,12716,young-adult,27


In [4]:
books_string_feature_df = books_df[['book_id','authors','original_title','title','language_code']]
books_string_feature_df.head()

Unnamed: 0,book_id,authors,original_title,title,language_code
0,1,Suzanne Collins,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng
2,3,Stephenie Meyer,Twilight,"Twilight (Twilight, #1)",en-US
3,4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,eng
4,5,F. Scott Fitzgerald,The Great Gatsby,The Great Gatsby,eng


In [5]:
books_string_feature_df.tail()

Unnamed: 0,book_id,authors,original_title,title,language_code
9995,9996,Ilona Andrews,Bayou Moon,"Bayou Moon (The Edge, #2)",eng
9996,9997,Robert A. Caro,Means of Ascent,"Means of Ascent (The Years of Lyndon Johnson, #2)",eng
9997,9998,Patrick O'Brian,The Mauritius Command,The Mauritius Command,eng
9998,9999,Peggy Orenstein,Cinderella Ate My Daughter: Dispatches from th...,Cinderella Ate My Daughter: Dispatches from th...,eng
9999,10000,John Keegan,The First World War,The First World War,


In [6]:
all_tags_list_of_lists = []
for book_id in range(1,10001):
    all_tags = book_tags_df[book_tags_df['book_id'] == book_id].tag_name.tolist()
    all_tags_list_of_lists.append(all_tags)

In [7]:
books_string_feature_df['all_tags'] = all_tags_list_of_lists

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books_string_feature_df['all_tags'] = all_tags_list_of_lists


In [8]:
books_string_feature_df.head()

Unnamed: 0,book_id,authors,original_title,title,language_code,all_tags
0,1,Suzanne Collins,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,"[to-read, fantasy, favorites, currently-readin..."
1,2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,"[to-read, fantasy, favorites, currently-readin..."
2,3,Stephenie Meyer,Twilight,"Twilight (Twilight, #1)",en-US,"[to-read, fantasy, favorites, currently-readin..."
3,4,Harper Lee,To Kill a Mockingbird,To Kill a Mockingbird,eng,"[to-read, favorites, currently-reading, young-..."
4,5,F. Scott Fitzgerald,The Great Gatsby,The Great Gatsby,eng,"[to-read, favorites, currently-reading, young-..."


In [9]:
#an example of what book string feature might be

string_features = ""

for entry in books_string_feature_df.iloc[0].tolist()[1:]:
    if type(entry) == str:
        string_features += entry
    elif type(entry) == list:
        string_features += ' '.join(entry)

print(string_features)

Suzanne CollinsThe Hunger GamesThe Hunger Games (The Hunger Games, #1)engto-read fantasy favorites currently-reading young-adult fiction books-i-own owned ya series favourites re-read adventure sci-fi-fantasy all-time-favorites default my-books reread i-own audiobook 5-stars favorite-books novels fantasy-sci-fi favorite audiobooks read-more-than-once my-library ya-fantasy teen english books ya-fiction my-favorites own-it library audio young-adult-fiction novel scifi-fantasy faves favorite-series shelfari-favorites kindle romance favourite to-buy read-in-2014 ebook contemporary 5-star coming-of-age favourite-books favs action read-in-2013 read-in-2011 finished ya-books borrowed sci-fi ya-lit loved love thriller science-fiction finished-series action-adventure scifi sf book-club speculative-fiction ebooks e-book read-in-2012 read-in-2010 survival future drama reviewed suspense dystopia dystopian post-apocalyptic read-2012 dystopias dystopian-fiction distopia distopian read-2011 teen-fict

In [10]:
combined_string_features_list = []

for row_index in range(0,10000):

    string_features = ""

    for entry in books_string_feature_df.iloc[row_index].tolist()[1:]:
        if type(entry) == str:
            string_features += entry
        elif type(entry) == list:
            string_features += ' '.join(entry)
    
    combined_string_features_list.append(string_features)

In [11]:
combined_string_features_df = pd.DataFrame()
combined_string_features_df['book_id'] = books_string_feature_df['book_id']
combined_string_features_df['combined_string_features'] = combined_string_features_list
combined_string_features_df.head()

Unnamed: 0,book_id,combined_string_features
0,1,Suzanne CollinsThe Hunger GamesThe Hunger Game...
1,2,"J.K. Rowling, Mary GrandPréHarry Potter and th..."
2,3,"Stephenie MeyerTwilightTwilight (Twilight, #1)..."
3,4,Harper LeeTo Kill a MockingbirdTo Kill a Mocki...
4,5,F. Scott FitzgeraldThe Great GatsbyThe Great G...


In [12]:
combined_string_features_df.to_csv('combined_string_features_df.csv')