In [12]:
import pandas as pd
import re

## Use genres to create "collections"
aka categories for each book to locate them in library

In [4]:
info = pd.read_csv("../data/book_info.csv")
info

Unnamed: 0,isbn,title,author,genre,cover,desc
0,9780385498418,Woman: An Intimate Geography,Natalie Angier,,https://covers.openlibrary.org/b/isbn/97803854...,
1,9780679434597,A history of the breast,Marilyn Yalom,"Breast -- Social aspects., Breast -- History.,...",https://covers.openlibrary.org/b/isbn/97806794...,
2,9780692238066,A boy like me,Jennie Wood,"Juvenile fiction, Transgender youth, Sexual or...",https://covers.openlibrary.org/b/isbn/97806922...,"Born a girl, Peyton Honeycutt meets Tara Parks..."
3,9780307396402,Lucy's legacy: the quest for human origins,Donald C. Johanson,,https://covers.openlibrary.org/b/isbn/97803073...,"{'type': '/type/text', 'value': '""In his New Y..."
4,9781594480201,Beijing Doll,Chun Sue,"Sue, Chun, Fiction - General, Fiction, General...",https://covers.openlibrary.org/b/isbn/97815944...,
...,...,...,...,...,...,...
258,9780806129792,Feminism and Disability,Hillyer Barbara,"Coping with disability, Disability: social asp...",https://covers.openlibrary.org/b/isbn/97808061...,
259,9780374286217,Walking out on the boys,Frances K. Conley,"Conley, Frances K, Neurosurgeons -- California...",https://covers.openlibrary.org/b/isbn/97803742...,"{'type': '/type/text', 'value': '""In May 1991,..."
260,9780811840989,Woman: a celebration,,"Photography of women, Portrait photography, Wo...",https://covers.openlibrary.org/b/isbn/97808118...,
261,446603775,Dawn,,,https://covers.openlibrary.org/b/isbn/44660377...,


In [9]:
# we will try to organize books in the library by genres 
# maybe around ~10 collections is good?
genre_count = pd.read_csv("../data/genre_count.csv")
print(genre_count.sort_values(['count'], ascending=[False]).to_string())

                                                   genre  count
15                                             Sociology     29
13                                        Social Science     26
22                             Women's Studies - General     14
19                                         United States     14
4                                                General     13
23                                                 Women     13
3                                                Fiction     10
7                                                History     10
62                                            Psychology      9
73                      Social Science / Women's Studies      8
12                            Feminism & Feminist Theory      7
44                                             Biography      7
26                                     Social conditions      7
30                                        Gender Studies      7
25                                      

In [28]:
# key to determine collection based on genre terms

# Collections: Gender Studies, LGBTQ+, Textbooks, Fiction, History, Art, Social Science, Self-help, Other

# Sorted by descending popularity (based on above)
gender = {'women', 'womens', 'feminism', 'gender', 'men', 'mens', 'masculinity', 'sex'}
lgbtq = {'gay', 'homosexuality', 'lesbian', 'orientation', 'sexuality'}
social = {'sociology', 'politics', 'social', 'psychology'}
art = {'art', 'poetry', 'comic'}
fiction = {'fiction'}
history = {'history', 'movement', 'movements'}
self_help = {'selfhelp', 'health'}
textbook = {'textbooks', 'textbook'}


collection = []
for i, row in info.iterrows():
    genre_clean = str(row['genre']).lower()
    genre_clean = re.sub('[^a-zA-Z0-9 ]','', genre_clean)
    genres = (set(genre_clean.split())) # only key words in genre(s)
    
    # each book can only be part of one collection
    # going from least popular to most popular genres in general
    if genres.intersection({"nan"}):
        collection.append("other")

    elif genres.intersection(textbook):
        collection.append("textbooks")

    elif genres.intersection(self_help):
        collection.append("self-help")
    
    elif genres.intersection(history):
        collection.append("history")

    elif genres.intersection(fiction):
        collection.append("fiction")

    elif genres.intersection(art):
        collection.append("art")

    elif genres.intersection(social):
        collection.append("social science")

    elif genres.intersection(lgbtq):
        collection.append("lgbtq+")

    elif genres.intersection(gender):
        collection.append("gender studies")
    
    else:
        collection.append("other")

collection

['other',
 'history',
 'fiction',
 'other',
 'fiction',
 'history',
 'other',
 'history',
 'social science',
 'history',
 'social science',
 'other',
 'art',
 'social science',
 'history',
 'social science',
 'gender studies',
 'history',
 'other',
 'other',
 'history',
 'textbooks',
 'social science',
 'gender studies',
 'gender studies',
 'other',
 'fiction',
 'fiction',
 'lgbtq+',
 'fiction',
 'fiction',
 'lgbtq+',
 'lgbtq+',
 'social science',
 'fiction',
 'other',
 'fiction',
 'other',
 'art',
 'fiction',
 'other',
 'other',
 'other',
 'history',
 'history',
 'gender studies',
 'other',
 'social science',
 'history',
 'social science',
 'other',
 'history',
 'other',
 'social science',
 'other',
 'other',
 'social science',
 'self-help',
 'other',
 'other',
 'other',
 'other',
 'other',
 'social science',
 'social science',
 'social science',
 'other',
 'gender studies',
 'other',
 'social science',
 'other',
 'other',
 'gender studies',
 'social science',
 'history',
 'gender stu

In [30]:
# Make sure this is equal to number of books we have in library
len(collection)

263

In [29]:
from collections import Counter
Counter(collection)
# i know we have a bunch of textbooks so textbook count is def wrong
# too many 'other' books so need to find a way around that (maybe evaluate based on title / get genre info from a different API)

Counter({'other': 105,
         'social science': 49,
         'history': 34,
         'gender studies': 31,
         'fiction': 22,
         'self-help': 10,
         'art': 6,
         'lgbtq+': 4,
         'textbooks': 2})

In [31]:
# appending collection info as col in df
info['collection'] = collection
info

Unnamed: 0,isbn,title,author,genre,cover,desc,collection
0,9780385498418,Woman: An Intimate Geography,Natalie Angier,,https://covers.openlibrary.org/b/isbn/97803854...,,other
1,9780679434597,A history of the breast,Marilyn Yalom,"Breast -- Social aspects., Breast -- History.,...",https://covers.openlibrary.org/b/isbn/97806794...,,history
2,9780692238066,A boy like me,Jennie Wood,"Juvenile fiction, Transgender youth, Sexual or...",https://covers.openlibrary.org/b/isbn/97806922...,"Born a girl, Peyton Honeycutt meets Tara Parks...",fiction
3,9780307396402,Lucy's legacy: the quest for human origins,Donald C. Johanson,,https://covers.openlibrary.org/b/isbn/97803073...,"{'type': '/type/text', 'value': '""In his New Y...",other
4,9781594480201,Beijing Doll,Chun Sue,"Sue, Chun, Fiction - General, Fiction, General...",https://covers.openlibrary.org/b/isbn/97815944...,,fiction
...,...,...,...,...,...,...,...
258,9780806129792,Feminism and Disability,Hillyer Barbara,"Coping with disability, Disability: social asp...",https://covers.openlibrary.org/b/isbn/97808061...,,self-help
259,9780374286217,Walking out on the boys,Frances K. Conley,"Conley, Frances K, Neurosurgeons -- California...",https://covers.openlibrary.org/b/isbn/97803742...,"{'type': '/type/text', 'value': '""In May 1991,...",gender studies
260,9780811840989,Woman: a celebration,,"Photography of women, Portrait photography, Wo...",https://covers.openlibrary.org/b/isbn/97808118...,,gender studies
261,446603775,Dawn,,,https://covers.openlibrary.org/b/isbn/44660377...,,other


## Clean up some of the description
Some description were still in json format 

example:
```
"{'type': '/type/text', 'value': 'Oklahoma teen Neal Barton stands up for his favorite fantasy series, The Chronicles of Apathea Ravenchilde, when conservative Christians try to bully the town of Americus into banning it from the public library.'}"
```

In [43]:
for i, row in info.iterrows():
   if str(row['desc']).startswith("{'type'"):
      new_desc = str(eval(row['desc'])['value'])
      info.at[i, 'desc'] = new_desc # directly update df

In [44]:
info

Unnamed: 0,isbn,title,author,genre,cover,desc,collection
0,9780385498418,Woman: An Intimate Geography,Natalie Angier,,https://covers.openlibrary.org/b/isbn/97803854...,,other
1,9780679434597,A history of the breast,Marilyn Yalom,"Breast -- Social aspects., Breast -- History.,...",https://covers.openlibrary.org/b/isbn/97806794...,,history
2,9780692238066,A boy like me,Jennie Wood,"Juvenile fiction, Transgender youth, Sexual or...",https://covers.openlibrary.org/b/isbn/97806922...,"Born a girl, Peyton Honeycutt meets Tara Parks...",fiction
3,9780307396402,Lucy's legacy: the quest for human origins,Donald C. Johanson,,https://covers.openlibrary.org/b/isbn/97803073...,"""In his New York times bestseller, Lucy: the b...",other
4,9781594480201,Beijing Doll,Chun Sue,"Sue, Chun, Fiction - General, Fiction, General...",https://covers.openlibrary.org/b/isbn/97815944...,,fiction
...,...,...,...,...,...,...,...
258,9780806129792,Feminism and Disability,Hillyer Barbara,"Coping with disability, Disability: social asp...",https://covers.openlibrary.org/b/isbn/97808061...,,self-help
259,9780374286217,Walking out on the boys,Frances K. Conley,"Conley, Frances K, Neurosurgeons -- California...",https://covers.openlibrary.org/b/isbn/97803742...,"""In May 1991, Frances Conley, the first female...",gender studies
260,9780811840989,Woman: a celebration,,"Photography of women, Portrait photography, Wo...",https://covers.openlibrary.org/b/isbn/97808118...,,gender studies
261,446603775,Dawn,,,https://covers.openlibrary.org/b/isbn/44660377...,,other


## Correct capitalization in titles
This feels unnecessarily complicated, somehow

In [77]:
# words to not capitalize
# remember exceptions for first word and first word after ':' colon
dont = ['the', 'of', 'a', 'an', 'from', 'with', 'for', 'and', 'in', 'at', 'on', 'to', 'for']

In [85]:
for i, row in info.iterrows():
    follows_colon = False
    
    # list of every word in title
    title = row['title'].split()
    correct_title = []
    for j in range(len(title)):
        # sometimes titles are stylistically in all upper-case
        # assume first letter (in title or after colon) is always capitalized
        if (j == 0) or (follows_colon):
            correct_title.append(title[j].title())
        elif title[j].lower() not in dont:
            correct_title.append(title[j].title())
        else:
            correct_title.append(title[j].lower())

        # toggle follows colon if colon is detected
        if title[j][-1] == ':':
            # print(title[i])
            follows_colon = True
        else:
            follows_colon = False
    
    # join list back to string
    correct_title = (' '.join(correct_title))

    # fix title within dataframe
    info.at[i, 'title'] = correct_title # directly update df
    print(correct_title)



Woman: An Intimate Geography
A History of the Breast
A Boy Like Me
Lucy'S Legacy: The Quest for Human Origins
Beijing Doll
They'Re Bankrupting Us!: And Twenty Other Myths About Unions
100 Words Almost Everyone Confuses & Misuses
Shadows of Tender Fury: The Letters and CommuniquéS of Subcomandante Marcos and the Zapatista Army of National Liberation
When Women Come First: Gender and Class in Transnational Migration
Conquest: Sexual Violence and American Indian Genocide
The Trouble with Diversity: How We Learned to Love Identity and Ignore Inequality
Labor and the Locavore the Making of a Comprehensive Food Ethic
Of Poetry & Protest: From Emmett Till to Trayvon Martin
It'S a Jungle Out There: The Feminist Survival Guide to Politically Inhospitable Environments
Cuestión De Educación: Un Viaje Por La Enseñanza Española
Women'S Growth in Connection: Writings from the Stone Center
Gender in Cross-Cultural Perspective
Palabras De Mujeres: Escritoras EspañOlas ContemporáNeas
In the Time of 

In [86]:
info

Unnamed: 0,isbn,title,author,genre,cover,desc,collection
0,9780385498418,Woman: An Intimate Geography,Natalie Angier,,https://covers.openlibrary.org/b/isbn/97803854...,,other
1,9780679434597,A History of the Breast,Marilyn Yalom,"Breast -- Social aspects., Breast -- History.,...",https://covers.openlibrary.org/b/isbn/97806794...,,history
2,9780692238066,A Boy Like Me,Jennie Wood,"Juvenile fiction, Transgender youth, Sexual or...",https://covers.openlibrary.org/b/isbn/97806922...,"Born a girl, Peyton Honeycutt meets Tara Parks...",fiction
3,9780307396402,Lucy'S Legacy: The Quest for Human Origins,Donald C. Johanson,,https://covers.openlibrary.org/b/isbn/97803073...,"""In his New York times bestseller, Lucy: the b...",other
4,9781594480201,Beijing Doll,Chun Sue,"Sue, Chun, Fiction - General, Fiction, General...",https://covers.openlibrary.org/b/isbn/97815944...,,fiction
...,...,...,...,...,...,...,...
258,9780806129792,Feminism and Disability,Hillyer Barbara,"Coping with disability, Disability: social asp...",https://covers.openlibrary.org/b/isbn/97808061...,,self-help
259,9780374286217,Walking Out on the Boys,Frances K. Conley,"Conley, Frances K, Neurosurgeons -- California...",https://covers.openlibrary.org/b/isbn/97803742...,"""In May 1991, Frances Conley, the first female...",gender studies
260,9780811840989,Woman: A Celebration,,"Photography of women, Portrait photography, Wo...",https://covers.openlibrary.org/b/isbn/97808118...,,gender studies
261,446603775,Dawn,,,https://covers.openlibrary.org/b/isbn/44660377...,,other


In [82]:
somestring = "(hello)"
somestring.title()

'(Hello)'

In [87]:
info.to_csv("../data/cleaned_book_info.csv")