In [72]:
import pandas as pd 
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the ISBN -> genres table and discard the 'Unnamed: 0' column
# (presumably a row index that was written out with the CSV - confirm).
genres = pd.read_csv('books_genres.csv').drop(columns=['Unnamed: 0'])
genres.head()

Unnamed: 0,ISBN,genres
0,195153448,"Mythology, History, Classics, Nonfiction, Refe..."
1,2005018,"Fiction, Canada, Historical Fiction, Literary ..."
2,60973129,"History, World War II, Nonfiction, Military Hi..."
3,374157065,"History, Nonfiction, Science, Medical, Medicin..."
4,393045218,"History, Archaeology, Nonfiction, China, Anthr..."


In [3]:
# Rows whose genres field is the literal string '[]', i.e. no genre was recorded.
genres.loc[genres['genres'].eq('[]')]

Unnamed: 0,ISBN,genres
13,1552041778,[]
17,1881320189,[]
22,1879384493,[]
33,3442353866,[]
35,3442446937,[]
...,...,...
951,1928791107,[]
954,0877192820,[]
956,0793133955,[]
960,0380754851,[]


In [4]:
#Removing rows with no genres (genres stored as the literal string '[]')
droping_index = genres.index[genres['genres'] == '[]']

# Drop those labels and renumber the remaining rows from 0 in one step.
genres = genres.drop(index=droping_index).reset_index(drop=True)

In [5]:
# Sanity check: (rows, columns) left after removing the rows with no genres.
genres.shape

(900, 2)

In [6]:
#creating a column that stores each book's genres as a list instead of a string
# A vectorized split replaces the previous row-by-row chained assignment
# (genres['genreslist'][i] = ...), which pandas flags with SettingWithCopyWarning
# and which does not update the frame at all when copy-on-write is enabled.
genres['genreslist'] = genres['genres'].str.split(', ')

In [7]:
#tallying how many times each genre occurs across the whole dataframe
genre_count = {}

for lists in genres['genreslist']:
    for word in lists:
        # First sighting starts from 0, later sightings increment the tally.
        genre_count[word] = genre_count.get(word, 0) + 1

In [8]:
# Turn the {genre: count} tally into a two-column frame, most frequent genre first.
# NOTE: this rebinds genre_count from a dict to a DataFrame, so the cell is meant
# to be run once right after the tally cell.
genre_count = pd.DataFrame(
    sorted(genre_count.items(), key=lambda pair: pair[1], reverse=True),
    columns=['Genre', 'Count_of_genre'],
)

In [9]:
#keeping only the genres that occur more than 10 times
genre_count = genre_count.loc[genre_count['Count_of_genre'].gt(10)]
genre_count

Unnamed: 0,Genre,Count_of_genre
0,Fiction,684
1,Mystery,228
2,Fantasy,180
3,Classics,173
4,Romance,166
...,...,...
77,High Fantasy,12
78,20th Century,11
79,Travel,11
80,Action,11


In [10]:
# Plain Python list of the frequent genre names, used for membership checks below.
genre_count_list = genre_count['Genre'].tolist()

In [11]:
#collecting the rows (books) that have no genre at all in genre_count_list
genres2 = genres.copy()

# Set membership keeps each check cheap.
frequent_genres = set(genre_count_list)

# Take the row label straight from the Series instead of looking it up again by
# matching the joined genre string: that lookup always returned the FIRST row
# with a given string, so books sharing an identical genre string were recorded
# twice under one label while the later duplicates were never dropped.
index_tobe_dropped = [
    row_label
    for row_label, row_genres in genres2['genreslist'].items()
    if frequent_genres.isdisjoint(row_genres)
]

In [12]:
# Row labels of the books that contain none of the frequent genres.
index_tobe_dropped

[329, 482, 622, 625, 743]

In [13]:
# (rows, columns) before the rows collected in index_tobe_dropped are removed.
genres2.shape

(900, 3)

In [14]:
# Remove the collected rows and renumber the remaining ones from 0.
genres2 = genres2.drop(index=index_tobe_dropped).reset_index(drop=True)
genres2.shape

(895, 3)

In [37]:
#Exploding genreslist so that every row holds exactly one genre for one book
#(the row label of the original book is repeated for each of its genres)
genres2 = genres2.explode('genreslist')
genres2.head()

Unnamed: 0,ISBN,genres,genreslist
0,195153448,"Mythology, History, Classics, Nonfiction, Refe...",Mythology
0,195153448,"Mythology, History, Classics, Nonfiction, Refe...",History
0,195153448,"Mythology, History, Classics, Nonfiction, Refe...",Classics
0,195153448,"Mythology, History, Classics, Nonfiction, Refe...",Nonfiction
0,195153448,"Mythology, History, Classics, Nonfiction, Refe...",Reference


In [40]:
# The comma-separated genres string is redundant now that genreslist holds one
# genre per row.
genres2 = genres2.drop(columns=['genres'])
genres2

Unnamed: 0,ISBN,genreslist
0,0195153448,Mythology
0,0195153448,History
0,0195153448,Classics
0,0195153448,Nonfiction
0,0195153448,Reference
...,...,...
892,0765341972,Cryptozoology
893,0590514776,Buffy The Vampire Slayer
893,0590514776,Nonfiction
894,0373037430,Harlequin


In [42]:
# Promote the repeated row label to a 'Book_no' column and call the exploded
# single-genre column 'genres'.
genres2 = (
    genres2
    .reset_index()
    .rename(columns={'index': 'Book_no','genreslist':'genres'})
)
genres2.head()

Unnamed: 0,Book_no,ISBN,genres
0,0,195153448,Mythology
1,0,195153448,History
2,0,195153448,Classics
3,0,195153448,Nonfiction
4,0,195153448,Reference


In [54]:
#keeping only the rows whose genre is present in genre_count_list
# A boolean mask selects rows by value, so the result no longer depends on the
# index being an unbroken 0..n-1 range (the previous loop used positional
# counters as row labels for drop()).
genres2 = genres2[genres2['genres'].isin(genre_count_list)].reset_index(drop=True)
genres2

## Saving the Data 

In [110]:
# NOTE(review): the genres2 frame displayed in the "Joining Data" section has no
# Book_no column, so the drop below appears to have been executed at some point -
# confirm whether the saved file is meant to contain Book_no.
#genres2 = genres2.drop(columns=['Book_no'])
# NOTE(review): to_csv writes the row index by default, which comes back as an
# 'Unnamed: 0' column when the file is read again; consider index=False if no
# downstream reader relies on that column.
genres2.to_csv('Updated_Genres.csv')

In [116]:
# Number of distinct books (ISBNs) left after the genre filtering.
len(genres2['ISBN'].unique())

895

## Joining Data

In [100]:
# Load the book metadata table that the genres will be joined onto.
# NOTE(review): the preview shows 'Unnamed: 0.1' and 'Unnamed: 0' columns, which
# look like row indexes saved on earlier writes - confirm they can be dropped.
books = pd.read_csv('Books_CorrectAuthorName.csv')
books.head()

Unnamed: 0.1,Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [103]:
# (rows, columns) of the book metadata table.
books.shape

(269737, 9)

In [101]:
# Preview of the one-genre-per-row table that is used as the right side of the merge.
genres2

Unnamed: 0,ISBN,genres
0,0195153448,History
1,0195153448,Classics
2,0195153448,Nonfiction
3,0195153448,Reference
4,0195153448,Religion
...,...,...
4849,0765341972,Horror
4850,0765341972,Mystery
4851,0765341972,History
4852,0590514776,Nonfiction


In [104]:
# Inner join on ISBN: only books with at least one retained genre are kept, and
# each such book appears once per genre.
merged_data = books.merge(genres2, how='inner', on='ISBN')