In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

## Read the MovieLens and BX-Book CSV files

In [None]:
# Load the movies/directors table from a local CSV and preview its first rows.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
df_movies = pd.read_csv('/home/ignacio/Datasets/ml-latest-small/movies_directors.csv')
df_movies.head()

In [None]:
# Load the genre table; later cells read its 'genre' column.
df_genres = pd.read_csv('/home/ignacio/Datasets/ml-latest-small/genres.csv')
df_genres

In [None]:
# Load the book table; later cells read its 'shelves' column.
df_book = pd.read_csv('/home/ignacio/Datasets/BX-CSV-Dump/BX-book-shelves.csv')
df_book.head()

## Get book shelves

#### All shelves

In [None]:
# Strip square brackets and all whitespace from the raw 'shelves' strings so that
# each value becomes a plain comma-separated list of shelf names.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave the text unchanged. The pattern is
# a raw string so that '\[' and '\s' are not parsed as (invalid) string escapes.
pop_shelves = df_book['shelves']
pop_shelves = pop_shelves.str.replace(r'[\[\]\s]', '', regex=True)
pop_shelves.head()

In [None]:
# Turn every comma-separated string into a list of shelf names, then flatten
# all the per-book lists into a single Series.
pop_shelves = pop_shelves.str.split(',')
shelves_list = list(pop_shelves.values)
shelves = []
for book_shelves in shelves_list:
    shelves.extend(book_shelves)
s_shelves = pd.Series(shelves, name='Popular shelves')
s_shelves.head()

In [None]:
# Remove repeated shelf names.
s_shelves = s_shelves.drop_duplicates()

In [None]:
# Number of distinct shelf names.
len(s_shelves)

#### Shared shelves between movies and books

In [None]:
def match_shelve(shelve_to_match, target):
    """Tell whether ``shelve_to_match`` occurs in ``target`` as a whole token.

    A token is delimited on each side by the start/end of ``target``, a
    whitespace character or a hyphen. So ``'war'`` matches ``'war'``,
    ``'war-stories'``, ``'civil-war'`` and ``'world war two'``, but not
    ``'software'`` or ``'warhammer'``.

    Parameters
    ----------
    shelve_to_match : str
        Literal text to look for (e.g. a genre name). It is escaped before
        being placed in the pattern, so characters such as ``(`` or ``+``
        are matched literally.
    target : str
        Text to search in (e.g. a shelf name).

    Returns
    -------
    bool
        True when a delimited occurrence is found. Matching is case-sensitive.
    """
    # Escape the needle so regex metacharacters in it cannot alter the pattern.
    token = re.escape(shelve_to_match)
    # Require a boundary (string edge, whitespace or '-') on both sides of the token.
    pattern = re.compile(r'(?:^|[\s-])' + token + r'(?:[\s-]|$)')
    return bool(pattern.search(target))

In [None]:
def exist_category(row, df_genre):
    """Tell whether ``row`` matches at least one genre of ``df_genre``.

    Parameters
    ----------
    row : str
        Text to test (a shelf name when used with ``Series.apply``).
    df_genre : pandas.DataFrame
        Table whose ``'genre'`` column holds the genre names to look for.

    Returns
    -------
    bool
        True when ``match_shelve(genre, row)`` is true for any genre,
        False otherwise (including when ``df_genre`` has no rows).
    """
    matches = df_genre['genre'].apply(match_shelve, target=row)
    # any() also covers the cases a "two distinct values" check misses:
    # every genre matching, or a single-genre table that matches.
    return bool(matches.any())

In [None]:
# Flag each distinct shelf name by running exist_category against the genre table.
s_valid_shelves = s_shelves.apply(exist_category, df_genre=df_genres)
s_valid_shelves.head()

In [None]:
# Tally of the True/False flags produced by exist_category.
s_valid_shelves.value_counts()

In [None]:
# Keep only the entries whose flag is False; their index labels identify the shelves to discard.
is_rejected = s_valid_shelves.eq(False)
s_invalid_shelves = s_valid_shelves.loc[is_rejected]

In [None]:
# Number of shelves flagged False.
len(s_invalid_shelves)

In [None]:
# Remove the shelves flagged False, addressing them by index label.
s_shelves = s_shelves.drop(s_invalid_shelves.index)
s_shelves.head()

In [None]:
# Number of shelves remaining after the drop.
len(s_shelves)

In [None]:
# Coerce every shelf name to str, then discard any name containing 'software'.
s_shelves = s_shelves.map(str)
mentions_software = s_shelves.str.contains('software')
s_shelves = s_shelves[~mentions_software]

In [None]:
# Persist the filtered shelf names to CSV without the index column.
s_shelves.to_csv('/home/ignacio/Datasets/Graph analysis/popular_shelves.csv', index=False)

#### Filter popular shelves from books

In [None]:
def find_common_shelves(row, s_shelves):
    """Return the shelves listed in ``row`` that also appear in ``s_shelves``.

    Parameters
    ----------
    row : str
        Comma-separated shelf names of one book.
    s_shelves : iterable of str
        Reference shelf names (Series, list, set, ...). A ``set`` or
        ``frozenset`` is used as is, so a caller applying this function to
        many rows can build the set once and avoid re-hashing it per row.

    Returns
    -------
    list of str
        The shared shelf names, without duplicates, in sorted order so the
        result is the same on every run.
    """
    if isinstance(s_shelves, (set, frozenset)):
        reference = s_shelves
    else:
        reference = set(s_shelves)
    # Plain str.split is enough for a literal comma separator.
    return sorted(reference.intersection(row.split(',')))
    

In [None]:
# Normalise the 'shelves' column: coerce every value to str, then strip square
# brackets and whitespace so each value is a plain comma-separated list.
# reset_index() materialises the old index as an 'index' column, which a later
# cell drops again.
df_book = df_book.reset_index()
df_book['shelves'] = df_book['shelves'].map(str)
# regex=True is explicit because pandas >= 2.0 defaults to literal replacement,
# which would leave the text unchanged; the raw string avoids invalid escapes.
df_book['shelves'] = df_book['shelves'].str.replace(r'[\[\]\s]', '', regex=True)
df_book.head()

In [None]:
# For every book, store the list of its shelves that also appear in s_shelves.
df_book['common-shelves'] = df_book['shelves'].apply(find_common_shelves, s_shelves= s_shelves)
df_book.head()

In [None]:
# Serialise each list of common shelves as a '|'-separated string
# (an empty list becomes an empty string).
# Joining the list directly avoids round-tripping through str(list) and regex
# clean-up, which depends on the pandas regex default and corrupts shelf names
# that contain an apostrophe. Values that are not lists (e.g. when the cell is
# re-run on already-joined strings) are left untouched.
df_book['common-shelves'] = df_book['common-shelves'].apply(
    lambda shelves: '|'.join(shelves) if isinstance(shelves, list) else shelves
)
df_book.head()

In [None]:
# Discard the helper 'index' column.
df_book = df_book.drop(columns=['index'])
df_book.head()

In [None]:
# Persist the book table (including 'common-shelves') to CSV without the index column.
df_book.to_csv('/home/ignacio/Datasets/Graph analysis/BX-book-shelves.csv', index=False)

## Movie genre analysis

In [None]:
# Lower-case the genre strings; later cells match them against lower-case literals such as 'film-noir'.
df_movies['genres'] = df_movies['genres'].str.lower()
df_movies.head()

In [None]:
# Keep only the columns of interest, expose 'title_mod' under the name 'title',
# and convert the index labels to strings.
kept_columns = ['movieId', 'title_mod', 'year', 'genres', 'director']
df_movies = df_movies.loc[:, kept_columns].rename(index=str, columns={"title_mod": "title"})
df_movies.head()

In [None]:
# Strip square brackets and single quotes from the 'director' strings.
# regex=True is explicit because pandas >= 2.0 defaults to literal replacement,
# which would leave the character-class pattern unmatched and the text unchanged.
df_movies['director'] = df_movies['director'].str.replace(r'[\[\]\']', '', regex=True)
df_movies.head()

In [None]:
# Persist the cleaned movie table to CSV without the index column.
df_movies.to_csv('/home/ignacio/Datasets/Graph analysis/ml-movies.csv', index=False)

In [None]:
# Distinct values of the 'genres' column.
genres = df_movies['genres'].drop_duplicates()
genres.head()

In [None]:
# Number of distinct genre strings.
len(genres)

In [None]:
# Movies whose whole 'genres' string is 'film-noir' (the pattern is anchored at both ends).
is_only_noir = df_movies['genres'].str.contains('^film-noir$')
df_noir = df_movies[is_only_noir]
df_noir

In [None]:
# Number of movies in df_noir.
len(df_noir)

In [None]:
# Movies whose whole 'genres' string is 'comedy', selected the same way as df_noir.
# The frame is built in this cell so that it exists when the notebook is run
# top to bottom on a fresh kernel.
df_comedy = df_movies[df_movies['genres'].str.contains('^comedy$')]
len(df_comedy)

In [None]:
# For every genre, count the movies whose 'genres' string contains that genre name.
# regex=False matches the genre as literal text, so names containing regex
# metacharacters (parentheses, '+', ...) are not misread as a pattern.
# na=False counts rows with a missing 'genres' value as non-matching instead of
# producing a mask with missing values.
df_genres['count'] = df_genres['genre'].apply(
    lambda genre: df_movies['genres'].str.contains(genre, regex=False, na=False).sum()
)
df_genres.head()

In [None]:
# Show the row for the 'film-noir' genre.
df_genres[df_genres['genre']=='film-noir']

In [None]:
# Bar chart of the per-genre movie counts, with the x tick labels rotated 45 degrees.
fig, g = plt.subplots(figsize=(12, 10))
sns.barplot(x='genre', y='count', data=df_genres, ax=g)
g.set_xticklabels(labels=df_genres['genre'], rotation=45)