### Preprocessing.py

* ratings_features -> 추가 feature인 ratings_mean_users, books_rating_weighted 리턴
* users_preprocessing -> users 전처리
* books_preprocessing -> books 전처리

ratings_features를 통해 ratings_mean_users와 books_rating_weighted를 얻은 후, users 및 books 전처리를 진행해야 합니다.

In [40]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm_notebook

In [3]:
# ratings = pd.read_csv("BX-Book-Ratings.csv", encoding="latin-1")
# books = pd.read_csv("BX-Books.csv", encoding="latin-1")
# users = pd.read_csv("BX-Users.csv", encoding="latin-1")

In [49]:
def ratings_features(ratings):
    """Build per-user and per-book rating features from the ratings table.

    Args:
        ratings: DataFrame with "User-ID", "ISBN" and "Book-Rating" columns.

    Returns:
        A tuple ``(ratings_mean_users, books_rating_weighted)``:

        * ratings_mean_users -- columns "User-ID" and "Rating-Mean-Users"
          (mean rating given by each user).
        * books_rating_weighted -- columns "ISBN", "Rating-Mean-Books",
          "Rating-Cnt-Books" and "Rating-Weighted", where the weighted
          rating is ``mean * log(count + 1)``.
    """
    scores = ratings.copy()

    # Mean rating per user; later joined onto the users frame.
    per_user = scores.groupby("User-ID")["Book-Rating"].mean().reset_index()
    per_user.columns = ["User-ID", "Rating-Mean-Users"]

    # Mean and count per book, computed from one shared groupby.
    by_book = scores.groupby("ISBN")["Book-Rating"]
    per_book = pd.DataFrame(
        {
            "Rating-Mean-Books": by_book.mean(),
            "Rating-Cnt-Books": by_book.count(),
        }
    ).reset_index()

    # Weighted rating = mean * log(count + 1); later joined onto the books frame.
    damping = np.log(per_book["Rating-Cnt-Books"] + 1)
    per_book["Rating-Weighted"] = per_book["Rating-Mean-Books"] * damping

    return per_user, per_book

In [47]:
def users_preprocessing(users, ratings_mean_users):
    """Filter users by age, derive country features, and attach mean ratings.

    Steps:
        1. Keep users whose "Age" is between 12 and 57 inclusive (rows with
           a missing age fail both comparisons and are dropped).
        2. "Country" = last whitespace-separated token of "Location",
           with a few known variants normalised.
        3. "Country_group" = the country if it is one of the major
           countries, otherwise 'other'.
        4. Left-merge ``ratings_mean_users`` on the columns the two frames
           share.

    Args:
        users: DataFrame with at least "Age" and "Location" columns, plus
            whatever key column it shares with ``ratings_mean_users``.
        ratings_mean_users: per-user rating feature frame to merge in.

    Returns:
        A new DataFrame; ``users`` itself is not modified.
    """
    # Variant spellings / trailing tokens mapped to a canonical country token.
    # The canonical UK token is 'kingdom' (last word of "united kingdom"),
    # because that is the value the major-country list below matches on.
    # Mapping 'york'/'england' to 'uk' left them outside that list, so those
    # users were silently grouped as 'other'.
    country_aliases = {
        'u.s.a.': 'usa',
        'state': 'usa',
        'states': 'usa',
        'texas': 'usa',
        'missouri': 'usa',
        'carolina': 'usa',
        'scotia': 'canada',
        'york': 'kingdom',
        'england': 'kingdom',
        'catalonia': 'spain',
        'catalunya': 'spain',
        'orense': 'spain',
    }
    major_countries = [
        'usa',
        'germany',
        'canada',
        'spain',
        'kingdom',
        'france',
        'australia',
    ]

    # Keep ages 12..57 inclusive.
    age = users['Age']
    users_ = users[(age >= 12) & (age <= 57)].copy()

    # Last token of Location. The vectorised .str accessor yields a missing
    # value for a missing or blank Location instead of raising.
    users_['Country'] = users_['Location'].str.split().str[-1]

    # Normalise country names.
    users_['Country'] = users_['Country'].replace(country_aliases)

    # Everything outside the major countries (including missing) -> 'other'.
    is_major = users_['Country'].isin(major_countries)
    users_['Country_group'] = users_['Country'].where(is_major, 'other')

    return users_.merge(ratings_mean_users, how='left')

In [41]:
# Ordered (old, new) substring replacements used to normalise publisher names.
# Order matters: later entries operate on the output of earlier ones
# (e.g. " Publishing" is removed before " Pub", "." before the final
# double-space collapse).
# NOTE(review): these are plain substring matches, not word-bounded, so short
# entries such as " Co" or " Pr" also hit longer words (" Collins",
# " Prentice"). Kept as-is to preserve the existing feature values.
_PUBLISHER_REPLACEMENTS = (
    (" Publishing", ""),
    (" Publishers", ""),
    (" Publications", ""),
    (" Publication", ""),
    (" Company", ""),
    (" Communications", ""),
    (" Corporation", ""),
    (" Editora", ""),
    (" Verlag", ""),
    (" verlag", ""),
    ("-verlag", ""),
    (" Books", ""),
    (" GmbH", ""),
    (" Incorporated", ""),
    (" Inc", ""),
    (", LLC", ""),
    ("&amp", ""),
    ("-Imports", ""),
    (" (FL)", ""),
    (" (CA)", ""),
    (" (OH)", ""),
    (" (WA)", ""),
    (" (MA)", ""),
    (" (CT)", ""),
    (" (TX)", ""),
    (" (Mm)", ""),
    ("'s", "s"),
    (" and", ""),
    (" Press", ""),
    (" Pr", ""),
    (" books", ""),
    (" Ltd", ""),
    (" Group", ""),
    (" Limited", ""),
    (" Editions", ""),
    (" UK", ""),
    (" (UK)", ""),
    (" (J)", ""),
    (" USA", ""),
    (" Book", ""),
    (" Sales", ""),
    (" Libraries", ""),
    (" Library", ""),
    (" Paperbacks", ""),
    (" Trade", ""),
    (" Corp", ""),
    (" Co", ""),
    (" Pub", ""),
    ("Pub.", ""),
    ("Pub", ""),
    (".", ""),
    (",", ""),
    ("  ", " "),
)


def _clean_publisher(publisher):
    """Apply the ordered publisher replacements; pass non-strings through."""
    if not isinstance(publisher, str):
        # Missing publishers (NaN/None) have no .replace(); leave them as-is.
        return publisher
    for old, new in _PUBLISHER_REPLACEMENTS:
        publisher = publisher.replace(old, new)
    return publisher


def books_preprocessing(books, books_rating_weighted):
    """Filter, de-duplicate and normalise books, then attach rating features.

    Steps:
        1. Keep rows whose "Year-Of-Publication" is between 1911 and 2006
           inclusive. Values that cannot be read as numbers are treated as
           missing for the comparison and therefore dropped.
        2. Drop the three unused image-URL columns.
        3. Drop fully duplicated rows.
        4. Sort by the raw "Publisher" value, then normalise publisher names
           with ``_clean_publisher``.
        5. Left-merge ``books_rating_weighted`` on the columns the two frames
           share.

    Args:
        books: DataFrame with "Year-Of-Publication", "Publisher",
            "Image-URL-S", "Image-URL-M" and "Image-URL-L" columns, plus
            whatever key column it shares with ``books_rating_weighted``.
        books_rating_weighted: per-book rating feature frame to merge in.

    Returns:
        A new DataFrame ordered by the original publisher name; ``books``
        itself is not modified.

    Raises:
        KeyError: if any of the image-URL columns is absent.
    """
    # Coerce only for the comparison so a column mixing ints and stray strings
    # does not raise TypeError; the stored column values are left untouched.
    year = pd.to_numeric(books["Year-Of-Publication"], errors="coerce")
    books_ = books[(year >= 1911) & (year <= 2006)].copy()

    # Drop unused image URL columns.
    books_ = books_.drop(columns=['Image-URL-S', 'Image-URL-M', 'Image-URL-L'])

    # Drop duplicated rows.
    books_ = books_.drop_duplicates()

    # Sort on the raw publisher name (before cleaning) to keep the same output
    # row order as the previous implementation.
    books_ = books_.sort_values(by="Publisher")

    # Clean every publisher in one pass, addressed by column name rather than
    # by position, instead of one .iloc cell write per row.
    books_["Publisher"] = books_["Publisher"].map(_clean_publisher)

    return books_.merge(books_rating_weighted, how='left')