In [1]:
# 책 추천 예제
# https://towardsdatascience.com/how-did-we-build-book-recommender-systems-in-an-hour-part-2-k-nearest-neighbors-and-matrix-c04b3c2ef55c
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD

book = pd.read_csv('BX-Books.csv', sep=';', error_bad_lines=False, encoding='latin-1')
book.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
user = pd.read_csv('BX-Users.csv', sep=';', error_bad_lines=False, encoding='latin-1')
user.columns = ['userID', 'Location', 'Age']
rating = pd.read_csv('BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding='latin-1')
rating.columns = ['userID', 'ISBN', 'bookRating']

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [2]:
book.head()

Unnamed: 0,ISBN,bookTitle,bookAuthor,yearOfPublication,publisher,imageUrlS,imageUrlM,imageUrlL
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [3]:
user.head()

Unnamed: 0,userID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [4]:
rating.head()

Unnamed: 0,userID,ISBN,bookRating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [5]:
# rating과 book을 ISBN을 기준으로 통합
combine_book_rating = pd.merge(rating, book, on='ISBN')
columns = ['yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']
combine_book_rating = combine_book_rating.drop(columns, axis=1)
combine_book_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose


In [6]:
combine_book_rating[combine_book_rating["bookTitle"] == "Flesh Tones: A Novel"]

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose
5,23768,034545104X,0,Flesh Tones: A Novel,M. J. Rose
6,28266,034545104X,0,Flesh Tones: A Novel,M. J. Rose
7,28523,034545104X,0,Flesh Tones: A Novel,M. J. Rose
8,39002,034545104X,0,Flesh Tones: A Novel,M. J. Rose
9,50403,034545104X,9,Flesh Tones: A Novel,M. J. Rose


In [7]:
# bookTitle이 NA인 row 제거
combine_book_rating = combine_book_rating.dropna(axis=0, subset = ['bookTitle'])

In [8]:
# bookTitle을 기준으로 그루핑을 하고 점수를 부여한 사용자의 명수를 계산한 컬럼을 추가
book_ratingCount = (combine_book_rating.
                    groupby(by = ['bookTitle'])['bookRating'].
                    count().
                    reset_index().
                    rename(columns = {'bookRating': 'totalRatingCount'})
                    [['bookTitle', 'totalRatingCount']]
                   )

In [9]:
book_ratingCount[book_ratingCount["bookTitle"] == "Flesh Tones: A Novel"]

Unnamed: 0,bookTitle,totalRatingCount
67829,Flesh Tones: A Novel,60


In [10]:
book_ratingCount.head()

Unnamed: 0,bookTitle,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [11]:
# combine_book_rating 과 book_ratingCount를 merge
rating_with_totalRatingCount = combine_book_rating.merge(book_ratingCount, left_on = 'bookTitle', right_on = 'bookTitle', how = 'left')
rating_with_totalRatingCount.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,60


In [12]:
# 책 점수 통계, 책들이 얼마나 많이 평가를 받았나
pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(book_ratingCount['totalRatingCount'].describe())

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [13]:
# 위의 통계로 봐서는 median에 해당하는 책조차 1번 밖에 평가를 받지 못했다.
# 그럼 상위 책들의 통계를 확인해보자.
print(book_ratingCount['totalRatingCount'].quantile(np.arange(.9, 1, .01)))

0.900    7.000
0.910    8.000
0.920    9.000
0.930   10.000
0.940   11.000
0.950   13.000
0.960   16.000
0.970   20.000
0.980   29.000
0.990   50.000
Name: totalRatingCount, dtype: float64


In [14]:
# 1%에 해당하는 책들만이 50개 이상의 평가를 받았다.
# 현재 너무 많은 책들이 데이터에 존재하므로 상위 1%의 책들만을 대상으로 하자 (총 2444 개의 책)
popularity_threshold = 50
rating_popular_book = rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_book.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,totalRatingCount
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,60


In [15]:
len(set(rating_popular_book['bookTitle']))

2444

In [16]:
# US와 Canada에서 출간된 책만을 선택
combined = rating_popular_book.merge(user, left_on = 'userID', right_on = 'userID', how = 'left')


In [17]:
# test
combined[combined['Location'].str.contains("use|canada")]

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,totalRatingCount,Location,Age
37,173743,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60,"gloucester, massachusetts, usa",35.000
45,220502,034545104X,3,Flesh Tones: A Novel,M. J. Rose,60,"dedham, massachusetts, usa",
73,24878,0446520802,7,The Notebook,Nicholas Sparks,650,"ottawa, ontario, canada",
75,27617,0446520802,9,The Notebook,Nicholas Sparks,650,"beaverton, ontario, canada",32.000
76,28204,0446520802,0,The Notebook,Nicholas Sparks,650,"south ohio, nova scotia, canada",61.000
...,...,...,...,...,...,...,...,...
288694,271476,0445046562,7,Thursday's Child,Victoria Poole,56,"thorold, ontario, canada",
288700,263877,0451211081,0,A Day Late and a Dollar Short,Terry McMillan,116,"victoria, british columbia, canada",44.000
288701,263877,0613335864,0,False Memory,Dean Koontz,227,"victoria, british columbia, canada",44.000
288702,263918,0453009603,0,Waiting to Exhale,Terry McMillan,134,"victoria, british columbia, canada",59.000


In [18]:
us_canada_user_rating = combined[combined['Location'].str.contains("usa|canada")]
us_canada_user_rating=us_canada_user_rating.drop('Age', axis=1)
us_canada_user_rating.head()

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60,"cincinnati, ohio, usa"
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60,"strafford, missouri, usa"
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60,"st. charles county, missouri, usa"
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,60,"beaverton, oregon, usa"


# 입력 데이터 만들기

In [19]:
if not us_canada_user_rating[us_canada_user_rating.duplicated(['userID', 'bookTitle'])].empty:
    initial_rows = us_canada_user_rating.shape[0]

    print('Initial dataframe shape {0}'.format(us_canada_user_rating.shape))
    us_canada_user_rating = us_canada_user_rating.drop_duplicates(['userID', 'bookTitle'])
    current_rows = us_canada_user_rating.shape[0]
    print('New dataframe shape {0}'.format(us_canada_user_rating.shape))
    print('Removed {0} rows'.format(initial_rows - current_rows))

Initial dataframe shape (251615, 7)
New dataframe shape (248949, 7)
Removed 2666 rows


In [20]:
us_canada_user_rating

Unnamed: 0,userID,ISBN,bookRating,bookTitle,bookAuthor,totalRatingCount,Location
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60,"tyler, texas, usa"
1,2313,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60,"cincinnati, ohio, usa"
2,6543,034545104X,0,Flesh Tones: A Novel,M. J. Rose,60,"strafford, missouri, usa"
3,8680,034545104X,5,Flesh Tones: A Novel,M. J. Rose,60,"st. charles county, missouri, usa"
4,10314,034545104X,9,Flesh Tones: A Novel,M. J. Rose,60,"beaverton, oregon, usa"
...,...,...,...,...,...,...,...
288735,274061,1551663147,10,Fever,Elizabeth Lowell,68,"gahanna/columbus, ohio, usa"
288736,274808,0449701913,0,Homecoming,Cynthia Voigt,137,"paynesville, , usa"
288737,275970,0865714215,0,Stormy Weather,Guy Dauncey,103,"pittsburgh, pennsylvania, usa"
288738,275970,1586210661,9,Me Talk Pretty One Day,David Sedaris,457,"pittsburgh, pennsylvania, usa"


In [21]:
# 책-사용자 점수 matrix (선정한 컬럼을 가지고 테이블을 만들어줌 / 행,열,값 3개의 컬럼이 필요 => .pivot 이라는 함수를 사용)
us_canada_user_rating_pivot = us_canada_user_rating.pivot(index = 'bookTitle', columns = 'userID', values = 'bookRating').fillna(0)
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [22]:
us_canada_user_rating_pivot.head()

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
us_canada_user_rating_pivot.shape

(2442, 40017)

In [24]:
# 데이터 타입: Compressed Sparse Row matrix
us_canada_user_rating_matrix

<2442x40017 sparse matrix of type '<class 'numpy.float64'>'
	with 97607 stored elements in Compressed Sparse Row format>

# Item에 kNN 적용

In [25]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric = 'cosine', algorithm = 'brute')
model_knn.fit(us_canada_user_rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [26]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
print(query_index)
us_canada_user_rating_pivot.iloc[query_index, :]

764


userID
8        0.000
9        0.000
14       0.000
16       0.000
17       0.000
          ... 
278843   0.000
278844   0.000
278846   0.000
278851   0.000
278854   0.000
Name: Grave Secrets, Length: 40017, dtype: float64

In [27]:
#print(us_canada_user_rating_pivot.iloc[query_index, :].values)
#print(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1))

In [28]:
#ttemp = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)[0]
#ttemp[ttemp > 0]

In [29]:
query_index = np.random.choice(us_canada_user_rating_pivot.shape[0])
distances, indices = model_knn.kneighbors(us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1), n_neighbors = 6)
print(query_index)
for i in range(0, len(distances.flatten())):
    if i == 0:
        print('Recommendations for {0}:\n'.format(us_canada_user_rating_pivot.index[query_index]))
    else:
        print('{0}: {1}, with distance of {2}:'.format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i]))

1962
Recommendations for The Kitchen God's Wife:

1: The Joy Luck Club, with distance of 0.8924720231590498:
2: The Bonesetter's Daughter, with distance of 0.9047940091889176:
3: The Hundred Secret Senses, with distance of 0.9077599420280281:
4: Talk Before Sleep: A Novel, with distance of 0.9158752431327916:
5: Anne of Ingleside (Anne of Green Gables Novels (Paperback)), with distance of 0.9176288685747935:


In [30]:
print(indices)
print(distances)

[[1962 1951 1719 1940 1646  170]]
[[0.         0.89247202 0.90479401 0.90775994 0.91587524 0.91762887]]


# Item-based Collaborative Filtering Using Matrix Factorization

In [31]:
# 이번에는 사용자가 row에 오도록
us_canada_user_rating_pivot2 = us_canada_user_rating.pivot(index = 'userID', columns = 'bookTitle', values = 'bookRating').fillna(0)

In [32]:
us_canada_user_rating_pivot2.head()

bookTitle,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
us_canada_user_rating_pivot2.shape

(40017, 2442)

In [34]:
us_canada_user_rating_pivot2.columns

Index(['10 Lb. Penalty', '16 Lighthouse Road', '1984', '1st to Die: A Novel',
       '2010: Odyssey Two', '204 Rosewood Lane', '2061: Odyssey Three',
       '24 Hours', '2nd Chance', '3rd Degree',
       ...
       'YOU BELONG TO ME', 'Year of Wonders', 'You Belong To Me',
       'You Shall Know Our Velocity', 'Young Wives',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"',
       '\Surely You're Joking, Mr. Feynman!\": Adventures of a Curious Character"',
       'stardust'],
      dtype='object', name='bookTitle', length=2442)

In [35]:
X = pd.DataFrame(us_canada_user_rating_pivot2.values.T)
X.shape
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40007,40008,40009,40010,40011,40012,40013,40014,40015,40016
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
#us_canada_user_rating_pivot2.T

In [37]:
# X의 컬럼과 인덱스 명 설정
X.columns = us_canada_user_rating_pivot2.index
X.index = us_canada_user_rating_pivot2.columns
X.head()

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# SVD 사용
import sklearn
from sklearn.decomposition import TruncatedSVD

SVD = TruncatedSVD(n_components=12, random_state=17)
matrix = SVD.fit_transform(X)
matrix.shape

(2442, 12)

In [39]:
# correlation coefficient matrix
import warnings
warnings.filterwarnings("ignore",category =RuntimeWarning)
corr = np.corrcoef(matrix)
corr.shape

(2442, 2442)

In [40]:
us_canada_book_title = us_canada_user_rating_pivot2.columns
us_canada_book_list = list(us_canada_book_title)
coffey_hands = us_canada_book_list.index("The Green Mile: Coffey's Hands (Green Mile Series)")
print(coffey_hands)

1906


In [41]:
corr_coffey_hands  = corr[coffey_hands]

In [42]:
print(corr_coffey_hands)

[0.19731361 0.13296355 0.46626909 ... 0.64458236 0.07844602 0.04131424]


In [43]:
list(us_canada_book_title[(corr_coffey_hands<1.0) & (corr_coffey_hands>0.9)])

['Needful Things',
 'The Bachman Books: Rage, the Long Walk, Roadwork, the Running Man',
 'The Green Mile: Coffey on the Mile (Green Mile Series)',
 'The Green Mile: Night Journey (Green Mile Series)',
 'The Green Mile: The Bad Death of Eduard Delacroix (Green Mile Series)',
 'The Green Mile: The Mouse on the Mile (Green Mile Series)',
 'The Shining',
 'The Two Dead Girls (Green Mile Series)']

# Collaborative Filtering Using NMF

In [44]:
X

userID,8,9,14,16,17,23,26,32,39,42,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
16 Lighthouse Road,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1984,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
1st to Die: A Novel,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
2010: Odyssey Two,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
Zoya,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"\O\"" Is for Outlaw""",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,...,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000


In [45]:
# user별로 평가한 아이템 개수
review_count = pd.DataFrame(X[X > 0].count())


In [46]:
# user별로 평가한 아이템 개수가 20개를 넘는 경우의 user id list
id_selected = review_count[review_count[0] > 20].index

In [47]:
# 위의 선택된 id에 해당하는 컬럼만을 모아둔 것 (X에서)
X2 = X[id_selected]

In [48]:
X2.head()

userID,638,2276,2766,4017,5582,6073,6242,6251,6323,6543,...,273979,274061,274308,275970,276050,276165,276231,276680,277427,278633
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# NMF 모델 객체 생성
from sklearn.decomposition import NMF
model = NMF(n_components=200)

In [50]:
# NMF 모델 학습 (Book-User Matix에 대해)
# W는 Book-Cluster matrix
W = model.fit_transform(X2)

In [51]:
# H는 Cluster-User matrix
H = model.components_

In [52]:
H.shape

(200, 692)

In [53]:
# H를 dataframe으로 바꾸고 X 컬럼명을 컬럼명으로 세팅
H = pd.DataFrame(np.round(model.components_,2), columns=X2.columns)

In [54]:
# W를 dataframe으로 바꾸고 X row명을 row명으로 세팅
W = pd.DataFrame(np.round(model.transform(X2),2))
W.index = X.index

In [55]:
H.head()  #user 클러스터 매트릭스

userID,638,2276,2766,4017,5582,6073,6242,6251,6323,6543,...,273979,274061,274308,275970,276050,276165,276231,276680,277427,278633
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.04
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.82,1.62,0.0,4.46,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
W.head()  #item 클러스터 매트릭스

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.01,0.0,0.0,0.0,0.03,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.85,0.0,0.0,0.0,0.14,0.0,0.0,0.0,0.0,...,0.06,0.0,0.0,0.01,0.0,0.0,0.02,0.0,0.0,0.0
1984,0.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.19,0.18,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.46,0.0,0.01,0.0,0.03,0.0,0.0,0.0,0.16,0.0,...,0.89,0.88,0.0,0.0,0.0,0.66,0.0,0.0,0.16,0.71
2010: Odyssey Two,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
# X matrix를 W*H로 reconstruction하여 reconstructed에 저장 colums, index 세팅
reconstructed = pd.DataFrame(np.round(np.dot(W,H),2), columns=X2.columns)
reconstructed.index = X2.index

In [58]:
#최종적인 매트릭스는 이렇게 만들어짐 (아까는 0이 매우 많았는데, 이제는 0이 아닌애들이 미묘하게 존재)
#관심이 있을 것 같은 책들에 대해서 점수를 내줌
reconstructed 

userID,638,2276,2766,4017,5582,6073,6242,6251,6323,6543,...,273979,274061,274308,275970,276050,276165,276231,276680,277427,278633
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.000,0.000,0.060,0.000,0.000,0.000,0.000,0.010,0.000,0.000,...,0.000,0.000,0.020,0.020,0.050,0.010,0.030,0.000,0.000,0.000
16 Lighthouse Road,0.020,0.030,0.010,0.000,0.000,0.000,0.000,0.000,0.000,0.170,...,0.010,0.000,0.140,0.010,0.010,0.000,0.040,0.000,0.000,0.000
1984,0.630,0.260,0.130,0.010,0.030,0.180,0.010,0.000,0.380,0.000,...,0.280,0.000,0.830,0.340,0.120,0.650,0.120,0.400,0.310,0.060
1st to Die: A Novel,1.600,1.390,0.580,0.100,0.040,0.070,0.510,0.020,0.460,9.630,...,0.340,0.240,0.140,0.830,0.510,0.250,0.520,0.560,0.000,0.150
2010: Odyssey Two,0.140,0.050,0.010,0.000,0.000,0.000,0.040,0.570,0.000,0.000,...,0.040,0.000,0.010,0.060,0.280,0.510,0.000,0.040,0.000,0.100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.070,0.030,0.050,0.000,0.020,0.070,0.270,0.030,0.170,0.330,...,0.010,0.000,0.120,0.130,0.020,0.130,0.370,0.090,0.220,0.020
Zoya,0.200,0.030,0.350,0.000,0.000,0.040,0.000,0.000,0.000,0.000,...,0.000,0.000,0.020,0.000,0.180,0.020,0.010,0.000,0.000,0.220
"\O\"" Is for Outlaw""",0.010,0.100,0.940,0.000,0.000,0.000,0.020,0.000,0.830,0.000,...,0.370,0.010,0.000,0.200,0.760,0.000,0.010,0.040,0.010,0.060
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.230,0.010,0.480,0.000,0.000,0.340,0.020,0.000,0.060,0.000,...,0.000,0.000,0.010,0.960,0.000,0.000,0.000,0.000,0.000,0.020


In [59]:
# user id 638 에 대한 reconstructed 점수 중 3이상인 것들 
# user 638이 관심있을 것으로 예측한 책들을 알아보자
temp = pd.DataFrame(reconstructed[reconstructed[638] > 3][638])

In [60]:
len(pd.DataFrame(reconstructed[reconstructed[638] > 3][638]))

8

In [61]:
#638번이 좋아할 것으로 예측되는 책
temp 

Unnamed: 0_level_0,638
bookTitle,Unnamed: 1_level_1
2nd Chance,3.09
A Walk to Remember,4.62
Suzanne's Diary for Nicholas,8.18
The Da Vinci Code,9.77
The Guardian,3.88
The Lovely Bones: A Novel,10.62
The Notebook,5.26
The Rescue,3.99


In [62]:
#pd.DataFrame.sort_values(by=temp, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last')
temp.sort_values(by=638,ascending=False)

Unnamed: 0_level_0,638
bookTitle,Unnamed: 1_level_1
The Lovely Bones: A Novel,10.62
The Da Vinci Code,9.77
Suzanne's Diary for Nicholas,8.18
The Notebook,5.26
A Walk to Remember,4.62
The Rescue,3.99
The Guardian,3.88
2nd Chance,3.09


In [63]:
# user id 638이 원래 평가한 점수 중 0 이상인 것들 
pd.DataFrame(X[X[638] > 0][638])

Unnamed: 0_level_0,638
bookTitle,Unnamed: 1_level_1
2nd Chance,9.0
A Map of the World,10.0
A Time to Kill,9.0
A Walk to Remember,9.0
Angel Falls,8.0
Barrel Fever : Stories and Essays (Barrel Fever),10.0
Big Trouble,9.0
Blackbird: A Childhood Lost and Found,9.0
Bridget Jones : The Edge of Reason,7.0
Chicken Soup for the College Soul : Inspiring and Humorous Stories for College Students (Chicken Soup for the Soul),10.0


In [64]:
# user별로 평가한 아이템 개수
review_count = pd.DataFrame(X[X > 0].count())

In [65]:
review_count[100:200]

Unnamed: 0_level_0,0
userID,Unnamed: 1_level_1
763,3
765,1
769,1
771,1
774,1
...,...
1412,5
1427,1
1433,1
1434,1


## 이번에는 같은 내용을 User-Book Matrix를 이용하여 계산해본다.

In [66]:
X2_tr = X2.T

In [67]:
X2_tr.head()

bookTitle,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2276,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4017,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [68]:
model_tr = NMF(n_components=200)
W = model.fit_transform(X2_tr)
H = pd.DataFrame(np.round(model.components_,2), columns=X2_tr.columns)



In [69]:
reconstructed = pd.DataFrame(np.round(np.dot(W,H),2), columns=X2_tr.columns)
reconstructed.index = X2_tr.index

In [70]:
reconstructed

bookTitle,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
638,0.010,0.060,0.590,0.700,0.020,0.150,0.190,0.370,2.410,0.460,...,0.100,0.200,0.100,0.090,0.030,0.070,0.010,0.010,0.180,0.180
2276,0.020,0.010,0.130,1.040,0.070,0.060,0.160,0.100,2.600,0.070,...,0.550,0.530,0.040,0.000,0.050,0.010,0.110,0.060,0.000,0.010
2766,0.070,0.010,0.150,0.820,0.010,0.180,0.020,0.000,0.110,0.000,...,0.000,0.110,0.030,0.020,0.010,0.090,0.010,1.010,0.500,0.110
4017,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.000,0.040,0.000,...,0.000,0.150,0.000,0.000,0.300,0.000,0.000,0.150,0.090,0.000
5582,0.000,0.000,0.020,0.010,0.000,0.000,0.000,0.000,0.160,0.010,...,0.010,0.150,0.070,0.000,0.000,0.190,0.000,0.000,0.000,0.080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276165,0.000,0.030,0.450,0.220,1.660,1.790,0.500,0.190,0.070,1.370,...,0.010,0.290,0.070,0.090,0.010,0.110,0.000,0.000,0.000,0.230
276231,0.020,0.060,0.070,0.560,0.010,0.080,0.000,0.030,1.490,0.090,...,0.840,0.010,0.070,0.000,0.050,0.440,0.040,0.010,0.000,0.020
276680,0.010,0.010,0.520,0.490,0.050,0.080,0.010,0.680,0.030,0.060,...,0.000,0.370,0.130,0.090,0.020,0.060,0.000,0.040,0.000,0.020
277427,0.130,0.000,0.060,0.330,0.000,0.000,0.000,9.020,0.000,0.000,...,0.000,0.050,0.020,0.020,0.000,0.200,0.000,0.000,0.000,0.000


In [71]:
reconstructed2 = reconstructed.T

In [72]:
reconstructed2

userID,638,2276,2766,4017,5582,6073,6242,6251,6323,6543,...,273979,274061,274308,275970,276050,276165,276231,276680,277427,278633
bookTitle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.010,0.020,0.070,0.000,0.000,0.010,0.010,0.000,0.000,0.000,...,0.010,0.000,0.030,0.030,0.040,0.000,0.020,0.010,0.130,0.010
16 Lighthouse Road,0.060,0.010,0.010,0.000,0.000,0.000,0.000,0.000,0.010,0.200,...,0.010,0.000,0.160,0.010,0.010,0.030,0.060,0.010,0.000,0.000
1984,0.590,0.130,0.150,0.000,0.020,0.550,0.050,0.000,0.440,0.000,...,0.300,0.290,0.580,0.840,0.030,0.450,0.070,0.520,0.060,0.010
1st to Die: A Novel,0.700,1.040,0.820,0.000,0.010,0.140,0.220,0.010,0.470,9.510,...,0.290,0.270,0.170,0.600,0.420,0.220,0.560,0.490,0.330,0.490
2010: Odyssey Two,0.020,0.070,0.010,0.000,0.000,0.020,0.040,0.660,0.050,0.000,...,0.030,0.000,0.050,0.060,0.030,1.660,0.010,0.050,0.000,0.200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.070,0.010,0.090,0.000,0.190,0.040,0.280,0.000,0.140,0.100,...,0.020,0.000,0.100,0.140,0.010,0.110,0.440,0.060,0.200,0.030
Zoya,0.010,0.110,0.010,0.000,0.000,0.060,0.000,0.090,0.020,0.000,...,0.010,0.000,0.020,0.000,0.010,0.000,0.040,0.000,0.000,0.040
"\O\"" Is for Outlaw""",0.010,0.060,1.010,0.150,0.000,0.060,0.030,0.000,0.860,0.000,...,0.350,0.020,0.020,0.130,0.820,0.000,0.010,0.040,0.000,0.130
"\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",0.180,0.000,0.500,0.090,0.000,0.350,0.020,0.000,0.010,0.000,...,0.010,0.030,0.010,0.760,0.000,0.000,0.000,0.000,0.000,0.000


In [73]:
len(pd.DataFrame(reconstructed2[reconstructed2[638] > 2][638]))

44

In [74]:
temp = pd.DataFrame(reconstructed2[reconstructed2[638] > 2][638])
temp.sort_values(by=638,ascending=False)

Unnamed: 0_level_0,638
bookTitle,Unnamed: 1_level_1
The Lovely Bones: A Novel,10.58
A Walk to Remember,9.34
The Notebook,8.6
The Rescue,8.34
The Da Vinci Code,6.6
Message in a Bottle,6.38
The Guardian,5.05
Suzanne's Diary for Nicholas,4.41
Harry Potter and the Sorcerer's Stone (Book 1),3.75
Nights in Rodanthe,3.55
