In [1]:
import gzip
import numpy as np
import pandas as pd

## Importing and prelim column selection

In [2]:
# Function to process a chunk of data
def process_data(chunk, columns=None):
    # If columns is not None, keep only those columns
    if columns is not None:
        chunk = chunk[columns]
    return chunk

# Function to read data in chunks and process each chunk
def load_data(file_name, head = None, columns=None, chunksize = 1000):
    chunks = []
    count = 0
    with gzip.open(file_name) as fin:
        for chunk in pd.read_json(fin, lines=True, chunksize=chunksize):
            # Process the chunk
            processed_chunk = process_data(chunk, columns)
            chunks.append(processed_chunk)
            
            count += 1
            # break if reaches the head-th chunk
            if (head is not None) and (count > head):
                break

    # Combine all chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)
    
    return df

In [3]:
DIR = 'C:\\Users\\jesse\\Desktop\\Honors Project\\goodreads_data\\raw\\'
ya_reviews = load_data(DIR + 'goodreads_reviews_young_adult.json.gz', head = 1)

In [4]:
ya_reviews

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,2767052,248c011811e945eca861b5c31a549291,5,I cracked and finally picked this up. Very enj...,Wed Jan 13 13:38:25 -0800 2010,Wed Mar 22 11:46:36 -0700 2017,Sun Mar 25 00:00:00 -0700 2012,Fri Mar 23 00:00:00 -0700 2012,24,25
1,7504b2aee1ecb5b2872d3da381c6c91e,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,I read this book because my fifth grade son wa...,Wed Jan 21 18:40:59 -0800 2015,Wed Oct 26 03:44:13 -0700 2016,,,0,0
2,f8a89075dc6de14857561522e729f82c,18053080,785c8db878f4009da9741dea51f641da,4,Though the book started out slow and only star...,Sat Jan 11 17:58:41 -0800 2014,Tue Dec 02 11:43:07 -0800 2014,Sat Apr 12 00:00:00 -0700 2014,Fri Apr 11 00:00:00 -0700 2014,0,0
3,f8a89075dc6de14857561522e729f82c,17383543,34dc3c45d07e82718b05e73167259aef,2,"*Update - 10/27/13* - After some sleep, I thin...",Sun Apr 21 19:42:28 -0700 2013,Fri Aug 15 07:55:01 -0700 2014,Sat Oct 26 00:00:00 -0700 2013,Fri Oct 25 00:00:00 -0700 2013,0,0
4,f8a89075dc6de14857561522e729f82c,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"This is a moving, heartbreaking, view into a l...",Fri Jan 11 11:42:42 -0800 2013,Fri Mar 01 09:31:01 -0800 2013,Mon Jan 14 00:00:00 -0800 2013,Sat Jan 12 00:00:00 -0800 2013,0,0
...,...,...,...,...,...,...,...,...,...,...,...
1995,d1e368a7d2870eb6fbf6e0d350568a2d,11235712,3e0f18f959ee2a25f82afc04c56e9111,4,I really like this twist on the classic Cinder...,Sun Jun 17 11:48:14 -0700 2012,Mon Oct 20 08:16:14 -0700 2014,Sun Oct 19 00:00:00 -0700 2014,Mon Oct 13 00:00:00 -0700 2014,0,0
1996,d1e368a7d2870eb6fbf6e0d350568a2d,12810834,008569bed43fcee3313c574266fc0b05,3,Interesting and well written. Starts a little ...,Thu Apr 26 16:43:55 -0700 2012,Fri Aug 25 11:25:03 -0700 2017,Tue May 01 00:00:00 -0700 2012,Thu Apr 26 00:00:00 -0700 2012,0,0
1997,d1e368a7d2870eb6fbf6e0d350568a2d,27183386,31f06dc1af01a471637d83123150a32f,3,"It took me 2 tries to finish this book, but I ...",Fri Mar 30 19:45:28 -0700 2012,Wed Apr 26 17:43:12 -0700 2017,Wed Apr 26 00:00:00 -0700 2017,Sat Apr 22 00:00:00 -0700 2017,0,0
1998,d1e368a7d2870eb6fbf6e0d350568a2d,8306857,a7aded22d01dcadf7b75ed595f1764c9,5,Wow! If you're looking for the next Hunger Gam...,Fri Mar 30 19:41:19 -0700 2012,Sat May 28 09:47:32 -0700 2016,Sun Apr 01 10:07:50 -0700 2012,Fri Mar 30 00:00:00 -0700 2012,0,0


Keep from ya_reviews:
['book_id', 'review_id', 'rating', 'review_text', 'n_votes', 'n_comments']

In [5]:
ya_reviews = load_data(DIR + 'goodreads_reviews_young_adult.json.gz', columns = ['book_id','review_id', 'rating', 'review_text', 'n_votes', 'n_comments'])
ya_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2389900 entries, 0 to 2389899
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   book_id      int64 
 1   review_id    object
 2   rating       int64 
 3   review_text  object
 4   n_votes      int64 
 5   n_comments   int64 
dtypes: int64(4), object(2)
memory usage: 109.4+ MB


In [6]:
#check for duplicate review_id in ya_reviews
ya_reviews['review_id'].duplicated().sum()

0

In [7]:
#print out the min and max for ya_reviews['book_id'] and rating and n_votes and n_comments
#for int and float types, print out the min and max, followed by the column name
for col in ya_reviews.columns:
    if ya_reviews[col].dtype == 'int64' or ya_reviews[col].dtype == 'float64':
        print(ya_reviews[col].min(), ya_reviews[col].max(), col)

50 36524503 book_id
0 5 rating
-3 3942 n_votes
-3 922 n_comments


For memory reasons, downgrade to min int fit

In [8]:
ya_reviews['book_id'] = ya_reviews['book_id'].astype('int32')
ya_reviews['rating'] = ya_reviews['rating'].astype('int8')
ya_reviews['n_votes'] = ya_reviews['n_votes'].astype('int16')
ya_reviews['n_comments'] = ya_reviews['n_comments'].astype('int16')

In [9]:
#replace blank values with NaN
ya_reviews.replace('', np.nan, inplace=True)

In [10]:
ya_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2389900 entries, 0 to 2389899
Data columns (total 6 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   book_id      int32 
 1   review_id    object
 2   rating       int8  
 3   review_text  object
 4   n_votes      int16 
 5   n_comments   int16 
dtypes: int16(2), int32(1), int8(1), object(2)
memory usage: 57.0+ MB


no nulls

In [11]:
#check for nulls in ya_reviews
ya_reviews.isnull().sum()

book_id           0
review_id         0
rating            0
review_text    1246
n_votes           0
n_comments        0
dtype: int64

In [12]:
#drop rows with nulls
ya_reviews.dropna(inplace=True)

## Preprocessing review text

In [13]:
ya_reviews

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments
0,2767052,248c011811e945eca861b5c31a549291,5,I cracked and finally picked this up. Very enj...,24,25
1,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,I read this book because my fifth grade son wa...,0,0
2,18053080,785c8db878f4009da9741dea51f641da,4,Though the book started out slow and only star...,0,0
3,17383543,34dc3c45d07e82718b05e73167259aef,2,"*Update - 10/27/13* - After some sleep, I thin...",0,0
4,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"This is a moving, heartbreaking, view into a l...",0,0
...,...,...,...,...,...,...
2389895,6137154,1f6fc20b8187e8f36e12daf947b13e66,5,A really amazing book! It goes great with Grac...,0,0
2389896,10193062,fc86ebe129a7f14a09c0221c92f52f79,3,This book is better then the first. It has it'...,0,0
2389897,6186357,15d0f64aa991f270c490e1a4d5b4011b,2,I think this may have come from having expecta...,0,0
2389898,14740456,b048505c5b1e9ab8695afa3edce1b5d9,4,I generally liked the book. It had some parts ...,0,0


### Cleaning

For most part content in * * is not particularly valuable

In [64]:
#if a review has one or more words in all caps, add a column to the dataframe with a 1, else 0
ya_reviews['all_caps'] = ya_reviews['review_text'].str.contains(r'\b[A-Z]{2,}\b').astype(int)

In [65]:
#lowercase all text in review_text_clean
ya_reviews['review_text'] = ya_reviews['review_text'].str.lower()

In [66]:
#remove text in * _ * format in review_text to a new column
ya_reviews['review_text_clean'] = ya_reviews['review_text'].str.replace('\*.*?\*', '', regex=True)

While some quotes contain actual review content, particularly sarcasm, more often than not it is actual book quotes. Removing

Contractions

In [68]:
import contractions
#apply contractions fix to review_text_clean on the text level
ya_reviews['review_text_clean'] = ya_reviews['review_text_clean'].apply(lambda x: contractions.fix(x))

In [None]:
ya_reviews['review_text_clean'] = ya_reviews['review_text_clean'].replace(r'"[^"]*"', '', regex=True)

In [69]:
ya_reviews

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments,review_text_clean,all_caps
0,2767052,248c011811e945eca861b5c31a549291,5,i cracked and finally picked this up. very enjoyable quick read - couldn't put it down - it was like crack. \n i'm a bit bothered by the lack of backstory of how panem and the hunger games come ab...,24,25,i cracked and finally picked this up. very enjoyable quick read - could not put it down - it was like crack. \n i am a bit bothered by the lack of backstory of how panem and the hunger games come ...,0
1,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,i read this book because my fifth grade son was required to for school. i'm so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the t...,0,0,i read this book because my fifth grade son was required to for school. i am so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the ...,0
2,18053080,785c8db878f4009da9741dea51f641da,4,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0,0,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0
3,17383543,34dc3c45d07e82718b05e73167259aef,2,"*update - 10/27/13* - after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going t...",0,0,"- after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going to please the masses...",0
4,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. don't let the name ""butter"" cause you to overlook reading this book. you see the life of butter from h...",0,0,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. do not let the name ""butter"" because you to overlook reading this book. you see the life of butter fro...",0
...,...,...,...,...,...,...,...,...
2389895,6137154,1f6fc20b8187e8f36e12daf947b13e66,5,a really amazing book! it goes great with graceling. it's connected to it in a way you don't realize until the end of either books. it really is amazing and semi-mind twisting.,0,0,a really amazing book! it goes great with graceling. it is connected to it in a way you do not realize until the end of either books. it really is amazing and semi-mind twisting.,0
2389896,10193062,fc86ebe129a7f14a09c0221c92f52f79,3,this book is better then the first. it has it's slow points \n (mostly the romance sections. i don't know why but it just doesn't interest me as much as it did in other books. might be because i l...,0,0,this book is better then the first. it has it is slow points \n (mostly the romance sections. i do not know why but it just does not interest me as much as it did in other books. might be because ...,0
2389897,6186357,15d0f64aa991f270c490e1a4d5b4011b,2,"i think this may have come from having expectations set to high. everything i heard and read i was expecting to be blown away and i simply wasn't. i really don't like thomas, i think he's an idiot...",0,0,"i think this may have come from having expectations set to high. everything i heard and read i was expecting to be blown away and i simply was not. i really do not like thomas, i think he is an id...",0
2389898,14740456,b048505c5b1e9ab8695afa3edce1b5d9,4,i generally liked the book. it had some parts that i felt were obvious but that is to be expected. the book also took many different turns that left me wanting to keep reading. the vampires are no...,0,0,i generally liked the book. it had some parts that i felt were obvious but that is to be expected. the book also took many different turns that left me wanting to keep reading. the vampires are no...,0


Stopword removal (NOT DOING)

In [70]:
# import nltk
# from nltk.corpus import stopwords
# from nltk.tokenize import word_tokenize
# nltk.download('stopwords')
# def remove_stopwords(text):
#     tokens = word_tokenize(text)
#     stop_words=['in','of','at','a','the','book','read','reading','to','and','is','it','was','for','this','that','with','on','as','be','but','have','are','from','so','if','an','or','will','can','my','they','their','them','there','then','than','which','by','about','after','before','between','because','while','during','after','some','i','we','he','she','it']
#     filtered_tokens = [word for word in tokens if not word in stop_words]
#     filtered_text = ' '.join(filtered_tokens)
#     return filtered_text

# ya_reviews['review_text_clean_sw'] = ya_reviews['review_text_clean'].apply(lambda x: remove_stopwords(x))

In [71]:
#set columns to a little wider and wrap text
pd.set_option('display.max_colwidth', 200)

In [73]:
ya_reviews

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments,review_text_clean,all_caps
0,2767052,248c011811e945eca861b5c31a549291,5,i cracked and finally picked this up. very enjoyable quick read - couldn't put it down - it was like crack. \n i'm a bit bothered by the lack of backstory of how panem and the hunger games come ab...,24,25,i cracked and finally picked this up. very enjoyable quick read - could not put it down - it was like crack. \n i am a bit bothered by the lack of backstory of how panem and the hunger games come ...,0
1,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,i read this book because my fifth grade son was required to for school. i'm so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the t...,0,0,i read this book because my fifth grade son was required to for school. i am so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the ...,0
2,18053080,785c8db878f4009da9741dea51f641da,4,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0,0,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0
3,17383543,34dc3c45d07e82718b05e73167259aef,2,"*update - 10/27/13* - after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going t...",0,0,"- after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going to please the masses...",0
4,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. don't let the name ""butter"" cause you to overlook reading this book. you see the life of butter from h...",0,0,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. do not let the name ""butter"" because you to overlook reading this book. you see the life of butter fro...",0
...,...,...,...,...,...,...,...,...
2389895,6137154,1f6fc20b8187e8f36e12daf947b13e66,5,a really amazing book! it goes great with graceling. it's connected to it in a way you don't realize until the end of either books. it really is amazing and semi-mind twisting.,0,0,a really amazing book! it goes great with graceling. it is connected to it in a way you do not realize until the end of either books. it really is amazing and semi-mind twisting.,0
2389896,10193062,fc86ebe129a7f14a09c0221c92f52f79,3,this book is better then the first. it has it's slow points \n (mostly the romance sections. i don't know why but it just doesn't interest me as much as it did in other books. might be because i l...,0,0,this book is better then the first. it has it is slow points \n (mostly the romance sections. i do not know why but it just does not interest me as much as it did in other books. might be because ...,0
2389897,6186357,15d0f64aa991f270c490e1a4d5b4011b,2,"i think this may have come from having expectations set to high. everything i heard and read i was expecting to be blown away and i simply wasn't. i really don't like thomas, i think he's an idiot...",0,0,"i think this may have come from having expectations set to high. everything i heard and read i was expecting to be blown away and i simply was not. i really do not like thomas, i think he is an id...",0
2389898,14740456,b048505c5b1e9ab8695afa3edce1b5d9,4,i generally liked the book. it had some parts that i felt were obvious but that is to be expected. the book also took many different turns that left me wanting to keep reading. the vampires are no...,0,0,i generally liked the book. it had some parts that i felt were obvious but that is to be expected. the book also took many different turns that left me wanting to keep reading. the vampires are no...,0


In [84]:
replacements = {
    ',': '',
    ' - ': ' ',
    ' -': ' ',
    '- ': ' ',
    '-': '',
    '\\(': '',
    '\\)': '',
    '\\:': '',
    '\\;': '',
    '\\[': '',
    '\\]': '',
    '\\{': '',
    '\\}': '',
    "\\'": '',
    "\\.": '',
    '\\&': 'and',
    '\\"': ''
}
ya_reviews['review_text_clean'] = ya_reviews['review_text_clean'].replace(replacements, regex=True)

In [75]:
#remove new lines from review_text_clean
ya_reviews['review_text_clean'] = ya_reviews['review_text_clean'].str.replace('\n', ' ')

In [76]:
ya_reviews

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments,review_text_clean,all_caps
0,2767052,248c011811e945eca861b5c31a549291,5,i cracked and finally picked this up. very enjoyable quick read - couldn't put it down - it was like crack. \n i'm a bit bothered by the lack of backstory of how panem and the hunger games come ab...,24,25,i cracked and finally picked this up very enjoyable quick read could not put it down it was like crack i am a bit bothered by the lack of backstory of how panem and the hunger games come about i...,0
1,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,i read this book because my fifth grade son was required to for school. i'm so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the t...,0,0,i read this book because my fifth grade son was required to for school i am so glad i did! i experienced a range of emotions and just loved it glad these middle schoolers are being exposed to the ...,0
2,18053080,785c8db878f4009da9741dea51f641da,4,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0,0,though the book started out slow and only started to get interesting towards page 100 overall it was worth the read,0
3,17383543,34dc3c45d07e82718b05e73167259aef,2,"*update - 10/27/13* - after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going t...",0,0,after some sleep i thinking about allegiant overall without the influence of other reviews i changed my rating to a 2 which is deserved i know that no author is going to please the masses i had...,0
4,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. don't let the name ""butter"" cause you to overlook reading this book. you see the life of butter from h...",0,0,"this is a moving heartbreaking view into a life of an obese 16 year old boy in high school do not let the name ""butter"" because you to overlook reading this book you see the life of butter from hi...",0
...,...,...,...,...,...,...,...,...
2389895,6137154,1f6fc20b8187e8f36e12daf947b13e66,5,a really amazing book! it goes great with graceling. it's connected to it in a way you don't realize until the end of either books. it really is amazing and semi-mind twisting.,0,0,a really amazing book! it goes great with graceling it is connected to it in a way you do not realize until the end of either books it really is amazing and semimind twisting,0
2389896,10193062,fc86ebe129a7f14a09c0221c92f52f79,3,this book is better then the first. it has it's slow points \n (mostly the romance sections. i don't know why but it just doesn't interest me as much as it did in other books. might be because i l...,0,0,this book is better then the first it has it is slow points mostly the romance sections i do not know why but it just does not interest me as much as it did in other books might be because i lik...,0
2389897,6186357,15d0f64aa991f270c490e1a4d5b4011b,2,"i think this may have come from having expectations set to high. everything i heard and read i was expecting to be blown away and i simply wasn't. i really don't like thomas, i think he's an idiot...",0,0,i think this may have come from having expectations set to high everything i heard and read i was expecting to be blown away and i simply was not i really do not like thomas i think he is an idiot...,0
2389898,14740456,b048505c5b1e9ab8695afa3edce1b5d9,4,i generally liked the book. it had some parts that i felt were obvious but that is to be expected. the book also took many different turns that left me wanting to keep reading. the vampires are no...,0,0,i generally liked the book it had some parts that i felt were obvious but that is to be expected the book also took many different turns that left me wanting to keep reading the vampires are not l...,0


noticing that some things are in weird combos of " ' and ' " so not getting properly removed UGH but gonna leave as is

In [83]:
#print reviews containing "" in review_text_clean
ya_reviews[ya_reviews['review_text_clean'].str.contains('\\"')]

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments,review_text_clean,all_caps
45,33807229,457bfda01cd106842baf08e78761aaef,4,"4.5 addictive stars!! \n ""i'm easton royal, superficial and only interested in how to have a good time. \n i don't want to look deep into my being and see the bottomless, black, boring pool of not...",3,0,45 addictive stars!! i am not a fan of overthetop drama or crazy plots so imagine my delight when i read fallen heir almost in one sitting! addictive funny and fastpaced this book might be my...,0
65,17558078,8a2b98d454aa820742aa6ee812560a0f,5,"breathtaking and ultimately beautiful. \n this is not just a simple retelling of hopeless! colleen hoover once again weaves an emotionally powerful, often painful and always beautiful story that y...",2,0,breathtaking and ultimately beautiful this is not just a simple retelling of hopeless! colleen hoover once again weaves an emotionally powerful often painful and always beautiful story that you ...,0
69,16151178,9a38a1c096cdcf4e069c4906efb52a5a,5,i am simply speechless. one of the best books i've ever read. where did katja millay go? she needs to write another book! a moving story about second chances and the power of unconditional love. t...,26,5,i am simply speechless one of the best books i have ever read where did katja millay go? she needs to write another book! a moving story about second chances and the power of unconditional love ts...,0
93,17465470,1c950084a419e045f156ffae157b68f6,3,"aarrggghhh!! i have to say once we got to ""the story' it was good but really - why did we have to go beyond lost stories - renamed #11 when it came out is paperback????? it was a fabulous wrap-up ...",0,0,"aarrggghhh!! i have to say once we got to it is a must"" !! now as i am sure all will/halt fans did we read them all so much rehashing from about book 8 onwards is/was really annoying let good st...",0
226,12127810,58ed16fc5189ddb7c3acd685a0946507,4,"october 11, 2013: i finally finished this wonderful book!!! \n happy! :) : \n -probably some people are tired of it, but i enjoy the ""fillers"" or the adventures they go through that distract their...",47,10,october 11 2013 i finally finished this wonderful book!!! happy! probably some people are tired of it but i enjoy the or the adventures they go through that distract their quest i guess the...,0
...,...,...,...,...,...,...,...,...
2387242,17623975,013c145e820b1b750c5ed27429887995,4,"'he showed me how to get lost, and then i showed myself how to get found"" - allyson",0,0,"he showed me how to get lost and then i showed myself how to get found"" allyson",0
2387808,8492825,a41824d0aa06c646e4f16c3173fdf15b,4,"where she went was written from adam's point of view, 3 years after mia and him parted ways. adam became a famous rock star and mia became a julliard cello star. \n all the time i was reading this...",0,0,where she went was written from adams point of view 3 years after mia and him parted ways adam became a famous rock star and mia became a julliard cello star all the time i was reading this i wa...,0
2387843,9644151,165fae114380f75595bb7b30b6d604d6,3,"while this book starts of too slowly to be a favorite, it has definite moments that make it worth reading. it seems to definitely have been inspired by some scenes in the road, so if you liked the...",0,0,while this book starts of too slowly to be a favorite it has definite moments that make it worth reading it seems to definitely have been inspired by some scenes in the road so if you liked the ro...,0
2388872,37732,e8b3f53934bfa59b4108aa2c3b1b4546,5,"judy blume, the sensational writer of popular children and young adult books. in her classic book, ""are you the god? it's me margaret,"" judy made it hard for me to forget her first written book an...",1,0,judy blume the sensational writer of popular children and young adult books in her classic book judy made it hard for me to forget her first written book and my first great read back in junior hi...,0


## Lemmitization

In [85]:
ya_reviews_lem = ya_reviews.copy()
ya_reviews_lem.head()

Unnamed: 0,book_id,review_id,rating,review_text,n_votes,n_comments,review_text_clean,all_caps
0,2767052,248c011811e945eca861b5c31a549291,5,i cracked and finally picked this up. very enjoyable quick read - couldn't put it down - it was like crack. \n i'm a bit bothered by the lack of backstory of how panem and the hunger games come ab...,24,25,i cracked and finally picked this up very enjoyable quick read could not put it down it was like crack i am a bit bothered by the lack of backstory of how panem and the hunger games come about i...,0
1,23302416,84c0936a0f9868f38e75d2f9a5cb761e,5,i read this book because my fifth grade son was required to for school. i'm so glad i did! i experienced a range of emotions & just loved it. glad these middle schoolers are being exposed to the t...,0,0,i read this book because my fifth grade son was required to for school i am so glad i did! i experienced a range of emotions and just loved it glad these middle schoolers are being exposed to the ...,0
2,18053080,785c8db878f4009da9741dea51f641da,4,"though the book started out slow and only started to get interesting towards page 100, overall, it was worth the read.",0,0,though the book started out slow and only started to get interesting towards page 100 overall it was worth the read,0
3,17383543,34dc3c45d07e82718b05e73167259aef,2,"*update - 10/27/13* - after some sleep, i thinking about allegiant overall without the influence of other reviews, i changed my rating to a 2 which is deserved. \n i know that no author is going t...",0,0,after some sleep i thinking about allegiant overall without the influence of other reviews i changed my rating to a 2 which is deserved i know that no author is going to please the masses i had...,0
4,16651458,d8d6b590780256fef7ae4a9550fe3e0d,5,"this is a moving, heartbreaking, view into a life of an obese 16 year old boy in high school. don't let the name ""butter"" cause you to overlook reading this book. you see the life of butter from h...",0,0,this is a moving heartbreaking view into a life of an obese 16 year old boy in high school do not let the name because you to overlook reading this book you see the life of butter from his eyes h...,0


In [89]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # Default to NOUN

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
    word_list = nltk.word_tokenize(text)
    pos_list = nltk.pos_tag(word_list)
    return ' '.join([lemmatizer.lemmatize(w[0], get_wordnet_pos(w[1])) for w in pos_list])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [91]:
ya_reviews['review_text_lemmatized'] = ya_reviews['review_text_clean'].apply(lemmatize_text)

KeyboardInterrupt: 