# Data Preprocessing

In [14]:
# Install the surprise package
!pip install -q -U scikit-surprise
from surprise import Dataset, Reader
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import arff
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leonie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Data

In [15]:
# Read a JSON-lines file chunk by chunk (useful for large datasets or when RAM is scarce)
def read_and_process_json_in_chunks(path, chunksize=10000, dtype=None, columns=None):
    """Load a JSON-lines file into one DataFrame, parsing it ``chunksize`` lines at a time.

    Parameters
    ----------
    path : str or path-like
        Location of the JSON-lines file; passed unchanged to ``pd.read_json``.
    chunksize : int, default 10000
        Number of lines parsed per chunk.
    dtype : dict or bool, optional
        Forwarded to ``pd.read_json`` as its ``dtype`` argument.
    columns : iterable of str, optional
        If given, only these columns are kept from every chunk (names that a
        chunk does not contain are skipped). Dropping unneeded columns before
        the chunks are concatenated lowers peak memory use. ``None`` keeps
        every column.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated with a fresh 0..n-1 index. An empty DataFrame
        is returned when the file yields no chunks.
    """
    chunks = []
    # Use the reader as a context manager so the file handle is released even
    # when parsing fails part-way through the file.
    with pd.read_json(path, lines=True, dtype=dtype, chunksize=chunksize) as reader:
        for chunk in reader:
            if columns is not None:
                chunk = chunk[[col for col in columns if col in chunk.columns]]
            chunks.append(chunk)
    # pd.concat raises on an empty list, so handle "no records" explicitly
    if not chunks:
        return pd.DataFrame(columns=columns)
    return pd.concat(chunks, ignore_index=True)

In [32]:
# Path to the Kindle Store ratings file (gzip-compressed JSON, per its extension)
path_ratings = os.path.expanduser('../data/Kindle_Store_5.json.gz')
# Load the ratings with explicit dtypes for the ID, rating and review-time columns
ratings = read_and_process_json_in_chunks(
    path_ratings,
    dtype=dict(reviewerID=str, asin=str, overall=int, reviewTime=str),
)

ValueError: Could not reserve memory block

In [16]:
# Path to the Kindle Store product metadata file
path_meta = os.path.expanduser('../data/meta_Kindle_Store.json.gz')
# Load the metadata from path_meta (previously path_ratings was passed here, so the
# ratings file was read a second time); 'category' is requested as str
meta_data = read_and_process_json_in_chunks(path_meta, dtype={'category': str})

In [17]:
# Independent (deep) copy of the loaded ratings, used by the preprocessing steps
preprocessing_ratings = ratings.copy(deep=True)

In [18]:
# Independent (deep) copy of the loaded metadata, used by the preprocessing steps
preprocessing_meta = meta_data.copy(deep=True)

## Preprocessing Ratings

In [19]:
# Rename the 'overall' column to 'rating' so its meaning is clearer
preprocessing_ratings.rename({'overall': 'rating'}, axis='columns', inplace=True)

In [20]:
# Keep only Kindle Edition ratings (removes Paperback, Audible, Hardcover, MP3 CD, ...)
if 'style' in preprocessing_ratings.columns:
    # The search text is a literal that contains regex metacharacters ('{', '}'),
    # so match it as a plain substring (regex=False) rather than relying on how
    # the regex engine happens to interpret the braces.
    kindle_mask = preprocessing_ratings['style'].astype(str).str.contains(
        "{'Format:': ' Kindle Edition'}", regex=False
    )
    # Drop the 'style' column afterwards, as we assume all remaining ratings
    # are for products in the Kindle Edition
    preprocessing_ratings = preprocessing_ratings[kindle_mask].drop(columns=['style'])

In [29]:
# Keep only verified reviews; rows whose 'verified' value is not True are removed
if 'verified' in preprocessing_ratings.columns:
    preprocessing_ratings = preprocessing_ratings[preprocessing_ratings['verified'].eq(True)]

# Columns removed from the ratings frame. 'verified' is included because, after the
# filter above, all remaining ratings are verified.
columns_to_drop = [
    'verified',
    'unixReviewTime',
    'reviewTime',
    'image',
    'reviewText',
    'summary',
    'reviewerName',
    'vote',
]
# errors='ignore' skips names that are not present, replacing the per-column
# existence checks. Reassigning the result (rather than inplace=True on a filtered
# frame) avoids SettingWithCopyWarning and keeps the cell safe to re-run.
preprocessing_ratings = preprocessing_ratings.drop(columns=columns_to_drop, errors='ignore')

In [30]:
# Remove fully duplicated rating rows in place, keeping the first occurrence and the existing index labels
preprocessing_ratings.drop_duplicates(inplace=True)

In [31]:
# Show the first five rows of the preprocessed ratings
preprocessing_ratings.head(n=5)

Unnamed: 0,rating,reviewerID,asin
0,4.0,A2LSKD2H9U8N0J,B000FA5KK0
1,5.0,A2QP13XTJND1QS,B000FA5KK0
2,5.0,A8WQ7MAG3HFOZ,B000FA5KK0
3,5.0,A1E0MODSRYP7O,B000FA5KK0
4,5.0,AYUTCGVSM1H7T,B000FA5KK0
