In [1]:
import os
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from nltk import pos_tag, word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer
import re

In [2]:
# Pieces of the input path: <prefix>/<file_name><suffix>
prefix, file_name, suffix = 'data', 'Musical_Instruments', '_5.json'

# Display the resolved path; any backslashes produced by os.path.join are
# normalised to forward slashes.
os.path.join(prefix, "".join((file_name, suffix))).replace("\\", "/")

'data/Musical_Instruments_5.json'

In [3]:
# Read the line-delimited JSON file of reviews.
raw_path = os.path.join(prefix, file_name + suffix).replace("\\", "/")
df = pd.read_json(raw_path, lines=True)

# Keep only the four columns used below and give them working names
# (source column -> working column).
column_map = {
    'reviewerID': 'userID',
    'asin': 'itemID',
    'overall': 'ratings',
    'reviewText': 'reviewText',
}
df = df[list(column_map)]
df.columns = list(column_map.values())

# Keep the first row for every (userID, itemID) pair, then display how many
# duplicated pairs remain.
pair_key = ['userID', 'itemID']
df = df.drop_duplicates(subset=pair_key, keep='first')
df.duplicated(subset=pair_key).sum()

0

In [4]:
# Erase null reviews: a row is dropped when its reviewText is not a str or is
# an empty string.
blank_mask = [
    not (isinstance(text, str) and len(text) > 0)
    for text in df['reviewText']
]
blank_index = df[blank_mask].index
df = df.drop(blank_index)

In [5]:
# English stop words. Stored as a set so that each `word not in stop_words`
# membership test is a constant-time hash lookup rather than a linear scan
# over a list; the filtering result is unchanged.
stop_words = set(stopwords.words('english'))

# Lemmatizer instance; not used by the active cleaning code in this section.
lemmatizer = WordNetLemmatizer()

# Tokenizer that yields runs of word characters, which drops punctuation.
tokenizer = RegexpTokenizer(r'\w+')

def clean_review(review):
    """Normalise one review string.

    The text is lower-cased, split into word-character tokens by the
    module-level ``tokenizer`` (which removes punctuation), filtered against
    the module-level ``stop_words``, and re-joined with single spaces.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    tokens = tokenizer.tokenize(review.lower())

    kept = []
    for token in tokens:
        if token in stop_words:
            continue  # stop word: skip it
        kept.append(token)

    # Disabled experiment: POS-aware lemmatization of the kept tokens.
    #     lemmas = []
    #     for word, tag in pos_tag(kept):
    #         wntag = tag[0].lower()
    #         wntag = wntag if wntag in ['a', 'r', 'n', 'v'] else None
    #         lemma = lemmatizer.lemmatize(word, wntag) if wntag else word
    #         lemmas.append(lemma)
    #     return lemmas

    return ' '.join(kept)

In [6]:
# Replace every review with its cleaned form (see clean_review).
df['reviewText'] = df['reviewText'].map(clean_review)

In [7]:
# For testing the cleaned review text

# df2 = df.copy()
# df2['reviewText'] = df2['reviewText'].apply(clean_review)
# df2.head()

In [8]:
# Re-index users and items as consecutive integers starting at 0, keeping both
# the forward (raw ID -> integer) and inverse (integer -> raw ID) lookups.
# (An earlier variant of this cell numbered from 1 instead of 0.)

user = df.userID.unique()
user_dict = {raw_id: idx for idx, raw_id in enumerate(user)}
inverse_user_dict = {idx: raw_id for raw_id, idx in user_dict.items()}

item = df.itemID.unique()
item_dict = {raw_id: idx for idx, raw_id in enumerate(item)}
inverse_item_dict = {idx: raw_id for raw_id, idx in item_dict.items()}

# Replace the raw string IDs in the frame with their integer codes.
df['userID'] = df['userID'].map(user_dict)
df['itemID'] = df['itemID'].map(item_dict)

In [9]:
# Alternative: map each user (or item) to an integer ID with groupby().ngroup()
# df['userID'] = df.groupby(df['userID']).ngroup()
# df['itemID'] = df.groupby(df['itemID']).ngroup()

In [10]:
# Persist the float32 cast on the column: astype returns a new object, so the
# result has to be assigned back for the dtype change to reach the frame (and
# the CSV files written from it). The last line displays the converted values.
df['ratings'] = df['ratings'].astype(np.float32)
df['ratings'].values

array([5., 5., 5., ..., 4., 1., 5.], dtype=float32)

In [11]:
# Split into train / valid / test (60% / 20% / 20%): hold out 40% of the rows,
# then cut the hold-out in half. Both calls use the same fixed random_state.
train, holdout = train_test_split(df, random_state=111, test_size=0.4)
valid, test = train_test_split(holdout, random_state=111, test_size=0.5)


In [12]:
# Write each split next to the input data as <prefix>/<file_name><ending>,
# without the index column.
for ending, split_df in (('_train.csv', train),
                         ('_valid.csv', valid),
                         ('_test.csv', test)):
    out_path = os.path.join(prefix, file_name + ending).replace("\\", "/")
    split_df.to_csv(out_path, index=False)