In [1]:
import pandas as pd
import numpy as np
import re
import spacy

In [3]:
dataset = pd.read_csv('../data/amazon_product_reviews/Reviews.csv')

In [4]:
amazon_dataset = dataset.loc[:,['Score','Text']]

In [5]:
amazon_dataset.rename(columns={'Text':'review'},inplace=True)

In [6]:
amazon_dataset['Score'] = amazon_dataset['Score'].apply(lambda x: 1 if x>=4 else 0)

In [7]:
from spacy.lang.en import English
from nltk.corpus import stopwords

In [8]:
stop_words1 = spacy.lang.en.stop_words.STOP_WORDS 
stop_words2 = stopwords.words('english')
stop_words = set(list(stop_words1) + list(stop_words2))

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
amazon_dataset.head()

Unnamed: 0,Score,review
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price. There was a wid...


In [11]:
def preprocessing(reviews):
    
    length = len(reviews)
    
    clean_review = []
    
    for idx, review in enumerate(reviews):
        
        percent = '{:.3f}'.format((idx/length)*100)
        
        if float(percent) in range(0,101,10):
            print(f'{percent}%')
    
        review = re.sub('<[^>]*>', '', review)  # html tag remove

        review = re.sub('[^a-zA-Z]',' ', review) # 특수 문자 제거

        words = [word.lemma_.lower() for word in nlp(review)] # 각 word를 소문자

        words = [w for w in words if w not in stop_words]

        words = [w for w in words if ' ' not in w]
        
        if len(words) <= 68:
            
            clean_review.append(words)
            
        else:
            
            clean_review.append(np.nan)
        
    return clean_review

In [12]:
amazon_dataset.loc[:,'review'] = preprocessing(amazon_dataset['review'])

0.000%
0.000%
0.000%
10.000%
10.000%
10.000%
10.000%
10.000%
10.000%
20.000%
20.000%
20.000%
20.000%
20.000%
20.000%
30.000%
30.000%
30.000%
30.000%
30.000%
30.000%
40.000%
40.000%
40.000%
40.000%
40.000%
40.000%
50.000%
50.000%
50.000%
50.000%
50.000%
60.000%
60.000%
60.000%
60.000%
60.000%
60.000%


KeyboardInterrupt: 

In [None]:
amazon_dataset.loc[:,'review'] = [l for l in amazon_dataset['review'] if l]  # 이상치 제거

In [None]:
amazon_dataset.dropna(axis=0,inplace=True)

In [None]:
pretrain_target=amazon_dataset['Score'].values

In [None]:
pretrain_target

In [None]:
len(pretrain_target)   # 513429

In [None]:
np.load('pretrain_target.npy',pretrain_target)