In [3]:
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
import re

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **1. 데이터 수집**

In [6]:
# Load dataset
path = '/content/drive/MyDrive/Samples/Amazon_Reviews.csv'
data = pd.read_csv(path)
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   id                   5000 non-null   object 
 1   dateAdded            5000 non-null   object 
 2   dateUpdated          5000 non-null   object 
 3   name                 5000 non-null   object 
 4   categories           5000 non-null   object 
 5   primaryCategories    5000 non-null   object 
 6   manufacturerNumber   5000 non-null   object 
 7   reviews.date         5000 non-null   object 
 8   reviews.dateAdded    1052 non-null   object 
 9   reviews.dateSeen     5000 non-null   object 
 10  reviews.doRecommend  5000 non-null   bool   
 11  reviews.id           29 non-null     float64
 12  reviews.numHelpful   5000 non-null   int64  
 13  reviews.rating       5000 non-null   int64  
 14  reviews.sourceURLs   5000 non-null   object 
 15  reviews.text         5000 non-null   o

In [7]:
display(data.head())

Unnamed: 0,id,dateAdded,dateUpdated,name,categories,primaryCategories,manufacturerNumber,reviews.date,reviews.dateAdded,reviews.dateSeen,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.username
0,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...","Computers,Electronics Features,Tablets,Electro...",Electronics,B00ZV9PXP2,2017-09-03T00:00:00.000Z,,"2018-05-27T00:00:00Z,2017-09-18T00:00:00Z,2017...",False,,0,3,http://reviews.bestbuy.com/3545/5442403/review...,I thought it would be as big as small paper bu...,Too small,llyyue
1,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...","Computers,Electronics Features,Tablets,Electro...",Electronics,B00ZV9PXP2,2017-06-06T00:00:00.000Z,,"2018-05-27T00:00:00Z,2017-07-07T00:00:00Z,2017...",True,,0,5,http://reviews.bestbuy.com/3545/5442403/review...,This kindle is light and easy to use especiall...,Great light reader. Easy to use at the beach,Charmi
2,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...","Computers,Electronics Features,Tablets,Electro...",Electronics,B00ZV9PXP2,2018-04-20T00:00:00.000Z,,2018-05-27T00:00:00Z,True,,0,4,https://reviews.bestbuy.com/3545/5442403/revie...,Didnt know how much i'd use a kindle so went f...,Great for the price,johnnyjojojo
3,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...","Computers,Electronics Features,Tablets,Electro...",Electronics,B00ZV9PXP2,2017-11-02T17:33:31.000Z,,2018-10-09T00:00:00Z,True,177283626.0,3,5,https://redsky.target.com/groot-domain-api/v1/...,I am 100 happy with my purchase. I caught it o...,A Great Buy,Kdperry
4,AVqVGZNvQMlgsOJE6eUY,2017-03-03T16:56:05Z,2018-10-25T16:36:31Z,"Amazon Kindle E-Reader 6"" Wifi (8th Generation...","Computers,Electronics Features,Tablets,Electro...",Electronics,B00ZV9PXP2,2018-04-24T00:00:00.000Z,,2018-05-27T00:00:00Z,True,,0,5,https://reviews.bestbuy.com/3545/5442403/revie...,Solid entry level Kindle. Great for kids. Gift...,Solid entry-level Kindle. Great for kids,Johnnyblack


### **2. 데이터 전처리**

In [8]:
# Convert to lowercase and remove special characters
data['reviews.text'] = data['reviews.text'].str.lower().str.replace('[^\w\s]', '', regex=True)

In [10]:
# Tokenizing
max_features = 10000    # corpus size
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(data['reviews.text'])

In [11]:
# Sequence padding
max_len = 100           # max length of each review
sequences = tokenizer.texts_to_sequences(data['reviews.text'])
data_padded = pad_sequences(sequences, maxlen=max_len)

In [12]:
# Divide category by using ratings
def map_rating(rating):
    if rating == 1:
        return 'very negative'
    if rating == 2:
        return 'negative'
    if rating == 3:
        return 'neutral'
    if rating == 4:
        return 'positive'
    elif rating == 5:
        return 'very positive'
data['sentiment'] = data['reviews.rating'].apply(map_rating)

In [13]:
# Label encoding
encoder = LabelEncoder()
labels = encoder.fit_transform(data['sentiment'])

In [14]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(data_padded, labels, test_size=0.2, random_state=42)

### **3. 모델링**

In [15]:
# Build model
model = Sequential()
model.add(Embedding(max_features, 64))                                  # 10000 * 64 = 640000
model.add(SimpleRNN(32, activation='tanh', return_sequences=False))     # (64 + 32 + 1) * 32 = 3104
model.add(Dense(16, activation='tanh'))                                 # (32 + 1) * 16 = 528
model.add(Dense(5, activation='softmax'))                               # (16 + 1) * 5 = 85
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          640000    
                                                                 
 simple_rnn (SimpleRNN)      (None, 32)                3104      
                                                                 
 dense (Dense)               (None, 16)                528       
                                                                 
 dense_1 (Dense)             (None, 5)                 85        
                                                                 
Total params: 643717 (2.46 MB)
Trainable params: 643717 (2.46 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [16]:
# Compile
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Fit the model
model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7a23ec4ddff0>

In [18]:
# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test loss: {test_loss:.4f}')
print(f'Test accuracy: {test_acc:.4f}')

Test loss: 1.2841
Test accuracy: 0.7140


### **4. 예측**

In [19]:
# Create sample review
sample_review = "This product was great in terms of usability and quality!"

In [22]:
# Preprocess the review
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len)
    return padded_sequence
sample_review_padded = preprocess_text(sample_review)

In [23]:
# Prediction
prediction = model.predict(sample_review_padded)
predicted_sentiment = encoder.inverse_transform([prediction.argmax()])[0]
print(f'The predicted sentiment for the review is: {predicted_sentiment}')

The predicted sentiment for the review is: very positive
