In [67]:
import pandas as pd
import tensorflow as tf
import re
import string

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Отримуємо дані

In [68]:
df = pd.read_csv('../lab2/amazon_reviews.csv',  header=None, nrows=40000)
df.columns = ['Polarity', 'Title', 'Review']
df.shape

(40000, 3)

In [69]:
df.head()

Unnamed: 0,Polarity,Title,Review
0,2,Great CD,My lovely Pat has one of the GREAT voices of h...
1,2,One of the best game music soundtracks - for a...,Despite the fact that I have only played a sma...
2,1,Batteries died within a year ...,I bought this charger in Jul 2003 and it worke...
3,2,"works fine, but Maha Energy is better",Check out Maha Energy's website. Their Powerex...
4,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...


### Препроцесинг

In [70]:
# Відділяємо непотрібну колонку
Data = df[['Polarity', 'Review']].reset_index(drop=True)
Data.head(10)

Unnamed: 0,Polarity,Review
0,2,My lovely Pat has one of the GREAT voices of h...
1,2,Despite the fact that I have only played a sma...
2,1,I bought this charger in Jul 2003 and it worke...
3,2,Check out Maha Energy's website. Their Powerex...
4,2,Reviewed quite a bit of the combo players and ...
5,1,I also began having the incorrect disc problem...
6,1,"I love the style of this, but after a couple y..."
7,1,I cannot scroll through a DVD menu that is set...
8,2,"Exotic tales of the Orient from the 1930's. ""D..."
9,1,"Firstly,I enjoyed the format and tone of the b..."


In [71]:
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Polarity  40000 non-null  int64 
 1   Review    40000 non-null  object
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [72]:
# нема жодних нульових параметрів
df.isnull().sum()

Polarity    0
Title       3
Review      0
dtype: int64

### Токенізація

In [73]:
# Токенізуймо для прикладу один відгук
print(Data['Review'][0])
words = word_tokenize(Data['Review'][0])
words

My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"


['My',
 'lovely',
 'Pat',
 'has',
 'one',
 'of',
 'the',
 'GREAT',
 'voices',
 'of',
 'her',
 'generation',
 '.',
 'I',
 'have',
 'listened',
 'to',
 'this',
 'CD',
 'for',
 'YEARS',
 'and',
 'I',
 'still',
 'LOVE',
 'IT',
 '.',
 'When',
 'I',
 "'m",
 'in',
 'a',
 'good',
 'mood',
 'it',
 'makes',
 'me',
 'feel',
 'better',
 '.',
 'A',
 'bad',
 'mood',
 'just',
 'evaporates',
 'like',
 'sugar',
 'in',
 'the',
 'rain',
 '.',
 'This',
 'CD',
 'just',
 'oozes',
 'LIFE',
 '.',
 'Vocals',
 'are',
 'jusat',
 'STUUNNING',
 'and',
 'lyrics',
 'just',
 'kill',
 '.',
 'One',
 'of',
 'life',
 "'s",
 'hidden',
 'gems',
 '.',
 'This',
 'is',
 'a',
 'desert',
 'isle',
 'CD',
 'in',
 'my',
 'book',
 '.',
 'Why',
 'she',
 'never',
 'made',
 'it',
 'big',
 'is',
 'just',
 'beyond',
 'me',
 '.',
 'Everytime',
 'I',
 'play',
 'this',
 ',',
 'no',
 'matter',
 'black',
 ',',
 'white',
 ',',
 'young',
 ',',
 'old',
 ',',
 'male',
 ',',
 'female',
 'EVERYBODY',
 'says',
 'one',
 'thing',
 '``',
 'Who',
 'was

### Видаляємо стоп-слова та пунктуацію

In [74]:
def clean_text(text): 
    text = str(text) # Конвертація вхідного тексту у строку
    text = text.lower() #Перетворення тексту у нижній регістр
    text = re.sub("\d", " ", text) #Видалення всіх цифр з тексту та заміна їх на пробіли
    text = re.sub("@\S+", " ", text) #Видалення слів, які починаються з символу "@" та заміна їх на пробіли
    text = re.sub("https*\S+", " ", text) #Видалення посилань та заміна на пробіли
    text = re.sub("#\S+", " ", text) #Видалення слів, які починаються з символу "#"
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text) #Видалення всіх знаків пунктуації
    text = re.sub('\n', ' ', text) #Видалення символів нового рядка
    text = re.sub('\s{2,}',' ', text)  #Заміна двох або більше пробілів на один
    stop_words = stopwords.words("english") #Створення списку стоп-слів для англ. мови за допомогою бібліотеки nltk
    text = ' '.join([word for word in text.split(' ') if word not in stop_words]) #Розбиття тексту на слова, видалення стоп-слів і з'єднання залишених слів назад в текст
    return text

Data['Review'] = [clean_text(review) for review in Data['Review']]
Data

Unnamed: 0,Polarity,Review
0,2,lovely pat one great voices generation listene...
1,2,despite fact played small portion game music h...
2,1,bought charger jul worked ok design nice conve...
3,2,check maha energy website powerex mh c f charg...
4,2,reviewed quite bit combo players hesitant due ...
...,...,...
39995,2,thought writing good soundtrack excellent cine...
39996,1,garden state starring zach braff natalie portm...
39997,2,first zack braff garden state seems beautiful ...
39998,1,slow avg story line done hundred times


### RNN (Recurrent Neural Nertwork) 
[використовуючи тексти, що були використані у 2 лабораторній роботі]

In [75]:
X_train, X_test, y_train, y_test = train_test_split(Data['Review'], Data['Polarity'], test_size=0.30)

In [76]:
vocab_size = 10000
oov_token = ""
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_token)
tokenizer.fit_on_texts(X_train)

In [77]:
X_train_text_sequences = tokenizer.texts_to_sequences(X_train)
X_test_text_sequences = tokenizer.texts_to_sequences(X_test)

In [78]:
max_length = 2000
padding_type = "post"
trunction_type="post"
X_train = keras.utils.pad_sequences(X_train_text_sequences,maxlen=max_length, padding=padding_type, truncating=trunction_type)
X_test = keras.utils.pad_sequences(X_test_text_sequences,maxlen=max_length, padding=padding_type, truncating=trunction_type)
X_train.shape

(28000, 2000)

In [79]:
emb_dim = 128

model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, emb_dim, input_length=max_length))
model.add(LSTM(64))
model.add(Dense(1, activation='selu'))

ValueError: Unrecognized keyword arguments passed to Embedding: {'input_length': 2000}

In [None]:
model.compile(optimizer='adam',
              loss = tf.keras.losses.binary_crossentropy,
              metrics=['accuracy']
)

In [None]:
model.fit(X_train, y_train, epochs=2, validation_data=(X_test, y_test))

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(loss)
print(accuracy)