# Import Libraries

In [6]:
from collections import Counter
from datetime import datetime

import json

from keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Dropout, Activation
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
import pandas as pd
import numpy as np

# Load Dataset

In [7]:
# Load the reviews and parse JSON
reviews = pd.read_csv('./data/yelp_academic_base_dataset.csv')

In [8]:
reviews.head()

Unnamed: 0,businessId,meanStars,reviewCount,reviewStars,text,date
0,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,Stopped in on a busy Friday night. Despite the...,2018-03-04 00:59:21
1,6iYb2HFDywm3zjuRg0shjw,4.0,86,2.0,Went there about 1 PM on a Monday. It wasn't ...,2018-08-14 05:22:00
2,6iYb2HFDywm3zjuRg0shjw,4.0,86,5.0,This was the place the be on Friday Night! If ...,2018-03-17 14:22:48
3,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,Went to this place with my family over the wee...,2018-04-04 21:16:50
4,6iYb2HFDywm3zjuRg0shjw,4.0,86,4.0,"Stopped on a midweek afternoon, and so glad th...",2018-04-28 19:17:04


In [9]:
# Get a balanced sample of positive and negative reviews
texts =  reviews['text']

In [10]:
# Cleaning
reviews_cleaned = reviews.drop(['businessId', 'meanStars', 'reviewCount', 'date'], axis=1)

In [11]:
# Convert our 5 classes into 2 (negative or positive)
binstars = [0 if review <= 3.0 else 1 for review in reviews_cleaned['reviewStars']]
balanced_texts = []
balanced_labels = []
limit = 100000  # Change this to grow/shrink the dataset
neg_pos_counts = [0, 0]
for i in range(len(texts)):
    polarity = binstars[i]
    if neg_pos_counts[polarity] < limit:
        balanced_texts.append(texts[i])
        balanced_labels.append(binstars[i])
        neg_pos_counts[polarity] += 1

In [12]:
Counter(balanced_labels)

Counter({1: 100000, 0: 100000})

In [13]:
balanced_labels[:10]

[1, 0, 1, 1, 1, 1, 1, 1, 1, 1]

# Text Tokenization

In [14]:
tokenizer = Tokenizer(num_words=5)
toytexts = ["Is is a common word", "So is the", "the is common", "discombobulation is not common"]
tokenizer.fit_on_texts(toytexts)
sequences = tokenizer.texts_to_sequences(toytexts)
print(sequences)

[[1, 1, 4, 2], [1, 3], [3, 1, 2], [1, 2]]


In [15]:
print(tokenizer.word_index)

{'is': 1, 'common': 2, 'the': 3, 'a': 4, 'word': 5, 'so': 6, 'discombobulation': 7, 'not': 8}


In [16]:
padded_sequences = pad_sequences(sequences)
print(padded_sequences)

[[1 1 4 2]
 [0 0 1 3]
 [0 3 1 2]
 [0 0 1 2]]


In [17]:
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(balanced_texts)
sequences = tokenizer.texts_to_sequences(balanced_texts)
data = pad_sequences(sequences, maxlen=300) 

# Build a Neural Network

In [18]:
# TensorFlow checking with appropriate compiler flags.
tf.sysconfig.get_compile_flags()

['-I/usr/local/lib/python3.9/site-packages/tensorflow/include',
 '-D_GLIBCXX_USE_CXX11_ABI=0',
 '-DEIGEN_MAX_ALIGN_BYTES=64']

In [None]:
model = Sequential()
model.add(Embedding(20000, 128, input_length=300))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(data, np.array(balanced_labels), validation_split=0.5, epochs=3)

# Neural Network Optimization

Andiamo ad aggiungere più layer in modo da ottimizzare la rete neurale e addestrare un modello con accuracy maggiore

In [None]:
model = Sequential()
model.add(Embedding(20000, 128, input_length=300))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(128))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(data, np.array(balanced_labels), validation_split=0.5, epochs=3)

# Support Vector Machine Model

In [19]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score

t1 = datetime.now()
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3)
classifier = LinearSVC()
Xs = vectorizer.fit_transform(balanced_texts)

print(datetime.now() - t1)
print(Xs.shape)

score = cross_val_score(classifier, Xs, balanced_labels, cv=2, n_jobs=-1)

print(datetime.now() - t1)
print(score)
print(sum(score) / len(score))

0:00:38.394486
(200000, 639342)
0:00:43.187111
[0.89995 0.8985 ]
0.8992249999999999


In [20]:
!pip install sklearn

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m


# Packaging Model and Testing on Input Data

In [22]:
import pickle

# save the tokenizer and model
with open("keras_tokenizer.pickle", "wb") as f:
   pickle.dump(tokenizer, f)
model.save("yelp_sentiment_model.hdf5")

NameError: name 'model' is not defined

In [23]:
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
import pickle

# load the tokenizer and the model
with open("keras_tokenizer.pickle", "rb") as f:
   tokenizer = pickle.load(f)

model = load_model("yelp_sentiment_model.hdf5")

# replace with the data you want to classify
newtexts = ["It's so bad", "Food is delicious"]

# note that we shouldn't call "fit" on the tokenizer again
sequences = tokenizer.texts_to_sequences(newtexts)
data = pad_sequences(sequences, maxlen=300)

# get predictions for each of your new texts
predictions = model.predict(data)
print(predictions)

2021-09-19 11:13:39.105549: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-19 11:13:39.803336: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


[[0.49636674]
 [0.22827312]]


In [24]:
import joblib

svmClassifier = "svm_classifier.pickle"
tfidfVectorizer = "tfidf_vectorizer.pickle"
classifier = pickle.load(open(svmFile, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)

from sklearn.externals import joblib

vectorizer = joblib.load("tfidf_vectorizer.pickle")
classifier = joblib.load("svm_classifier.pickle")

# replace with the data you want to classify
newtexts = ["Your new data", "More new data"]

# note that we should call "transform" here instead of the "fit_transform" from earlier
Xs = vectorizer.transform(newtexts)

# get predictions for each of your new texts
predictions = classifier.predict(Xs)
print(predictions)

ImportError: cannot import name 'joblib' from 'sklearn.externals' (/usr/local/lib/python3.9/site-packages/sklearn/externals/__init__.py)

In [None]:
!python -m pip install sklearn --upgrade