[View in Colaboratory](https://colab.research.google.com/github/inpaner/nlp-ml/blob/master/ML_based_classification.ipynb)

# Load Data

In [0]:
import pickle
from keras.datasets import imdb
from keras.utils.data_utils import get_file
# Build lookup tables for the IMDB vocabulary: `indexes` maps word -> index,
# `index_arr` is the list of words ordered by that index, and `index_to_word`
# is the inverse mapping used to decode reviews back into text.
# (`index_arr` is not used by the cells shown here; it is kept for any later use.)
indexes = imdb.get_word_index()
index_arr = sorted(indexes, key=indexes.get)
index_to_word = {index: word for word, index in indexes.items()}

# Fetch the pickled dataset through Keras' get_file helper, which is given the
# expected MD5 hash of the archive.
path = get_file('imdb_full.pkl',
                origin='https://s3.amazonaws.com/text-datasets/imdb_full.pkl',
                md5_hash='d091312047c43cf9e4e38fef92437263')

# NOTE(security): pickle.load can execute arbitrary code contained in the file.
# Only load archives from a source you trust.
# The `with` block guarantees the file handle is closed even if unpickling fails.
with open(path, 'rb') as f:
    (x_train, scores_train), (x_test, scores_test) = pickle.load(f)

In [0]:
# Decode every training review from a sequence of word indices back into a
# single space-separated string.
texts_train = [
    ' '.join(index_to_word[word_id] for word_id in review)
    for review in x_train
]

In [0]:
# Decode every test review from a sequence of word indices back into a
# single space-separated string.
texts_test = [
    ' '.join(index_to_word[word_id] for word_id in review)
    for review in x_test
]

In [0]:
# Sanity check: number of decoded reviews in each split, followed by the
# first decoded review of the training set and then of the test set.
print(len(texts_train), len(texts_test))
for decoded_reviews in (texts_train, texts_test):
    print(decoded_reviews[0])

# Multilayer perceptron

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus to illustrate what CountVectorizer produces: three two-word
# documents that share the token "world".
sample_texts = ["hello world", "hi world", "world world"]

# Learn the vocabulary from the toy corpus and count each token per document.
count_vect = CountVectorizer()
sample_counts = count_vect.fit_transform(sample_texts)

# Display the resulting document-term counts.
print(sample_counts)

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
# Start from a fresh vectorizer (replacing the one fitted on the toy corpus),
# learn the vocabulary of the training reviews and turn them into token counts.
count_vect = CountVectorizer()
counts_train = count_vect.fit_transform(raw_documents=texts_train)

In [0]:
# Show the count matrix that count_vect.fit_transform produced for the training texts.
print(counts_train)

In [0]:
from sklearn.neural_network import MLPClassifier
# Train a small multilayer perceptron (a single hidden layer of 10 units)
# directly on the training count matrix.
# random_state is pinned because MLPClassifier initialises its weights and
# samples its batches randomly; without a seed every Restart & Run All would
# produce a different model.
clf = MLPClassifier(
    hidden_layer_sizes=(10,),
    random_state=0,
).fit(counts_train, scores_train)

In [0]:
from sklearn.pipeline import Pipeline
# Chain vectorisation and classification so raw text can be passed straight
# to fit/predict:
#   'vect' - bag-of-words counts, capped at max_features=500 vocabulary terms
#   'clf'  - multilayer perceptron with two hidden layers of 100 units each
# random_state is pinned because MLPClassifier initialises its weights and
# samples its batches randomly; without a seed the reported accuracy would
# change on every run.
text_clf = Pipeline([
    ('vect', CountVectorizer(max_features=500)),
    ('clf', MLPClassifier(hidden_layer_sizes=(100, 100), random_state=0)),
])
text_clf.fit(texts_train, scores_train)

In [0]:
# Run the fitted pipeline over the raw test texts to obtain its predictions.
predicted_scores = text_clf.predict(texts_test)

In [0]:
# Show the values text_clf predicted for the test texts.
print(predicted_scores)

In [0]:
# Evaluate
# Count the test reviews whose predicted value equals the actual one (both
# compared as ints), then report that count as a fraction of all predictions.
correct_sentiments = sum(
    1
    for predicted_score, actual_score in zip(predicted_scores, scores_test)
    if int(predicted_score) == int(actual_score)
)
print(correct_sentiments / len(predicted_scores))