In [37]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

This is an example from https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/

In [2]:
df_idf = pd.read_csv('amazon/reviews.csv')

In [3]:
print("Schema:\n", df_idf.dtypes)
print("Shape of database =", df_idf.shape)

Schema:
 asin             object
name             object
rating            int64
date             object
verified           bool
title            object
body             object
helpfulVotes    float64
dtype: object
Shape of database = (82815, 8)


In [4]:
def pre_process(text):
    # to lowercase
    text=text.lower()
    
    # remove tags
    text = re.sub("&lt;/?.*?&gt;", "&lt;&gt; ", text)
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    
    return text

In [5]:
df_idf['text'] = df_idf['title'] + " " + df_idf['body']
df_idf['text'] = df_idf['text'].apply(lambda x: pre_process(str(x)))

In [6]:
df_idf['text'][2]

'love this phone this is a great reliable phone i also purchased this phone after my samsung a died the menu is easily comprehendable and speed dialing is available for around numbers voice dialing is also a nice feature but it takes longer than speed dialing the only thing that bothers me is the games nokia seems to have taken snake and off their phones there is a skydiving game bowling and tennis like pong the ringers are very nice and a feature is available to choose a different ringer for each person calling however ringtones are not available online to download to this phone you re pretty much stuck with what you have there are vibrating ringtones and regular midi polyphonic tones all they need are covers in a reasonable price range '

In [7]:
def get_stop_words(stop_file_path):
    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

In [8]:
stopwords = get_stop_words('stopwords.txt')
docs = df_idf['text'].tolist()

In [9]:
cv = CountVectorizer(max_df = .85, stop_words=stopwords)
wordCountVec = cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [10]:
list(cv.vocabulary_.keys())[:10]

['def',
 'best',
 'worst',
 'samsung',
 'awhile',
 'absolute',
 'doo',
 'read',
 'review',
 'detect']

In [11]:
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf.fit(wordCountVec)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [12]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1] , x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]]=score_vals[idx]
            
        return results

In [13]:
feature_names = cv.get_feature_names()

doc = docs[1]

tf_idf_vector = tfidf.transform(cv.transform([doc]))

sorted_items = sort_coo(tf_idf_vector.tocoo())

keywords = extract_topn_from_vector(feature_names, sorted_items, 10)


In [14]:
test = cv.transform([doc])

print(keywords)
# for idx in range(len(sorted_items)):
#     print(feature_names[sorted_items[idx][0]], sorted_items[idx][1])

{'sprint': 0.442}


In [15]:
y = df_idf['rating']
# fixing the labels, if > 3.5 is going to be 1 which is positive, else 0
y = y.apply(lambda x: 1 if x > 3.5 else 0) 
y = y.to_numpy()
x = wordCountVec.toarray()
print(x.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(82815, 34848) (82815,)
(41407, 34848) (41407,)
(41408, 34848) (41408,)


In [16]:
print(X_train[:, :], y_train[:])

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] [1 1 1 ... 1 1 0]


In [17]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [17]:
# Save the model in a binary file
import pickle
filename = 'model2.sav'
pickle.dump(clf, open(filename, 'wb'))

NameError: name 'clf' is not defined

In [18]:
# Loads the model from the binary file
import pickle
filename = 'model.sav'
clf = pickle.load(open(filename, 'rb'))

In [20]:
print(X_test[:, 0], y_test[0])

[0 0 0 ... 0 0 0] 0


In [21]:
y_train_p = clf.predict(X_train)
y_test_p = clf.predict(X_test)

In [22]:
for i in range(len(y_test)):
    print(y_test_p[i], y_test[i])

0 0
0 0
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 1
1 1
1 1
0 1
1 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 1
0 1
1 1
0 0
1 1
1 0
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 1
0 0
1 1
1 1
1 1
0 0
1 0
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 0
1 1
1 1
0 0
1 1
0 0
1 1
0 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 0
1 1
1 1


1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
0 1
0 0
1 0
0 0
0 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 0
0 0
0 0
0 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
0 0
0 0
0 0
0 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 1
0 0
1 1
1 1
0 0
1 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1


1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 0
1 1
0 0
1 1
1 1
1 1
0 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 0
0 0
1 1
0 0
1 0
1 1
1 1
1 1
0 0
1 1
0 1
0 0
0 0
1 1
1 1
0 0
0 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 0
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 1
0 0
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 0
0 1
0 0
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 0
1 1
1 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0


0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 1
1 1
0 1
1 0
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
1 0
0 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 0
0 0
1 1
0 0
1 1
1 1
0 0
0 1
0 0
0 0
1 1
1 1
1 1
0 0
0 0
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 1
1 0
0 0
0 0
1 1
0 0
0 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 0
1 1
0 0
1 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
0 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0


1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 0
1 1
1 1
0 0
1 1
1 1
0 0
0 1
1 1
0 0
1 0
1 1
0 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 1
0 0
1 1
0 0
0 0
1 1
0 0
0 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 0
0 0
1 0
0 0
1 1
0 0
1 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 0
1 1
0 0
1 1
1 1
1 1
1 0
1 1
1 1
1 1
0 1
0 0
1 1
1 1
0 0
1 1
1 0
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
0 0
0 0
1 1
0 0
0 0
1 1
1 0
0 0
1 1
0 0
0 0
1 1
1 1
1 0
1 1
1 1


0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
0 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
0 0
1 1
0 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 1
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
0 0
1 1
0 0
0 1
0 0
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 0
0 0
1 1
1 1
1 1
0 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 0
0 0
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
0 0
1 1


1 1
1 1
1 1
1 1
1 1
1 1
0 1
0 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
0 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
0 0
1 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 0
0 0
0 0
0 0
1 1
1 1
1 0
1 1
0 0
1 1
0 0
1 1
1 1
0 1
1 0
1 1
0 0
1 0
1 1
1 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 1
1 0
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 1
1 1
0 1
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 0


0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 0
1 1
1 1
1 1
1 1
1 1
0 0
1 0
0 0
0 0
1 1
1 0
1 1
0 0
1 1
0 1
0 0
0 0
1 1
1 1
0 0
0 0
1 0
0 0
0 0
0 0
1 1
0 0
1 1
0 1
1 1
0 0
0 0
1 1
0 0
0 1
1 1
1 1
1 1
1 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
0 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 0
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 1
1 1
0 1
1 1
0 0
0 0
1 1
0 0
0 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1


0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
0 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
0 0
0 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 1
1 1
1 1
1 1
0 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
0 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0


1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
0 1
0 0
0 0
0 0
1 1
0 1
1 1
0 1
0 0
1 1
1 1
1 1
1 1
0 1
1 1
1 1
0 0
1 1
0 0
0 0
0 1
0 0
0 0
0 0
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
0 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 0
0 0
1 1
0 0
1 1
1 1
1 1
1 0
0 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
0 0
1 1
1 1
0 1
0 0
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1


0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
0 0
1 0
1 1
1 0
1 1
0 0
1 1
1 1
0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 0
0 0
0 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 1
0 0
0 0
1 1
1 1
0 1
0 0
1 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 0
1 1
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 0
0 0
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1


0 1
1 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
0 0
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
1 0
1 1
0 0
1 1
1 0
0 0
0 0
0 1
1 1
1 1
1 1
0 0
1 1
0 0
0 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 0
0 0
1 1
1 1
1 1
0 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 1
0 0
1 1
0 0
1 1
1 1
1 0
1 1
0 0
1 1
0 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 1
0 0


1 1
0 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 1
1 1
1 1
0 0
0 0
1 0
0 0
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
0 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 1
0 0
0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 1
1 1
0 0
0 0
0 0
0 0
1 1
0 1
0 0
0 0
1 1
0 1
0 0
0 0
1 1
1 1
1 1
1 1
1 0
0 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1


1 1
1 1
0 1
0 0
0 0
1 1
1 0
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
0 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 0
1 1
0 0
0 0
0 0
0 0
0 0
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 0
0 0
0 0
0 0
1 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 0
1 1
0 0
0 0
1 1
1 1
0 1
0 0
0 0
0 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
0 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 0
1 1
1 1


1 1
1 1
0 0
1 1
1 1
0 1
0 1
1 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
0 0
1 1
0 0
1 0
1 1
0 0
0 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
0 0
1 0
1 1
1 1
0 0
1 1
1 0
1 1
0 0
1 1
0 1
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 1
1 1
1 1
1 0
0 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
0 0
1 1
0 1
0 1
1 0
1 1
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 0
1 1
1 1
0 0
0 0
1 1


1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
0 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
0 0
1 1
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
0 0
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 0
1 1
0 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
0 0
0 1
0 0
1 1
0 0
0 0
1 1
1 1
0 0
0 0
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 1
0 0
1 1
1 1
0 0
0 0
1 0
1 1
1 1
1 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
0 0
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 1
1 1
0 0
1 1
0 1
1 0
0 0
1 1
1 1


0 0
0 0
0 0
0 0
0 0
1 1
1 1
1 1
0 0
0 1
1 1
1 0
0 0
1 1
0 0
1 1
0 0
0 0
1 1
1 1
1 0
1 1
1 1
1 1
0 1
0 0
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 0
1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
0 0
1 1
1 1
1 1
0 0
0 0
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 1
1 1
0 0
1 0
1 1
1 1
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 1
1 1
0 0
1 1
1 1
1 1
1 1
1 0
0 0
1 1
0 0
1 1
1 1
0 0
0 0
1 1
1 1
1 1
0 0
1 1
1 1
0 1
0 0
1 1
1 1
1 0
1 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 1
0 0
1 1
1 1
1 1
1 1
0 0
1 0
0 1
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
0 0
1 0
0 0
0 1
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
1 1
0 0
0 0
1 0
1 1
1 1
1 1
1 1
0 0
0 0
0 1
1 1
0 0
1 1
1 1
1 1
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 0
1 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
1 1
1 1
1 1
0 1
1 1
1 1
1 1
1 1
1 1
0 0
1 1
0 0
1 1
1 1
1 1
1 1
1 1
1 1
0 0
0 0
0 0
1 1
1 1
1 1
1 1
1 1
0 0
1 1
1 1
0 0
1 1
1 1
1 1
0 0
0 0
0 0
1 1


In [23]:
from sklearn.metrics import accuracy_score, confusion_matrix
print("Accuracy in testing set:", accuracy_score(y_test, y_test_p))
print("Accuracy in training set:", accuracy_score(y_train, y_train_p))
print(confusion_matrix(y_test, y_test_p))
print(confusion_matrix(y_train, y_train_p))

Accuracy in testing set: 0.9296029752704791
Accuracy in training set: 0.9318714227063057
[[12034  1523]
 [ 1392 26459]]
[[12066  1470]
 [ 1351 26520]]


In [30]:
test = cv.transform(["Hate", "Good", "Awful", "Best"]).toarray()
clf.predict(test)

array([0, 1, 0, 1])

In [28]:
for i in range(110, 120):
    test = cv.transform([docs[i]]).toarray()
    p = clf.predict(test)
    print(docs[i], p, y[i])
    
    print(type(test))

tracphone samsung t g prepaid phone i feel this phone has met my expectations it s for my yo son it s easy to use a good first phone for him i especially like the gps feature on this for a lost child person i purchased this for safety i would have recommended this to other parents for their children  [1] 1
<class 'numpy.ndarray'>
bad bluetooth works only one way this phone is good except for the bluetooth there is no way to place the phone into discoverable mode therefore no other bluetooth devices can see it  [0] 0
<class 'numpy.ndarray'>
cheap bluetooth as many have commented the internet connection button is a rip off just be careful and avoid it for the price i haven t found a better value  [0] 1
<class 'numpy.ndarray'>
five stars excelent phone this phone work very good in venezuela with movistar movilnet and digitel this phone is unlocked thanks  [1] 1
<class 'numpy.ndarray'>
the phone was exactly what i wanted except it the phone was exactly what i wanted except it did not come 

In [38]:
def build_generator():

    model = Sequential()

    model.add(Dense(256, input_dim=latent_dim))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(1024))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(np.prod(mail_shape), activation='tanh'))
    model.add(Reshape(mail_shape))

    model.summary()

    noise = Input(shape=(latent_dim,))
    mail = model(noise)

    return Model(noise, mail)

def build_discriminator():

    model = Sequential()

    model.add(Flatten(input_shape=mail_shape))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()

    mail = Input(shape=mail_shape)
    validity = model(mail)

    return Model(mail, validity)


In [39]:
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import BatchNormalization, Activation, ZeroPadding2D

mail_rows = 8
mail_cols = 8
mail_shape = (mail_rows, mail_cols, 1)
latent_dim = 100

optimizer = Adam(0.0002, 0.5)

# Build and compile the discriminator
discriminator = build_discriminator()
discriminator.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

# Build and compile the discriminator2
discriminator2 = build_discriminator()
discriminator2.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

# Build the generator
generator = build_generator()

# The generator takes noise as input and generates mails
z = Input(shape=(latent_dim,))
mail = generator(z)

# For the combined model we will only train the generator
discriminator.trainable = False

# The discriminator takes generated images as input and determines validity
validity = discriminator(mail)

# The combined model  (stacked generator and discriminator)
# Trains the generator to fool the discriminator
combined = Model(z, validity)
combined.compile(loss='binary_crossentropy', optimizer=optimizer)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_6 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 512)               33280     
_________________________________________________________________
leaky_re_lu_13 (LeakyReLU)   (None, 512)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 256)               131328    
_________________________________________________________________
leaky_re_lu_14 (LeakyReLU)   (None, 256)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 1)                 257       
Total params: 164,865
Trainable params: 164,865
Non-trainable params: 0
_________________________________________________________________
____

In [41]:
def results(self, pred, actual):
    results = confusion_matrix(actual, pred)
    print('Confusion Matrix :')
    print(results)
    print ('Accuracy Score :',accuracy_score(actual, pred))
    print ('Report : ')
    print(classification_report(actual, pred))
    print()

In [None]:
def train( epochs, batch_size=128, sample_interval=50):

    # Test data
    disc_loss = [0] * epochs
    gen_loss = [0] * epochs

    disc_acc= [0] * epochs
    gen_acc= [0] * epochs

    x = np.loadtxt('spambase/spambase.data', delimiter=',')
    y = x[:, 57]

    data_max = np.zeros(58)
    data_min= np.zeros(58)
    for n in range(len(x[0])):
        data_max[n] = max(x[:, n])
        data_min[n] = min(x[:, n])

    data = empty([x.shape[0], 58])
    for r in range(len(x)):
        for c in range(len(x[0])):
            if(c != 57):
                data[r, c] = ((x[r, c] - data_min[c]) / (data_max[c] - data_min[c]))
            else:
                data[r, c] = x[r, c]

    X_train, X_t, Y_train, Y_t = train_test_split(data, y, test_size = 0.3, random_state = 0)

    ts = empty([X_t.shape[0], 65])
    for n in range(len(X_t)):
        ts[n] = np.append(X_t[n], [0, 0, 0, 0, 0, 0, 0], axis=0)

    X_test = np.asarray(ts)
    X_test[:, 57] = 0
    Y_test = np.asarray(Y_t)

    X_test = X_test[:, : 64]
    X_test = X_test.reshape(X_test.shape[0], 8, 8)
    # Error could be here
    Y_test = Y_test.reshape(Y_test.shape[0], 1)
    #X_test = X_test / 127.5 - 1
    X_test = np.expand_dims(X_test, axis=3)

    spam = []
    email = []

    tr = empty([X_train.shape[0], 65])
    for n in range(len(X_train)):
        tr[n] = np.append(X_train[n], [0, 0, 0, 0, 0, 0, 0], axis=0)
        if tr[n][57]== 1:
            tr[n][57] = 0
            spam.append(tr[n])
        else:
            tr[n][57] = 0
            email.append(tr[n])

    X_train_spam = np.asarray(spam)
    X_train_email = np.asarray(email)

    X_train_spam = X_train_spam[:, : 64]  # verify again the actual value that it should have
    # change the 8's by mail_rows
    X_train_spam = X_train_spam.reshape(X_train_spam.shape[0], 8, 8)

    X_train_email = X_train_email[:, : 64]  # verify again the actual value that it should have

    # change the 8's by mail_rows
    X_train_email = X_train_email.reshape(X_train_email.shape[0], 8, 8)

    X_train_spam = np.expand_dims(X_train_spam, axis=3)

    X_train_email = np.expand_dims(X_train_email, axis=3)

    # Adversarial ground truths
    valid_spam = np.ones((batch_size, 1))
    valid_email = np.zeros((batch_size, 1))
    fake = np.ones((batch_size, 1))

    for epoch in range(epochs):
        # Train the discriminator
        # Select a random batch of images
        idx_spam = np.random.randint(0, X_train_spam.shape[0], batch_size)
        idx_email = np.random.randint(0, X_train_email.shape[0], batch_size)

        mails_spam = X_train_spam[idx_spam]
        mails_spam[:, 7, 1:] = 0

        mails_email = X_train_email[idx_email]
        mails_email[:, 7, 1:] = 0

        idx_test = np.random.randint(0, X_test.shape[0], batch_size)
        mails_test = X_test[idx_test]

        noise = np.random.normal(0, 1, (batch_size, latent_dim))

        # Generate a batch of new images
        gen_mails = generator.predict(noise)
        gen_mails[:, 7, 1:] = 0

        # Train the normal model
        n_loss = discriminator2.train_on_batch(mails_spam, valid_spam)
        n_loss_fake = discriminator2.train_on_batch(mails_email, valid_email)
        n_loss = 0.5 * np.add(n_loss, n_loss_fake)

        # Train the discriminator
        d_loss_real_spam = discriminator.train_on_batch(mails_spam, valid_spam)
        d_loss_real_email = discriminator.train_on_batch(mails_email, valid_email)
        d_loss_fake = discriminator.train_on_batch(gen_mails, fake)

        d_tot = np.add(d_loss_real_spam, d_loss_fake)
        d_loss = 0.33 * np.add(d_tot, d_loss_real_email)


        # Train the generator
        noise = np.random.normal(0, 1, (batch_size, latent_dim))

        # Train the generator (to have the discriminator label samples as valid)
        g_loss = combined.train_on_batch(noise, valid_email)

        disc_loss[epoch] = d_loss[0]
        gen_loss[epoch] = g_loss

        disc_acc[epoch] = 100 * d_loss[1]
        gen_acc[epoch] = 100 *  n_loss[1]

        # Plot the progress
        print ("%d [D loss: %f, acc.: %.2f%%, nacc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], 100*n_loss[1], g_loss))
