In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

This is an example from https://www.freecodecamp.org/news/how-to-extract-keywords-from-text-with-tf-idf-and-pythons-scikit-learn-b2a0f3d7e667/

In [2]:
df_idf = pd.read_csv('amazon/reviews.csv')

In [3]:
print("Schema:\n", df_idf.dtypes)
print("Shape of database =", df_idf.shape)

Schema:
 asin             object
name             object
rating            int64
date             object
verified           bool
title            object
body             object
helpfulVotes    float64
dtype: object
Shape of database = (82815, 8)


In [4]:
def pre_process(text):
    # to lowercase
    text=text.lower()
    
    # remove tags
    text = re.sub("&lt;/?.*?&gt;", "&lt;&gt; ", text)
    
    # remove special characters and digits
    text = re.sub("(\\d|\\W)+", " ", text)
    
    return text

In [5]:
df_idf['text'] = df_idf['title'] + " " + df_idf['body']
df_idf['text'] = df_idf['text'].apply(lambda x: pre_process(str(x)))

In [6]:
df_idf['text'][2]

'love this phone this is a great reliable phone i also purchased this phone after my samsung a died the menu is easily comprehendable and speed dialing is available for around numbers voice dialing is also a nice feature but it takes longer than speed dialing the only thing that bothers me is the games nokia seems to have taken snake and off their phones there is a skydiving game bowling and tennis like pong the ringers are very nice and a feature is available to choose a different ringer for each person calling however ringtones are not available online to download to this phone you re pretty much stuck with what you have there are vibrating ringtones and regular midi polyphonic tones all they need are covers in a reasonable price range '

In [7]:
def get_stop_words(stop_file_path):
    with open(stop_file_path, 'r', encoding='utf-8') as f:
        stopwords = f.readlines()
        stop_set = set(m.strip() for m in stopwords)
        return frozenset(stop_set)

In [8]:
stopwords = get_stop_words('stopwords.txt')
docs = df_idf['text'].tolist()

In [9]:
cv = CountVectorizer(max_df = .85, stop_words=stopwords)
wordCountVec = cv.fit_transform(docs)

  'stop_words.' % sorted(inconsistent))


In [10]:
list(cv.vocabulary_.keys())[:10]

['def',
 'best',
 'worst',
 'samsung',
 'awhile',
 'absolute',
 'doo',
 'read',
 'review',
 'detect']

In [11]:
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf.fit(wordCountVec)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [12]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1] , x[0]), reverse=True)

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    sorted_items = sorted_items[:topn]
    
    score_vals = []
    feature_vals = []
    
    for idx, score in sorted_items:
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
        
        results = {}
        for idx in range(len(feature_vals)):
            results[feature_vals[idx]]=score_vals[idx]
            
        return results

In [13]:
feature_names = cv.get_feature_names()

doc = docs[1]

tf_idf_vector = tfidf.transform(cv.transform([doc]))

sorted_items = sort_coo(tf_idf_vector.tocoo())

keywords = extract_topn_from_vector(feature_names, sorted_items, 10)


In [14]:
test = cv.transform([doc])

print(keywords)
# for idx in range(len(sorted_items)):
#     print(feature_names[sorted_items[idx][0]], sorted_items[idx][1])

{'sprint': 0.442}


In [15]:
y = df_idf['rating']
# fixing the labels, if > 3.5 is going to be 1 which is positive, else 0
y = y.apply(lambda x: 1 if x > 3.5 else 0) 
y = y.to_numpy()
x = wordCountVec.toarray()
print(x.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.5)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(82815, 34848) (82815,)
(41407, 34848) (41407,)
(41408, 34848) (41408,)


In [16]:
# print(X_train[:, :], y_train[:])

In [17]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
# clf.fit(X_train, y_train)

In [18]:
# Save the model in a binary file
import pickle
filename = 'model2.sav'
pickle.dump(clf, open(filename, 'wb'))

In [19]:
# Loads the model from the binary file
import pickle
filename = 'model.sav'
clf = pickle.load(open(filename, 'rb'))

In [20]:
# print(X_test[:, 0], y_test[0])

In [21]:
# y_train_p = clf.predict(X_train)
# y_test_p = clf.predict(X_test)

In [22]:
# for i in range(len(y_test)):
#     print(y_test_p[i], y_test[i])

In [23]:
# from sklearn.metrics import accuracy_score, confusion_matrix
# print("Accuracy in testing set:", accuracy_score(y_test, y_test_p))
# print("Accuracy in training set:", accuracy_score(y_train, y_train_p))
# print(confusion_matrix(y_test, y_test_p))
# print(confusion_matrix(y_train, y_train_p))

In [24]:
test = cv.transform(["Hate", "Good", "Awful", "Best"]).toarray()
clf.predict(test)
print(test.shape)

(4, 34848)


In [25]:
# for i in range(110, 120):
#     test = cv.transform([docs[i]]).toarray()
#     p = clf.predict(test)
#     print(docs[i], p, y[i])
    
#     print(type(test))

In [26]:
def build_generator(img_shape):

    noise_shape = (100,)

    model = Sequential()

    model.add(Dense(256, input_shape=noise_shape))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(512))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(1024))
    model.add(LeakyReLU(alpha=0.2))
    model.add(BatchNormalization(momentum=0.8))
    model.add(Dense(np.prod(img_shape), activation='relu'))
    model.add(Reshape(img_shape))

    model.summary()

    noise = Input(shape=noise_shape)
    img = model(noise)

    return Model(noise, img)

def build_discriminator(shape):

    img_shape = shape

    model = Sequential()

#     model.add(Flatten(input_shape=img_shape)) # is one dimension
    model.add(Dense(512, input_shape=img_shape))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(256))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()

    img = Input(shape=img_shape)
    validity = model(img)

    return Model(img, validity)


In [27]:
from keras.optimizers import Adam
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import BatchNormalization, Activation, ZeroPadding2D

img_rows = 1
img_cols = X_train[0].shape
img_shape = (img_cols)

optimizer = Adam(0.0002, 0.5)

# Build and compile the discriminator
discriminator = build_discriminator(img_shape)
discriminator.compile(loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'])

# Build and compile the generator
generator = build_generator(img_shape)
generator.compile(loss='binary_crossentropy', optimizer=optimizer)

# The generator takes noise as input and generated imgs
z = Input(shape=(100,))
img = generator(z)

# For the combined model we will only train the generator
discriminator.trainable = False

# The valid takes generated images as input and determines validity
valid = discriminator(img)

# The combined model  (stacked generator and discriminator) takes
# noise as input => generates images => determines validity
combined = Model(z, valid)
combined.compile(loss='binary_crossentropy', optimizer=optimizer)

Using TensorFlow backend.


Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               17842688  
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 256)               131328    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 256)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 257       
Total params: 17,974,273
Trainable params: 17,974,273
Non-trainable params: 0
_________________________________________________________________
__________________________________________________________

In [28]:
def results(self, pred, actual):
    results = confusion_matrix(actual, pred)
    print('Confusion Matrix :')
    print(results)
    print ('Accuracy Score :',accuracy_score(actual, pred))
    print ('Report : ')
    print(classification_report(actual, pred))
    print()

In [29]:
def train(epochs, data, batch_size=128):


        # Rescale -1 to 1
        X_train = data #(X_train.astype(np.float32) - 127.5) / 127.5
#         X_train = np.expand_dims(X_train, axis=3)

        half_batch = int(batch_size / 2)

        for epoch in range(epochs):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random half batch of images
            idx = np.random.randint(0, X_train.shape[1], half_batch)
            imgs = X_train[idx]

            noise = np.random.normal(0, 1, (half_batch, 100))

            # Generate a half batch of new images
            gen_imgs = generator.predict(noise)

            # Train the discriminator
            d_loss_real = discriminator.train_on_batch(imgs, np.ones((half_batch, 1)))
            d_loss_fake = discriminator.train_on_batch(gen_imgs, np.zeros((half_batch, 1)))
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)


            # ---------------------
            #  Train Generator
            # ---------------------

            noise = np.random.normal(0, 1, (batch_size, 100))

            # The generator wants the discriminator to label the generated samples
            # as valid (ones)
            valid_y = np.array([1] * batch_size)

            # Train the generator
            g_loss = combined.train_on_batch(noise, valid_y)

            # Plot the progress
            print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))



In [61]:
train(epochs=1000, data=X_train)

0 [D loss: 0.587166, acc.: 65.62%] [G loss: 6.348205]
1 [D loss: 0.574017, acc.: 67.97%] [G loss: 5.664258]
2 [D loss: 0.555976, acc.: 67.19%] [G loss: 4.776833]
3 [D loss: 0.560892, acc.: 62.50%] [G loss: 5.411107]
4 [D loss: 0.593142, acc.: 66.41%] [G loss: 5.350187]
5 [D loss: 0.555765, acc.: 68.75%] [G loss: 6.015429]
6 [D loss: 0.565835, acc.: 65.62%] [G loss: 5.533709]
7 [D loss: 0.520985, acc.: 71.88%] [G loss: 4.854037]
8 [D loss: 0.586494, acc.: 63.28%] [G loss: 5.781159]
9 [D loss: 0.548066, acc.: 67.19%] [G loss: 5.086875]
10 [D loss: 0.540024, acc.: 66.41%] [G loss: 5.375452]
11 [D loss: 0.582889, acc.: 66.41%] [G loss: 5.000913]
12 [D loss: 0.543629, acc.: 66.41%] [G loss: 5.040783]
13 [D loss: 0.603248, acc.: 62.50%] [G loss: 4.897422]
14 [D loss: 0.623279, acc.: 60.16%] [G loss: 5.110558]
15 [D loss: 0.579054, acc.: 67.19%] [G loss: 6.208938]
16 [D loss: 0.596465, acc.: 63.28%] [G loss: 4.833700]
17 [D loss: 0.544111, acc.: 68.75%] [G loss: 5.006203]
18 [D loss: 0.622012

149 [D loss: 0.581010, acc.: 60.16%] [G loss: 4.953751]
150 [D loss: 0.545897, acc.: 60.94%] [G loss: 4.071805]
151 [D loss: 0.596154, acc.: 55.47%] [G loss: 3.619408]
152 [D loss: 0.668439, acc.: 47.66%] [G loss: 4.422657]
153 [D loss: 0.588760, acc.: 56.25%] [G loss: 3.907811]
154 [D loss: 0.589330, acc.: 53.12%] [G loss: 4.541921]
155 [D loss: 0.559779, acc.: 58.59%] [G loss: 4.025639]
156 [D loss: 0.544348, acc.: 75.78%] [G loss: 4.023120]
157 [D loss: 0.531327, acc.: 67.97%] [G loss: 4.241893]
158 [D loss: 0.590813, acc.: 74.22%] [G loss: 3.711803]
159 [D loss: 0.636484, acc.: 65.62%] [G loss: 4.533108]
160 [D loss: 0.553205, acc.: 63.28%] [G loss: 3.970257]
161 [D loss: 0.586125, acc.: 53.91%] [G loss: 3.891372]
162 [D loss: 0.705902, acc.: 46.88%] [G loss: 3.370425]
163 [D loss: 0.606973, acc.: 57.03%] [G loss: 3.481086]
164 [D loss: 0.547182, acc.: 82.03%] [G loss: 4.745164]
165 [D loss: 0.523916, acc.: 73.44%] [G loss: 3.964365]
166 [D loss: 0.569041, acc.: 62.50%] [G loss: 3.

296 [D loss: 0.589038, acc.: 64.06%] [G loss: 3.295136]
297 [D loss: 0.651098, acc.: 48.44%] [G loss: 3.127608]
298 [D loss: 0.613583, acc.: 79.69%] [G loss: 3.865799]
299 [D loss: 0.577296, acc.: 64.06%] [G loss: 2.940751]
300 [D loss: 0.629288, acc.: 49.22%] [G loss: 3.297644]
301 [D loss: 0.615208, acc.: 57.03%] [G loss: 2.921267]
302 [D loss: 0.578261, acc.: 62.50%] [G loss: 2.782526]
303 [D loss: 0.596350, acc.: 71.09%] [G loss: 2.890146]
304 [D loss: 0.660574, acc.: 49.22%] [G loss: 3.186441]
305 [D loss: 0.610281, acc.: 58.59%] [G loss: 2.601128]
306 [D loss: 0.570572, acc.: 72.66%] [G loss: 2.730935]
307 [D loss: 0.565530, acc.: 60.94%] [G loss: 3.031898]
308 [D loss: 0.618450, acc.: 73.44%] [G loss: 2.791296]
309 [D loss: 0.598730, acc.: 67.19%] [G loss: 3.239827]
310 [D loss: 0.665566, acc.: 58.59%] [G loss: 2.837789]
311 [D loss: 0.515110, acc.: 81.25%] [G loss: 3.142740]
312 [D loss: 0.559197, acc.: 84.38%] [G loss: 2.957182]
313 [D loss: 0.577430, acc.: 75.00%] [G loss: 3.

443 [D loss: 0.566173, acc.: 72.66%] [G loss: 3.056012]
444 [D loss: 0.562867, acc.: 73.44%] [G loss: 2.403865]
445 [D loss: 0.560103, acc.: 71.09%] [G loss: 3.073780]
446 [D loss: 0.660042, acc.: 48.44%] [G loss: 2.885254]
447 [D loss: 0.554357, acc.: 72.66%] [G loss: 2.571122]
448 [D loss: 0.538771, acc.: 79.69%] [G loss: 2.186873]
449 [D loss: 0.560763, acc.: 71.88%] [G loss: 2.494104]
450 [D loss: 0.587617, acc.: 61.72%] [G loss: 2.660037]
451 [D loss: 0.528356, acc.: 85.16%] [G loss: 2.716027]
452 [D loss: 0.598535, acc.: 67.97%] [G loss: 2.989748]
453 [D loss: 0.553343, acc.: 69.53%] [G loss: 2.795500]
454 [D loss: 0.522192, acc.: 80.47%] [G loss: 2.903798]
455 [D loss: 0.636518, acc.: 75.78%] [G loss: 2.859994]
456 [D loss: 0.608851, acc.: 64.06%] [G loss: 3.566692]
457 [D loss: 0.621771, acc.: 67.19%] [G loss: 3.578422]
458 [D loss: 0.634117, acc.: 49.22%] [G loss: 2.659484]
459 [D loss: 0.646504, acc.: 63.28%] [G loss: 2.675527]
460 [D loss: 0.549695, acc.: 73.44%] [G loss: 2.

590 [D loss: 0.543946, acc.: 80.47%] [G loss: 2.451404]
591 [D loss: 0.629865, acc.: 63.28%] [G loss: 2.721171]
592 [D loss: 0.579586, acc.: 71.09%] [G loss: 1.843076]
593 [D loss: 0.655693, acc.: 78.12%] [G loss: 1.905042]
594 [D loss: 0.567195, acc.: 82.03%] [G loss: 1.960489]
595 [D loss: 0.566319, acc.: 86.72%] [G loss: 1.865016]
596 [D loss: 0.572798, acc.: 80.47%] [G loss: 2.063351]
597 [D loss: 0.609754, acc.: 60.94%] [G loss: 1.793812]
598 [D loss: 0.621961, acc.: 57.03%] [G loss: 1.630969]
599 [D loss: 0.577197, acc.: 77.34%] [G loss: 2.581708]
600 [D loss: 0.576010, acc.: 72.66%] [G loss: 1.766149]
601 [D loss: 0.621947, acc.: 57.81%] [G loss: 1.787879]
602 [D loss: 0.634247, acc.: 53.91%] [G loss: 2.040864]
603 [D loss: 0.638455, acc.: 52.34%] [G loss: 1.955602]
604 [D loss: 0.591274, acc.: 68.75%] [G loss: 2.291584]
605 [D loss: 0.596488, acc.: 81.25%] [G loss: 1.634437]
606 [D loss: 0.683956, acc.: 48.44%] [G loss: 2.152349]
607 [D loss: 0.568567, acc.: 78.12%] [G loss: 2.

737 [D loss: 0.552804, acc.: 78.12%] [G loss: 1.798980]
738 [D loss: 0.561622, acc.: 86.72%] [G loss: 1.493197]
739 [D loss: 0.507454, acc.: 85.16%] [G loss: 1.606327]
740 [D loss: 0.513519, acc.: 87.50%] [G loss: 2.084687]
741 [D loss: 0.610825, acc.: 73.44%] [G loss: 1.465688]
742 [D loss: 0.542819, acc.: 86.72%] [G loss: 1.467928]
743 [D loss: 0.576616, acc.: 85.16%] [G loss: 1.962228]
744 [D loss: 0.512198, acc.: 89.84%] [G loss: 1.970083]
745 [D loss: 0.551693, acc.: 79.69%] [G loss: 2.023402]
746 [D loss: 0.514349, acc.: 87.50%] [G loss: 1.584431]
747 [D loss: 0.570279, acc.: 71.09%] [G loss: 1.636143]
748 [D loss: 0.604683, acc.: 60.16%] [G loss: 1.808338]
749 [D loss: 0.545712, acc.: 75.78%] [G loss: 1.763078]
750 [D loss: 0.565366, acc.: 73.44%] [G loss: 1.879912]
751 [D loss: 0.645987, acc.: 53.91%] [G loss: 1.618202]
752 [D loss: 0.557818, acc.: 78.12%] [G loss: 1.979944]
753 [D loss: 0.542235, acc.: 79.69%] [G loss: 1.982042]
754 [D loss: 0.583265, acc.: 68.75%] [G loss: 1.

884 [D loss: 0.636376, acc.: 54.69%] [G loss: 1.775900]
885 [D loss: 0.572682, acc.: 75.00%] [G loss: 1.993074]
886 [D loss: 0.536603, acc.: 77.34%] [G loss: 2.153284]
887 [D loss: 0.553620, acc.: 74.22%] [G loss: 1.701163]
888 [D loss: 0.620878, acc.: 67.19%] [G loss: 1.625551]
889 [D loss: 0.652042, acc.: 61.72%] [G loss: 1.272920]
890 [D loss: 0.793006, acc.: 57.03%] [G loss: 1.383049]
891 [D loss: 0.644904, acc.: 55.47%] [G loss: 1.984232]
892 [D loss: 0.597622, acc.: 69.53%] [G loss: 1.951907]
893 [D loss: 0.542793, acc.: 78.91%] [G loss: 1.976114]
894 [D loss: 0.689686, acc.: 55.47%] [G loss: 1.869825]
895 [D loss: 0.635912, acc.: 54.69%] [G loss: 1.577805]
896 [D loss: 0.597836, acc.: 69.53%] [G loss: 1.726808]
897 [D loss: 0.637276, acc.: 64.06%] [G loss: 1.583239]
898 [D loss: 0.672465, acc.: 62.50%] [G loss: 1.516470]
899 [D loss: 0.655053, acc.: 74.22%] [G loss: 1.446781]
900 [D loss: 0.549668, acc.: 79.69%] [G loss: 1.810255]
901 [D loss: 0.623266, acc.: 70.31%] [G loss: 1.

In [62]:
print(X_train.shape)

(41407, 34848)


In [102]:
gen = 10
noise = np.random.normal(0, 1, (gen, 100))
new_mails = generator.predict(noise)
print(np.round(new_mails))

idx = np.random.randint(0, X_train.shape[1], gen)
imgs = X_train[idx]
print(imgs)

generated_labels = clf.predict(new_mails)
print(generated_labels)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 2. 2. ... 3. 1. 2.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[1 1 1 0 1 1 1 0 0 1]


In [103]:
cv.inverse_transform(new_mails) # See the generated words

[array(['changing', 'feature', 'feels', 'forgot', 'future', 'hundred',
        'include', 'like', 'logo', 'love', 'loved', 'marco', 'memory',
        'new', 'options', 'overnight', 'pay', 'perfect', 'ported',
        'refund', 'sleek', 'slide', 'slowest', 'still', 'straight', 'sure',
        'unneeded', 'well', 'worked', 'working', 'works'], dtype='<U112'),
 array(['cause', 'compare', 'depending', 'end', 'exactly', 'excellent',
        'fast', 'feature', 'feels', 'great', 'greenify', 'half', 'happy',
        'iphone', 'know', 'love', 'memory', 'new', 'nice', 'perfect',
        'phone', 'price', 'quality', 'samsung', 'simple', 'slowest',
        'sure', 'using', 'version'], dtype='<U112'),
 array(['bought', 'exactly', 'feature', 'forgot', 'future', 'girl',
        'great', 'hundred', 'marco', 'nice', 'ok', 'overnight', 'price',
        'product', 'sd', 'stars', 'unneeded', 'works'], dtype='<U112'),
 array(['awesome', 'back', 'bad', 'based', 'blocks', 'bought', 'budget',
        'called'