In [1]:
from __future__ import print_function
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from matplotlib import pyplot
from keras import backend as K
import pandas as pd
from sklearn.utils import shuffle
from keras.models import model_from_json


Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
BASE_DIR = 'C:/Users/hafizmrf3/DataScienceProject/SentimentAnalysis/edx/data'
GLOVE_DIR = os.path.join(BASE_DIR, 'glovedata')
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'edxdata')
MOVIE_REVIEW_FILE_NAME = "spotify_labeled_edit.csv"
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [3]:
def read_csv(filepath):
     if os.path.splitext(filepath)[1] != '.csv':
          return  # or whatever
     seps = [',', ';', '\t']                    # ',' is default
     encodings = [None, 'utf-8', 'ISO-8859-1']  # None is default
     for sep in seps:
         for encoding in encodings:
              try:
                  return pd.read_csv(filepath, encoding=encoding, sep=sep)
              except Exception:  # should really be more specific 
                  pass
     raise ValueError("{!r} is has no encoding in {} or seperator in {}"
                      .format(filepath, encodings, seps))

In [4]:

# second, prepare text samples and their labels
print('Processing text dataset')
index_to_label_dict = {}
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids

input_df = read_csv(os.path.join(TEXT_DATA_DIR, MOVIE_REVIEW_FILE_NAME))
review_df1 = input_df[['review','label']]
review_df = review_df1[review_df1['label']!='unsup']


review_df = shuffle(review_df)
print("Here are Few Samples in data")
print(review_df.head)

print("Here total number of positive and negative samples")
print(review_df.groupby(['label']).count())

print("Converting pandas dataframe into lists")
texts = review_df['review'].values.tolist()
labels = []
labels_text = []
labels_text_unique = review_df.label.unique().tolist()
labels_text = review_df['label'].values.tolist()

idxCounter = 0
for label in labels_text_unique:
    labels_index[label] = idxCounter
    index_to_label_dict[idxCounter] = label
    idxCounter = idxCounter + 1;

idxCounter = 0    
for label in labels_text:
    print("processing row " + str(idxCounter))
    labels.append(labels_index[label])
    idxCounter = idxCounter + 1;
    

print("Labels Array")
print(len(labels))
print("Labels Dictionary")
print(labels_index)
print("Done")

Processing text dataset
Here are Few Samples in data
<bound method NDFrame.head of                                                 review  label
65   baru download apk tp tidak bisa masuk muncul t...      0
759                 ternyata ada iklannya mengecewakan      0
416      akhirakhir malah g kena buat dengerin lagu si      0
953  musiknya enak meski membutuhkan sinyal sangat ...      1
594  musik bagian kehudipan spotify memberikan kemu...      1
..                                                 ...    ...
382            makasihaq sangat senang pkek apli kasih      1
800                        bisa dengarkan lagu apa aja      1
904  nice bisa dengarkan lagu lagu terbaru lagu has...      1
404           oke sih iklan nya bayakkuragin iklan nya      0
710  lamalama aplikasi kaya joox muter musiknya nga...      0

[1000 rows x 2 columns]>
Here total number of positive and negative samples
       review
label        
0         500
1         500
Converting pandas dataframe into lists
p

  result = method(y)


In [5]:
print("loading model .....")
# load json and create model
json_file = open('C:/Users/hafizmrf3/DataScienceProject/SentimentAnalysis/edx/ms2.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("C:/Users/hafizmrf3/DataScienceProject/SentimentAnalysis/edx/ms2.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
from keras import optimizers
adam = optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False) 
loaded_model.compile(loss='mean_squared_error',
              optimizer=adam,
              metrics=['acc'])

#loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
print("done")

loading model .....

Loaded model from disk
done


In [6]:
#score = loaded_model.evaluate(X, Y, verbose=0)
#print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [7]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

In [96]:
test_texts = ["gue suka banget sama spotify, pokoknya recommended banget untuk dengerin lagu kesukaan loo! thx spotify",
              
              "jelek banget tampilannya, ngebosenin",
              
              "terlalu banyak iklan dan tidak bisa putar lagu yang diinginkan. ga suka sama layanannya",
              
              "mudah digunakan untuk semua kalangan",
              
              "kurang satu fitur, lirik lagunya ga ada :(",
              
              "dimanapun gabut pasti enjoy banget kalo udh dengerin lagu pilihan terbaik dari spotify",
             
              "rating 5 lah buat spotify, oke banget !!!",
              
              "suka banget dengerin podcast-podcast berkualitas dari spotify, tingkatkan terus layanannya yaa :)",
              
              "kecewa banget padahal aku udah bayar untuk premium tapi kenapa ga bisa download yaa",
              
              "KECEWA BANGET KENAPA BISA SEENAK INI LAYANANANNYAA HIYAHIYAHIYA",
              
             ]
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


In [97]:
nn_output = loaded_model.predict(test_data)
print(nn_output.round())
print("=====DETAIL=====")
print(nn_output)

[[0. 1.]
 [1. 0.]
 [1. 0.]
 [0. 1.]
 [1. 0.]
 [0. 1.]
 [0. 1.]
 [0. 1.]
 [1. 0.]
 [0. 1.]]
=====DETAIL=====
[[0.02041851 0.9795815 ]
 [0.88656086 0.11343912]
 [0.98986024 0.01013977]
 [0.4372473  0.56275266]
 [0.99505323 0.00494676]
 [0.00459657 0.9954034 ]
 [0.00531807 0.9946819 ]
 [0.00187237 0.99812764]
 [0.8348816  0.16511843]
 [0.00537503 0.9946249 ]]


In [98]:
i=0
for idx in np.argmax(nn_output, axis=1):
    print("Category: ", index_to_label_dict[idx])
    print("text: " , test_texts[i])
    print("==========================================================================================================================")
    i = i + 1

Category:  1
text:  gue suka banget sama spotify, pokoknya recommended banget untuk dengerin lagu kesukaan loo! thx spotify
Category:  0
text:  jelek banget tampilannya, ngebosenin
Category:  0
text:  terlalu banyak iklan dan tidak bisa putar lagu yang diinginkan. ga suka sama layanannya
Category:  1
text:  mudah digunakan untuk semua kalangan
Category:  0
text:  kurang satu fitur, lirik lagunya ga ada :(
Category:  1
text:  dimanapun gabut pasti enjoy banget kalo udh dengerin lagu pilihan terbaik dari spotify
Category:  1
text:  rating 5 lah buat spotify, oke banget !!!
Category:  1
text:  suka banget dengerin podcast-podcast berkualitas dari spotify, tingkatkan terus layanannya yaa :)
Category:  0
text:  kecewa banget padahal aku udah bayar untuk premium tapi kenapa ga bisa download yaa
Category:  1
text:  KECEWA BANGET KENAPA BISA SEENAK INI LAYANANANNYAA HIYAHIYAHIYA


In [95]:
##buat confusion matrix
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(test_data, nn_output.round())
print(matrix)

ValueError: Classification metrics can't handle a mix of multiclass-multioutput and multilabel-indicator targets