In [1]:
#imports for data processing
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import unicodedata
import sys
from tqdm import tqdm
import pandas as pd
import os
from random import shuffle
import string

#imports for nural network
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
from keras.models import load_model

class SetData():
    def __init__(self):
        self.num_classes = 0
        self.classes = []
        self.train_data = None
        self.test_data = None
        self.train_data_dir = '/Users/arunprakash/Documents/TensorFlow/MovieData/train'
        self.test_data_dir = '/Users/arunprakash/Documents/TensorFlow/MovieData/test'
        for _, dirnames, _ in os.walk(self.train_data_dir):
            self.num_classes += len(dirnames)
            if len(dirnames)>0:
                self.classes = dirnames
        self.stemmer = LancasterStemmer()
        self.tbl = str.maketrans({key: None for key in string.punctuation})

    def one_hot_classes(self,clsname):
        ohe = []
        for i in self.classes:
            if i == clsname:
                ohe.append(1)
            else:
                ohe.append(0)
        return ohe

    def remove_punctuation(self,text):
        return text.translate(self.tbl)


    def train_data_with_label(self):
        train_text = []
        tr_words = []
        for cls in self.classes:
            train_data_path = self.train_data_dir+ '/' + cls
            for i in tqdm(os.listdir(train_data_path)):
                if i != '.DS_Store':
                    path = os.path.join(train_data_path, i)
                    df = pd.read_csv(path,delimiter='\n')
                    dfnp = df.values
                    for dlg in dfnp:
                        dlg = self.remove_punctuation(dlg[0])
                        dlg_tkn = nltk.word_tokenize(dlg)
                        tr_words.extend(dlg_tkn)
                        train_text.append((dlg_tkn, self.one_hot_classes(cls)))
        tr_words = [self.stemmer.stem(w.lower()) for w in tr_words]
        tr_words = sorted(list(set(tr_words)))
        shuffle(train_text)
        np.save('train_text.npy', train_text)
        np.save('train_text_wrds.npy', tr_words)
        return train_text, tr_words

    def test_data_with_label(self):
        test_text = []
        tst_words = []
        for cls in self.classes:
            test_data_path = self.test_data_dir+ '/' + cls
            for i in tqdm(os.listdir(test_data_path)):
                if i != '.DS_Store':
                    path = os.path.join(test_data_path, i)
                    df = pd.read_csv(path,delimiter='\n')
                    dfnp = df.values
                    for dlg in dfnp:
                        dlg = self.remove_punctuation(dlg[0])
                        dlg_tkn = nltk.word_tokenize(dlg)
                        tst_words.extend(dlg_tkn)
                        test_text.append((dlg_tkn, self.one_hot_classes(cls)))
        tst_words = [self.stemmer.stem(w.lower()) for w in tst_words]
        tst_words = sorted(list(set(tst_words)))
        np.save('test_text.npy', test_text)
        np.save('test_text_wrds.npy', tst_words)
        return test_text,tst_words

    def bag_of_words(self, data,wrds,data_type='train'):
        model_data = []
        batch_data = [] #for saving the bow as batches to overcome memory issue
        cnt=1
        for i in data:
            bow = []
            tokenized = i[0]            
            tokenized = [self.stemmer.stem(word.lower()) for word in tokenized]
            for w in wrds:
                bow.append(1) if w in tokenized else bow.append(0)
            model_data.append([bow, i[1]])
            batch_data.append([bow, i[1]])
            if cnt%5000 == 0:
                print('processing {}'.format(cnt))
                if cnt%50000 == 0:
                    batch_res = np.array(batch_data)
                    model_name = 'model_data_'+ data_type + str(cnt) +'_part.npy'
                    np.save(model_name, batch_res)
                    batch_data = []
            cnt += 1
        res = np.array(model_data)
        batch_res = np.array(batch_data)
        model_name = 'model_data_'+ data_type +'last_part.npy'
        np.save(model_name, batch_res)
        return res



  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
sd = SetData()

In [3]:
#Initialize data set
train_data, tr_words = sd.train_data_with_label()
test_data, tst_words = sd.test_data_with_label()

#Load data
#train_data = np.load('train_text.npy')
#tr_words = np.load('train_text_wrds.npy')
#test_data = np.load('test_text.npy')
#tst_words = np.load('test_text_wrds.npy')

100%|██████████| 21/21 [00:04<00:00,  4.75it/s]
100%|██████████| 19/19 [00:02<00:00,  6.58it/s]
100%|██████████| 22/22 [00:02<00:00,  7.79it/s]
100%|██████████| 21/21 [00:03<00:00,  6.82it/s]
100%|██████████| 21/21 [00:03<00:00,  5.75it/s]
100%|██████████| 6/6 [00:01<00:00,  4.51it/s]
100%|██████████| 4/4 [00:00<00:00,  6.40it/s]
100%|██████████| 7/7 [00:00<00:00,  9.07it/s]
100%|██████████| 6/6 [00:00<00:00, 11.27it/s]
100%|██████████| 6/6 [00:01<00:00,  5.49it/s]


In [5]:
training = sd.bag_of_words(train_data,tr_words,'train')
#training = np.load('model_data_50000train.npy')

training_data = list(training[:, 0])
training_label = list(training[:, 1])

processing 5000
processing 10000
processing 15000
processing 20000
processing 25000
processing 30000
processing 35000
processing 40000
processing 45000
processing 50000
processing 55000
processing 60000
processing 65000
processing 70000
processing 75000
processing 80000
processing 85000
processing 90000
processing 95000
processing 100000
processing 105000
processing 110000
processing 115000
processing 120000
processing 125000
processing 130000
processing 135000
processing 140000
processing 145000
processing 150000
processing 155000
processing 160000
processing 165000
processing 170000


In [7]:
testing = sd.bag_of_words(test_data,tr_words,'test')# here also we have to pass the training words
#testing = np.load('model_data_test.npy')

testing_data = list(testing[:, 0])
testing_label = list(testing[:, 1])

processing 5000
processing 10000
processing 15000
processing 20000
processing 25000
processing 30000
processing 35000
processing 40000
processing 45000


In [8]:
#tf.reset_default_graph()
#ANN
model = Sequential()

model.add(InputLayer(input_shape=[len(training_data[0])]))#keras will internally add batch dimention
model.add(Dense(128,activation='relu'))
model.add(Dense(128,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(32,activation='relu'))
model.add(Dropout(rate=0.5))
model.add(Dense(sd.num_classes,activation='softmax'))
optimizer = Adam(lr=1e-3)
model.compile(optimizer=optimizer,loss='categorical_crossentropy',metrics=['accuracy'])
model.fit(x=np.array(training_data),y=np.array(training_label),epochs=100,batch_size=200)
model.summary()

model.save('my_classify_model.h5')

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
loss_and_metrics = model.evaluate(np.array(testing_data), np.array(testing_label), batch_size=100)
print(loss_and_metrics)

In [None]:
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import unicodedata
import sys
from tqdm import tqdm
import pandas as pd
import os
from random import shuffle
import string

#imports for nural network
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import *
from keras.models import load_model

In [None]:
model = load_model('my_classify_model.h5')

In [None]:
sd = SetData()

In [None]:
a =np.array(testing_data[1])

In [10]:
categories = sd.classes

In [None]:
a.reshape(15685,)

In [75]:
print(categories[np.argmax(model.predict(np.array(testing_data[40000:40001])))])

drama


In [None]:
model.predict(np.array(training_data[25000:25001]))

In [58]:
np.array(testing_data[40000:40001]).shape

(1, 15685)

In [25]:
testing_label[39999:40000]

[[0, 0, 0, 0, 1]]

In [None]:
testing_data[40000:40001]

In [6]:
len(training_data)

172630

In [27]:
categories

['comedy', 'sciencefic', 'horror', 'action', 'drama']

In [44]:
def get_bow_for_new_sub(path):
    test_text = []
    df = pd.read_csv(path,delimiter='\n')
    dfnp = df.values
    for dlg in dfnp:
        dlg = sd.remove_punctuation(dlg[0])
        dlg_tkn = nltk.word_tokenize(dlg)
        test_text.append((dlg_tkn,[0]))
    return test_text

In [45]:
td = get_bow_for_new_sub('/Users/arunprakash/Documents/TensorFlow/MovieData/test/action/The.Rock.1996.720p.DVD9.BluRay.x264-SEPTiC_eng.txt')

In [46]:
td_bow = sd.bag_of_words(td,tr_words,data_type='my_test')

In [70]:
tdb = list(td_bow[:,0])

In [88]:
len(tdb)

2630

In [93]:
comedy = 0
sciencefic = 0
drama = 0
action = 0
horror = 0
for i in range(len(tdb)):
    pred = categories[np.argmax(model.predict(np.array(tdb[i:i+1])))]
    if pred == 'comedy':
        comedy += 1
    elif pred == 'sciencefic':
        sciencefic += 1
    elif pred == 'horror':
        horror += 1
    elif pred == 'action':
        action += 1
    elif pred == 'drama':
        drama += 1
print('comedy: '+ str(comedy) + ' sciencefic: '+ str(sciencefic) + ' horror: '+ str(horror) + ' action: '+ str(action) + ' drama: '+ str(drama))

comedy: 744 sciencefic: 458 horror: 359 action: 487 drama: 582


In [95]:
for cls in sd.classes:
    test_data_path = sd.test_data_dir+ '/' + cls
    print(cls+': ')
    for i in tqdm(os.listdir(test_data_path)):
        comedy = 0
        sciencefic = 0
        drama = 0
        action = 0
        horror = 0
        if i != '.DS_Store':
            #path = os.path.join(test_data_path, i)
            td = get_bow_for_new_sub(os.path.join(test_data_path, i))
            td_bow = sd.bag_of_words(td,tr_words,data_type='my_test')
            tdb = list(td_bow[:,0])
            for i in range(len(tdb)):
                pred = categories[np.argmax(model.predict(np.array(tdb[i:i+1])))]
                if pred == 'comedy':
                    comedy += 1
                elif pred == 'sciencefic':
                    sciencefic += 1
                elif pred == 'horror':
                    horror += 1
                elif pred == 'action':
                    action += 1
                elif pred == 'drama':
                    drama += 1
        print('comedy: '+ str(comedy) + ' sciencefic: '+ str(sciencefic) + ' horror: '+ str(horror) + ' action: '+ str(action) + ' drama: '+ str(drama))

  0%|          | 0/6 [00:00<?, ?it/s]

comedy: 


 17%|█▋        | 1/6 [00:15<01:17, 15.59s/it]

comedy: 1074 sciencefic: 433 horror: 332 action: 336 drama: 387
comedy: 0 sciencefic: 0 horror: 0 action: 0 drama: 0


 50%|█████     | 3/6 [00:26<00:26,  8.96s/it]

comedy: 642 sciencefic: 277 horror: 281 action: 246 drama: 392


 67%|██████▋   | 4/6 [00:42<00:21, 10.72s/it]

comedy: 972 sciencefic: 345 horror: 449 action: 363 drama: 529


 83%|████████▎ | 5/6 [01:04<00:12, 12.82s/it]

comedy: 1488 sciencefic: 413 horror: 537 action: 489 drama: 568


100%|██████████| 6/6 [01:28<00:00, 14.83s/it]
  0%|          | 0/4 [00:00<?, ?it/s]

comedy: 1546 sciencefic: 735 horror: 564 action: 580 drama: 660
sciencefic: 
comedy: 0 sciencefic: 0 horror: 0 action: 0 drama: 0


 50%|█████     | 2/4 [03:46<03:46, 113.21s/it]

comedy: 287 sciencefic: 152 horror: 226 action: 131 drama: 302


 75%|███████▌  | 3/4 [04:03<01:21, 81.11s/it] 

comedy: 699 sciencefic: 411 horror: 398 action: 359 drama: 806


100%|██████████| 4/4 [04:12<00:00, 63.10s/it]
  0%|          | 0/7 [00:00<?, ?it/s]

comedy: 328 sciencefic: 287 horror: 257 action: 252 drama: 272
horror: 


 14%|█▍        | 1/7 [00:08<00:52,  8.70s/it]

comedy: 584 sciencefic: 148 horror: 230 action: 195 drama: 193
comedy: 0 sciencefic: 0 horror: 0 action: 0 drama: 0


 43%|████▎     | 3/7 [00:19<00:25,  6.34s/it]

comedy: 444 sciencefic: 257 horror: 299 action: 201 drama: 396


 57%|█████▋    | 4/7 [00:25<00:18,  6.29s/it]

comedy: 417 sciencefic: 109 horror: 187 action: 131 drama: 187


 71%|███████▏  | 5/7 [00:36<00:14,  7.33s/it]

comedy: 191 sciencefic: 43 horror: 1557 action: 11 drama: 63


 86%|████████▌ | 6/7 [00:48<00:08,  8.02s/it]

comedy: 501 sciencefic: 244 horror: 414 action: 267 drama: 454


100%|██████████| 7/7 [00:50<00:00,  7.19s/it]
  0%|          | 0/6 [00:00<?, ?it/s]

comedy: 67 sciencefic: 42 horror: 124 action: 36 drama: 74
action: 


 17%|█▋        | 1/6 [00:05<00:29,  5.88s/it]

comedy: 323 sciencefic: 139 horror: 169 action: 136 drama: 223
comedy: 0 sciencefic: 0 horror: 0 action: 0 drama: 0


 50%|█████     | 3/6 [00:13<00:13,  4.52s/it]

comedy: 425 sciencefic: 184 horror: 203 action: 221 drama: 272
comedy: 1 sciencefic: 0 horror: 1 action: 0 drama: 0


 83%|████████▎ | 5/6 [00:29<00:05,  5.83s/it]

comedy: 744 sciencefic: 458 horror: 359 action: 487 drama: 582


100%|██████████| 6/6 [00:33<00:00,  5.66s/it]
  0%|          | 0/6 [00:00<?, ?it/s]

comedy: 215 sciencefic: 114 horror: 144 action: 125 drama: 166
drama: 
comedy: 0 sciencefic: 0 horror: 0 action: 0 drama: 0


 33%|███▎      | 2/6 [00:09<00:18,  4.67s/it]

comedy: 348 sciencefic: 249 horror: 256 action: 249 drama: 494


 50%|█████     | 3/6 [00:22<00:22,  7.34s/it]

comedy: 498 sciencefic: 354 horror: 308 action: 321 drama: 550


 67%|██████▋   | 4/6 [00:37<00:18,  9.44s/it]

comedy: 839 sciencefic: 373 horror: 447 action: 433 drama: 555


 83%|████████▎ | 5/6 [00:54<00:10, 10.92s/it]

comedy: 819 sciencefic: 340 horror: 354 action: 404 drama: 762


100%|██████████| 6/6 [01:10<00:00, 11.82s/it]

comedy: 691 sciencefic: 461 horror: 422 action: 417 drama: 764





In [1]:
import string

In [4]:
train_data

NameError: name 'train_data' is not defined