In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container {width:100% !important;} </style>"))

In [2]:
#Importing libraries
import numpy as np
from nltk import word_tokenize, sent_tokenize
import io, re, string, time, datetime, os
from gensim.models import FastText as FT_gensim
import tqdm

In [3]:
from gensim.test.utils import get_tmpfile

In [4]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.stem import LancasterStemmer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Manish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, CuDNNLSTM, Embedding, Activation, Dropout,SpatialDropout1D, Bidirectional, Flatten
from keras.models import Sequential, Model
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical


Using TensorFlow backend.


In [6]:
#Train and test filenames
train_file = 'train_posts.csv'
test_file1 = 'test_split01.csv'
#test_file2 = 'test_split02.csv'

#For testing flow of code with less data
#train_file = 'test_split01.csv'
#test_file1 = 'test_split02.csv'


In [7]:
#Function to load labelled file in dataframe
def load_file(file_name, train_or_val = True):
    f1 = open(file_name, 'rt', encoding="utf8")
    data = []
    target = []
    for line in f1:
        line = line.strip()
        if line and line != '':
          if(train_or_val):
            target.append(int(line[-1]))
            line = line[:-1]
          data.append(line)
    f1.close()
    if(train_or_val):
      return data, target
    else:
      return data

In [8]:
t1= datetime.datetime.now()
print("Loading file: " +str(train_file) + '...')
#train_df = load_file(train_file)
train_data, train_y = load_file(train_file)
t2 = datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))

Loading file: train_posts.csv...
Time taken : 3


In [9]:
len(train_data), len(train_y)

(512629, 512629)

In [10]:
#Basic preprocessing in the content of each blog
def preprocess(content):
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    punctuation_table = str.maketrans('', '', string.punctuation)
    content = content.translate(punctuation_table) #Remove punctuations
    content = content.split()
    content = [word.lower() for word in content] #lowercase all
    content = ['_NUMBER' if word.isnumeric() else word for word in content ] #Replace a number by _NUMBER
    #lancaster=LancasterStemmer() #Initialize Stemmer Class 
    #content = [lancaster.stem(word) for word in content]  #Stemming
    lemmatizer = WordNetLemmatizer() #Initialize Lemmatizer class
    content = [lemmatizer.lemmatize(word) for word in content]   #Lemmatizing
    content = [re_print.sub('', w) for w in content]
    return content

In [11]:
#print( 'Unprocessed sample: '  +str (train_data[0])) #Unprocessed sample
#print( 'Preprocessed sample: '  +str (preprocess(train_data[0]))) #Preprocessed sample

In [12]:
#Preprocessing complete train data:
t1= datetime.datetime.now()
print('Preprocessing...')
preprocess_data = []
for i in range(len(train_data)):
  preprocess_data.append(preprocess(train_data[i])) #Preprocessed content

t2= datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))
# Preprocessed train data
#train_data[0:2],preprocess_data[0:2], train_y[0:2]

Preprocessing...
Time taken : 303


In [13]:
#Training Gensim Fasttext model for learning word embeddings
#train_content = './train_content.txt' #Used to read from file
#del train_data
fname = "fasttext.model"
if(os.path.exists(fname)):
  model_gensim = FT_gensim.load(fname)
else:
  t1= datetime.datetime.now()
  print('Building Gensim Fasttext model for word embeddings...')
  model_gensim = FT_gensim(size = 100)  #Instantiate gensim fasttext model
  #model_gensim.build_vocab(corpus_file= train_content)
  model_gensim.build_vocab(sentences = preprocess_data)
  model_gensim.train(
    #corpus_file=train_content,
    sentences = preprocess_data ,
    epochs=model_gensim.epochs,
    total_examples=model_gensim.corpus_count, 
    total_words=model_gensim.corpus_total_words
    )
  t2= datetime.datetime.now()
  print('Time taken : ' +str((t2-t1).seconds))

  print('Saving gensim Fasttext model to disk...')
  model_gensim.save(fname)
  print('Done.')

In [14]:
print('Gensim fasttext model loaded: '  +str(model_gensim))

Gensim fasttext model loaded: FastText(vocab=139925, size=100, alpha=0.025)


In [15]:
#print('problem' in model_gensim.wv.vocab)

In [16]:
#model_gensim.wv.most_similar(positive = 'jayhawks')

In [15]:
##Load and preprocess test data
t1= datetime.datetime.now()
print("Loading file: " +str(test_file1) + '...')
test_data1, test_y1 = load_file(test_file1)
t2 = datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))

#Preprocessing complete train data:
t1= datetime.datetime.now()
print('Preprocessing...')
preprocess_test1 = []
for i in range(len(test_data1)):
  preprocess_test1.append(preprocess(test_data1[i])) #Preprocessed content

t2= datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))
del test_data1

Loading file: test_split01.csv...
Time taken : 0
Preprocessing...
Time taken : 6


In [16]:
#To get average length of content
def averageLen(lst):
    lengths = [len(i) for i in lst]
    return 0 if len(lengths) == 0 else (float(sum(lengths)) / len(lengths)) 

In [17]:
print('Average length of contents in ' +str(train_file) +' is: ' +str (averageLen(preprocess_data)))

Average length of contents in train_posts.csv is: 184.3809129019232


In [18]:
CONTENT_LEN = 300 ##Content length to work with

In [19]:
#word_index = {t[0]: i+1 for i,t in enumerate(vocab.most_common(MAX_NB_WORDS))}
word_index = {word: idx+1 for idx,word in enumerate(model_gensim.wv.vocab)}
train_seq = [[word_index.get(word, 0) for word in content] for content in preprocess_data]
test_seq1 = [[word_index.get(word, 0)  for word in content] for content in preprocess_test1]

In [20]:
MAX_NB_WORDS = len(model_gensim.wv.vocab)
MAX_SEQUENCE_LENGTH = 300

In [21]:
# pad sequences
train_data = pad_sequences(train_seq, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

In [22]:
train_y = to_categorical(train_y)

In [23]:
print('Shape of data tensor:', train_data.shape)
print('Shape of label tensor:', train_y.shape)

Shape of data tensor: (512629, 300)
Shape of label tensor: (512629, 3)


In [24]:
test_data1 = pad_sequences(test_seq1, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
print('Shape of test_data tensor:', test_data1.shape)

Shape of test_data tensor: (10000, 300)


In [25]:
WV_DIM = 100
nb_words = min(MAX_NB_WORDS, len(model_gensim.wv.vocab))
#We initialize the matrix with random numbers
wv_matrix = (np.random.rand(nb_words, WV_DIM) - 0.5) / 5.0
for word, i in word_index.items():
    if i >= MAX_NB_WORDS:
        continue
    try:
        embedding_vector = model_gensim.wv[word]
        # words not found in embedding index will be all-zeros.
        wv_matrix[i] = embedding_vector
    except:
        pass

In [27]:
del preprocess_data
del preprocess_test1
del train_seq
del test_seq1
del word_index

NameError: name 'preprocess_data' is not defined

In [29]:
import gc
gc.collect()

48

In [None]:

# Inputs
seq_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

wv_layer = Embedding(nb_words,
                     WV_DIM,
                     mask_zero=False,
                     weights=[wv_matrix],
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=False)

embedded_sequences = wv_layer(seq_input)

# biLSTM
embedded_sequences = SpatialDropout1D(0.2)(embedded_sequences)
x = Bidirectional(LSTM(64, return_sequences=False, dropout = 0.4))(embedded_sequences)
#x = Bidirectional(LSTM(128, return_sequences=False, dropout=0.2, recurrent_dropout=0.2))(embedded_sequences)

#x =  LSTM(32, return_sequences=True)(embedded_sequences)  # returns a sequence of vectors of dimension 32
#x = LSTM(32, return_sequences=True)(x)  # returns a sequence of vectors of dimension 32
x = LSTM(32)(x)
'''
x = Dense(32, activation="relu")(x)
x = Dropout(0.40)(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.40)(x)
x = Dense(4, activation="relu")(x)
x = Dropout(0.40)(x)
x = Dense(2, activation="relu")(x)
x = Dropout(0.40)(x)'''

x = Dense(3, activation="sigmoid")(x)


              

In [None]:
# Build the model
model = Model(inputs=[seq_input], outputs=x)
#model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.001, clipnorm=.25, beta_1=0.7, beta_2=0.99), metrics=['accuracy'])
#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
hist = model.fit([train_data], train_y, epochs=10, batch_size=128, shuffle=True)

In [None]:
loss, accuracy = model.evaluate([train_data], train_y, verbose=1)
print('Accuracy: %f' % (accuracy*100))
print(loss)

In [None]:
model.evaluate([test_data1], train_y = to_categorical(test_y1), verbose=1)

In [None]:
print('Accuracy of last epoch: ' +str(hist.history.get('acc')[-1]))

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
history = pd.DataFrame(hist.history)
plt.figure(figsize=(12,12));
plt.plot(history["loss"]);
plt.plot(history["val_loss"]);
plt.title("Loss with pretrained word vectors");
plt.show();

**Running on test data**

In [None]:
#https://drive.google.com/open?id=17yV2rdCXy-xnVPRN6_Jqhbx7gsuTtKlI  #Location of test file
download_test = drive.CreateFile({'id': '1y-FLjkLXFzHtDVo24S0-bEBZ6ECqro8Z'})
download_test.GetContentFile('test_mystere.csv')
test_mystere = 'test_mystere.csv'

In [None]:
##Load and preprocess test data
t1= datetime.datetime.now()
print("Loading file: " +str(test_mystere) + '...')
test_data1, test_y1 = load_file(test_mystere)
t2 = datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))

#Preprocessing complete train data:
t1= datetime.datetime.now()
print('Preprocessing...')
preprocess_test1 = []
for i in range(len(test_data1)):
  preprocess_test1.append(preprocess(test_data1[i])) #Preprocessed content

t2= datetime.datetime.now()
print('Time taken : ' +str((t2-t1).seconds))
del test_data1

In [None]:
#Get word index and pad test data
test_seq1 = [[word_index.get(word, 0)  for word in content] for content in preprocess_test1]
test_data1 = pad_sequences(test_seq1, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
print('Shape of test_data tensor:', test_data1.shape)

In [None]:
#Prediction
pred = model.predict(test_data1)


In [None]:
##Convert matrix output to labels 
y_pred = []
for item in pred:
    y_pred.append(np.where(item == 1)[0][0])

In [None]:
print(y_pred)