In [81]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [82]:

import nltk

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Activation,Dropout
from keras.layers.embeddings import Embedding
from keras.models import load_model
from gensim.models import Word2Vec

In [83]:
# Fetch NLTK's 'punkt' tokenizer data (a no-op when it is already up to date).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [84]:
# Load the train and test splits; the cells below read their 'text' and
# 'political_party' columns.
train_data = pd.read_csv('df_convote_train.csv')
test_data = pd.read_csv('df_convote_test.csv')

In [85]:
# Labels for the training split.
y = train_data['political_party'].values

# One token list per training document: split into sentences, lowercase each
# sentence, word-tokenize it, and keep only tokens longer than one character.
X = [
    [
        token.strip()
        for sentence in nltk.sent_tokenize(document)
        for token in nltk.tokenize.word_tokenize(sentence.lower())
        if len(token) > 1
    ]
    for document in train_data["text"].values
]

In [86]:
# Labels for the test split.
y_test = test_data['political_party'].values

# One token list per test document, built exactly like the training tokens:
# sentence split -> lowercase -> word tokenize -> drop one-character tokens.
X_test = [
    [
        token.strip()
        for sentence in nltk.sent_tokenize(document)
        for token in nltk.word_tokenize(sentence.lower())
        if len(token) > 1
    ]
    for document in test_data["text"].values
]

In [87]:
# Concatenate the tokenized train and test documents (list + list); this
# combined list feeds the Tokenizer fit and the Word2Vec training below.
X_total = X + X_test


In [88]:
# Build the word -> integer id mapping over train + test documents.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_total)
word_index_Xtotal = tokenizer.word_index

# Integer-encoded copy of every document under that mapping.
X_temp = tokenizer.texts_to_sequences(X_total)

In [89]:
# Number of distinct tokens indexed across train + test.
len(word_index_Xtotal)

29374

In [90]:
import gensim

In [91]:
# Width of every word vector produced by Word2Vec.
EMBEDDING_DIM = 300

# Train Word2Vec on all tokenized documents (slow). `size` is the gensim 3.x
# keyword for the vector width; min_count=1 keeps even words seen only once.
w2v_model = Word2Vec(
    sentences=X_total,
    size=EMBEDDING_DIM,
    window=5,
    min_count=1,
)

In [67]:
# Number of words that received a vector (`wv.vocab` is the gensim 3.x attribute).
len(w2v_model.wv.vocab)

25462

In [68]:
# Encode the training documents with a vocabulary fitted on train + test
# (X_total). The embedding matrix is built from word_index_Xtotal, so the ids
# fed to the model must come from the same fit: a Tokenizer fitted on X alone
# assigns different ids, and every token would then look up some other word's
# vector in the embedding layer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_total)

# Replace each token list by its list of integer ids.
X = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index

In [69]:
maxlen = 1000  # fixed sequence length, in token ids per text

# Pad or truncate every encoded training text to exactly maxlen token ids
X = pad_sequences(X, maxlen=maxlen)

In [70]:
# One embedding row per word id, plus row 0, which the Tokenizer never assigns to a word.
vocab_size = len(word_index_Xtotal) + 1
vocab_size

25463

In [71]:
# Function to create weight matrix from word2vec gensim model
def get_weight_matrix(model, vocab):
    """Build an embedding weight matrix aligned with a word -> id mapping.

    Parameters
    ----------
    model
        A trained gensim Word2Vec model, or its keyed vectors directly.
    vocab : dict
        Mapping of word -> positive integer id (e.g. ``Tokenizer.word_index``).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, vector_size)``. Row ``i`` holds the
        vector of the word whose id is ``i``. Row 0 and the rows of words that
        have no vector are left as zeros.
    """
    # Read vectors through `.wv`: indexing the model itself (`model[word]`) is
    # deprecated in gensim 3.x and no longer supported in gensim 4.
    vectors = getattr(model, "wv", model)
    # total vocabulary size plus row 0, which no word id maps to
    vocab_size = len(vocab) + 1
    # start from all zeros so unknown words keep a zero vector
    weight_matrix = np.zeros((vocab_size, vectors.vector_size))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        # skip words without a vector instead of raising KeyError
        if word in vectors:
            weight_matrix[i] = vectors[word]
    return weight_matrix

In [72]:
# Row i of this matrix is the Word2Vec vector of the word whose id is i in word_index_Xtotal.
embedding_vectors = get_weight_matrix(w2v_model, word_index_Xtotal)


  if __name__ == '__main__':


In [78]:
# Binary classifier: frozen pre-trained embedding -> LSTM -> sigmoid unit.
model = Sequential([
    # Embedding initialised from the Word2Vec matrix and kept non-trainable
    Embedding(
        vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_vectors],
        input_length=maxlen,
        trainable=False,
    ),
    # LSTM whose hidden size is set to the same value as maxlen
    LSTM(units=maxlen),
    # Single sigmoid output for the two-class label
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])


In [79]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 1000, 300)         7638900   
                                                                 
 lstm_3 (LSTM)               (None, 1000)              5204000   
                                                                 
 dense_4 (Dense)             (None, 1)                 1001      
                                                                 
Total params: 12,843,901
Trainable params: 5,205,001
Non-trainable params: 7,638,900
_________________________________________________________________


In [80]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(X, y,test_size=0.1,random_state=42) 
# Train on the remaining 90%. X_val / y_val are not passed to fit(), so no
# validation metrics are reported per epoch; they are only scored after training.
history = model.fit(X_train,y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

KeyboardInterrupt: ignored

In [None]:
# Encode the test documents with the SAME tokenizer that encoded the training
# data. A fresh Tokenizer fitted on X_test alone would assign its own ids, so
# the model would receive ids that stand for different words than the ones it
# was trained on. Words the tokenizer has never seen are dropped from the
# encoded sequences.
X_test = tokenizer.texts_to_sequences(X_test)
# Kept for compatibility with the original cell; it is the shared vocabulary.
word_index_test = tokenizer.word_index

In [None]:
maxlen = 1000  # same value as the training cell; the model was built with input_length=maxlen

# Pad or truncate every encoded test text to exactly maxlen token ids
X_test = pad_sequences(X_test, maxlen=maxlen)

In [None]:
# Sigmoid outputs for the validation and test inputs.
val_probabilities = model.predict(X_val)
test_probabilities = model.predict(X_test)

# Threshold at 0.5 to obtain hard 0/1 class predictions.
y_pred_val = (val_probabilities >= 0.5).astype("int")
y_pred = (test_probabilities >= 0.5).astype("int")

In [None]:
from sklearn.metrics import classification_report, accuracy_score
# Accuracy on the held-out validation split.
accuracy_score(y_val, y_pred_val)

In [None]:
# Accuracy on the test set.
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 on the test set.
print(classification_report(y_test, y_pred))