<a href="https://colab.research.google.com/github/juliabiswas/politics-bot/blob/master/politics-bot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Importing the Libraries

In [58]:
#importing software libraries
from numpy import array
import pandas as pd
import csv
import string
import random as rand
from pickle import dump, load
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding
from keras.callbacks import EarlyStopping

##Loading the Data

In [59]:
#accessing drive
#from google.colab import drive
#drive.mount('/content/drive', force_remount=True)

In [60]:
#loading data and creates a list with each comment as an element
comments = []
data = pd.read_csv('https://raw.githubusercontent.com/juliabiswas/politics-bot/master/comments.csv')
data.columns = ['comment']
comments = data['comment']

##Cleaning the Data

In [61]:
#cleaning each comment
def clean(comment):
  comment = comment.replace('--', ' ')
  tokens = comment.split()
  table = str.maketrans('', '', string.punctuation)
  tokens = [w.translate(table) for w in tokens]
  tokens = [word for word in tokens if word.isalpha()]
  tokens = [word.lower() for word in tokens]
  return tokens
 
#organizing into sequences
sequences = []
length = 15 #14 tokens will be the input and 1 token will be the output
for comment in comments: #each comment is split into sequences
  tokens = clean(comment)
  for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)

## Training the Model

In [None]:
#encoding sequences as integers
stored_sequences = sequences #saves a copy of sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
sequences = tokenizer.texts_to_sequences(sequences)

#calculating vocab size
vocab_size = len(tokenizer.word_index) + 1
 
#separating into input and output
sequences = array(sequences)
x, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = x.shape[1]
 
#defining the model
mod = Sequential()
mod.add(Embedding(vocab_size, 50, input_length=seq_length))
mod.add(LSTM(100, return_sequences=True))
mod.add(LSTM(100, return_sequences=True))
mod.add(LSTM(100))
mod.add(Dense(100, activation='relu'))
mod.add(Dense(vocab_size, activation='softmax'))
print(mod.summary())

#compiling the model
mod.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

#fitting the model
mod.fit(x, y, batch_size=128, epochs=200)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 14, 50)            244400    
_________________________________________________________________
lstm_3 (LSTM)                (None, 14, 100)           60400     
_________________________________________________________________
lstm_4 (LSTM)                (None, 14, 100)           80400     
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_2 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_3 (Dense)              (None, 4888)              493688    
Total params: 969,388
Trainable params: 969,388
Non-trainable params: 0
________________________________________________

##Using the Model

In [None]:
#generating a comment using the model
def generate(mod, tokenizer, seq_length, seed_comment, n_words):
	result = list()
	in_comment = seed_comment
	for i in range(0, n_words):
		#text to integer
		encoded = tokenizer.texts_to_sequences([in_comment])[0]
		#truncating to a fixed length
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
		#predicting probabilities
		prob = mod.predict_classes(encoded, verbose=0)
		#word index to word
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == prob:
				out_word = word
				break
		#adding to input
		in_comment += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)
 
#calculating the sequence length
seq_length = len(stored_sequences[0].split()) - 1

#selecting a seed comment
seed_comment = stored_sequences[rand.randint(0,len(stored_sequences))]
 
#generating a new comment
generated = generate(mod, tokenizer, seq_length, seed_comment, 14)
print(generated)
