# Práctica NLG

### Importamos los paquetes

In [26]:
import gensim
import re
import glob
import numpy as np
import pandas as pd
from copy import deepcopy

In [27]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.utils import np_utils

### Cargamos los datos

In [28]:
# Read the dataset CSV; the path is relative to the notebook's working directory.
df = pd.read_csv('../../archivos/BarackObama.csv')

In [29]:
# Drop the metadata columns so that only the tweet text remains.
# Rebinding `df` (instead of inplace=True) combined with errors='ignore' makes
# this cell safe to re-run: a second execution finds the columns already gone
# and becomes a no-op instead of raising KeyError.
df = df.drop(
    columns=['Unnamed: 0', 'date', 'id', 'link', 'retweet', 'author'],
    errors='ignore',
)

In [30]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,text
0,Denying climate change is dangerous. Join @OFA...
1,The American Bar Association gave Judge Garlan...
2,We need a fully functional Supreme Court. Edit...
3,"Cynics, take note: When we #ActOnClimate, we b..."
4,"""That’s how we will overcome the challenges we..."


In [31]:
# Build the training corpus: sample up to 6000 tweets, strip every character
# that is not in the whitelist below, and join the cleaned tweets with newlines.
# NOTE: '/', '#' and '@' are not whitelisted, so URLs, hashtags and mentions
# lose those characters in the corpus.
# A fixed random_state keeps the sample (and therefore the vocabulary and all
# downstream outputs) reproducible across kernel restarts; capping n at len(df)
# avoids the ValueError that sample() raises when asked for more rows than exist.
text = "\n".join(
    re.sub('[^a-zA-Z0-9!?\',.:" ]+', '', tweet)
    for tweet in df.sample(n=min(6000, len(df)), random_state=42).text
)

In [32]:
# Show only the beginning of the corpus; printing all of it floods the
# notebook output with thousands of lines.
print(text[:1000])

If you believe in affordable health care, keep up the fight in 2014: http:ofa.bobE
"For nonviolent drug crimes, we need to lower long mandatory minimum sentencesor get rid of them entirely." President Obama
Welcome POTUS, the official government account of the President of the United States.
One of our favorite photos from this year: 30,000 strong in Wisconsin.pic.twitter.combufskgX6
Your free chance to meet the President. No catch, just sign up here:http:OFA.BO5cN6Vd
Add your name if you think the Senate should do its job: http:ofa.boi9yw DoYourJobpic.twitter.comDNxgqTSRTN
Its time to hold the big banks accountable to the people they serve. Show your support for Wall Street reform: http:j.mp95pyHM
There are no red states or blue states, just the United States.pic.twitter.comxm7EFGjK
"Affordable health care is not some privilege just for the few, it's a basic right everybody should be able to enjoy." President Obama
President Obama on today's EqualPay vote: "Senate Republicans put part

In [33]:
# Character vocabulary: every distinct character of the corpus, sorted so the
# integer code assigned to each character is deterministic for a given corpus.
characters = sorted(set(text))
# Two lookup tables: integer code -> character, and character -> integer code.
n_to_char = dict(enumerate(characters))
char_to_n = {char: n for n, char in n_to_char.items()}

print('Unique chars: {}'.format(len(characters)))

Unique chars: 71


In [34]:
# List the vocabulary; a character's position in this list is its integer code.
print(characters)

['\n', ' ', '!', '"', "'", ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


### Preprocesamos los datos

In [35]:
# Slide a window of seq_length characters over the corpus, one step at a time.
# X holds each window encoded as integer codes; Y holds the code of the single
# character that immediately follows the corresponding window.
seq_length = 40
length = len(text)
X = [
    [char_to_n[char] for char in text[start:start + seq_length]]
    for start in range(length - seq_length)
]
Y = [char_to_n[text[start + seq_length]] for start in range(length - seq_length)]

In [36]:
# Shape the integer windows as (samples, seq_length, 1) and scale the character
# codes into [0, 1) by dividing by the vocabulary size.
X_modified = np.reshape(X, (len(X), seq_length, 1)) / float(len(characters))
# One-hot encode the targets. num_classes is passed explicitly so the label
# width always equals the vocabulary size; when left to be inferred it is
# max(Y) + 1, which is too narrow whenever the highest-coded character never
# occurs as a label.
Y_modified = np_utils.to_categorical(Y, num_classes=len(characters))

### Creamos los modelos

In [37]:
# Model 1: two stacked 1000-unit LSTM layers, each followed by Dropout(0.2),
# feeding a softmax layer with one output per column of Y_modified.
model_1 = Sequential([
    LSTM(1000, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(1000),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model_1.compile(optimizer='adam', loss='categorical_crossentropy')
model_1.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_10 (LSTM)               (None, 40, 1000)          4008000   
_________________________________________________________________
dropout_10 (Dropout)         (None, 40, 1000)          0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 1000)              8004000   
_________________________________________________________________
dropout_11 (Dropout)         (None, 1000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 71)                71071     
Total params: 12,083,071
Trainable params: 12,083,071
Non-trainable params: 0
_________________________________________________________________


In [38]:
# Model 2: same two-LSTM layout as model 1 but with 2000 units per layer,
# each followed by Dropout(0.2), ending in a softmax over Y_modified's columns.
model_2 = Sequential([
    LSTM(2000, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(2000),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model_2.compile(optimizer='adam', loss='categorical_crossentropy')
model_2.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 40, 2000)          16016000  
_________________________________________________________________
dropout_12 (Dropout)         (None, 40, 2000)          0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 2000)              32008000  
_________________________________________________________________
dropout_13 (Dropout)         (None, 2000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 71)                142071    
Total params: 48,166,071
Trainable params: 48,166,071
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Model 3: three stacked 2000-unit LSTM layers (the first two return full
# sequences so the next LSTM receives one vector per timestep), each followed
# by Dropout(0.2), ending in a softmax over Y_modified's columns.
model_3 = Sequential([
    LSTM(2000, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(2000, return_sequences=True),
    Dropout(0.2),
    LSTM(2000),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model_3.compile(optimizer='adam', loss='categorical_crossentropy')
model_3.summary()

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_14 (LSTM)               (None, 40, 2000)          16016000  
_________________________________________________________________
dropout_14 (Dropout)         (None, 40, 2000)          0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 40, 2000)          32008000  
_________________________________________________________________
dropout_15 (Dropout)         (None, 40, 2000)          0         
_________________________________________________________________
lstm_16 (LSTM)               (None, 2000)              32008000  
_________________________________________________________________
dropout_16 (Dropout)         (None, 2000)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 71)               

In [40]:
# Model 4: two stacked 3000-unit LSTM layers, each followed by Dropout(0.2),
# ending in a softmax layer with one output per column of Y_modified.
model_4 = Sequential([
    LSTM(3000, input_shape=X_modified.shape[1:], return_sequences=True),
    Dropout(0.2),
    LSTM(3000),
    Dropout(0.2),
    Dense(Y_modified.shape[1], activation='softmax'),
])
model_4.compile(optimizer='adam', loss='categorical_crossentropy')
model_4.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_17 (LSTM)               (None, 40, 3000)          36024000  
_________________________________________________________________
dropout_17 (Dropout)         (None, 40, 3000)          0         
_________________________________________________________________
lstm_18 (LSTM)               (None, 3000)              72012000  
_________________________________________________________________
dropout_18 (Dropout)         (None, 3000)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 71)                213071    
Total params: 108,249,071
Trainable params: 108,249,071
Non-trainable params: 0
_________________________________________________________________


### Generamos el texto

In [54]:
def generate_text(model, string_id, n_chars=100):
    """Extend a seed sequence with characters predicted by ``model``.

    Decoding is greedy: at every step the highest-scoring index of the model
    output (``np.argmax``) is appended, then the input window slides forward by
    one position so its length stays constant.

    Relies on two notebook-level globals: ``n_to_char`` (integer code ->
    character) and ``characters`` (the vocabulary, whose length is used to
    scale the integer codes before prediction).

    Args:
        model: Object exposing ``predict(x, verbose=0)``. It is called with an
            array of shape ``(1, len(string_id), 1)`` holding the scaled codes.
        string_id: Sequence of integer character codes used as the seed.
            It is copied, never modified.
        n_chars: Number of characters to generate. Defaults to 100, the value
            that used to be hard-coded.

    Returns:
        str: The decoded seed followed by the ``n_chars`` generated characters.
    """
    # A shallow copy is enough: the seed is a flat sequence of integers.
    window = list(string_id)
    generated = [n_to_char[value] for value in window]
    scale = float(len(characters))  # loop-invariant, hoisted out of the loop

    for _ in range(n_chars):
        x = np.reshape(window, (1, len(window), 1)) / scale
        pred_index = int(np.argmax(model.predict(x, verbose=0)))
        generated.append(n_to_char[pred_index])

        # Slide the window: push the prediction, drop the oldest code.
        window.append(pred_index)
        window.pop(0)

    return "".join(generated)

In [55]:
# Generate text with model_1, seeded with the training window X[10].
# NOTE(review): no `model_1.fit(...)` call is visible in this notebook — confirm
# the model was actually trained before this cell runs.
model_1_results = generate_text(model_1, X[10])

In [56]:
# Generate text with model_2, seeded with the training window X[10].
# NOTE(review): no `model_2.fit(...)` call is visible in this notebook — confirm
# the model was actually trained before this cell runs.
model_2_results = generate_text(model_2, X[10])

In [57]:
# Generate text with model_3, seeded with the training window X[10].
# NOTE(review): no `model_3.fit(...)` call is visible in this notebook — confirm
# the model was actually trained before this cell runs.
model_3_results = generate_text(model_3, X[10])

In [58]:
# Generate text with model_4, seeded with the training window X[10].
# NOTE(review): no `model_4.fit(...)` call is visible in this notebook — confirm
# the model was actually trained before this cell runs.
model_4_results = generate_text(model_4, X[10])

### Imprimimos los resultados

In [59]:
# Decode the seed window X[10] back into characters and print it, so the seed
# can be told apart from the generated continuation in the model outputs.
t = ''.join(map(n_to_char.get, X[10]))
print(t)

ieve in affordable health care, keep up 


In [60]:
# Show the seed text followed by the characters generated by model_1.
print(model_1_results)

ieve in affordable health care, keep up gggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg


In [61]:
# Show the seed text followed by the characters generated by model_2.
print(model_2_results)

ieve in affordable health care, keep up 5555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555555


In [62]:
# Show the seed text followed by the characters generated by model_3.
print(model_3_results)

ieve in affordable health care, keep up TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT


In [63]:
# Show the seed text followed by the characters generated by model_4.
print(model_4_results)

ieve in affordable health care, keep up FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF


# Conclusiones:
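He intentado mejorar los modelos, pero el resultado siempre es el mismo: cada modelo repite un único carácter. Pienso que podríamos conseguir algo decente con más tuits, ya que disponemos de muy pocos. También podríamos crear modelos más complejos, aunque llevaría mucho tiempo entrenarlos. Además, el generador elige siempre el carácter más probable (`argmax`), lo que favorece estas repeticiones; muestrear de la distribución predicha podría dar resultados más variados.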