In [182]:
import numpy as np
import pandas as pd
import re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM,  Embedding
from keras.layers import Flatten,BatchNormalization
pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [0]:
df= pd.read_csv('/content/trump_data.csv')
df.head()

Unnamed: 0,text
0,Today we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet https://t.co/wPk7QWpK8Z
1,Busy day planned in New York. Will soon be making some very important decisions on the people who will be running our government!
2,Love the fact that the small groups of protesters last night have passion for our great country. We will all come together and be proud!
3,"Just had a very open and successful presidential election. Now professional protesters, incited by the media, are protesting. Very unfair!"
4,"A fantastic day in D.C. Met with President Obama for first time. Really good meeting, great chemistry. Melania liked Mrs. O a lot!"


###Data Cleaning

In [0]:
# supprimer les liens
df['text'] = df['text'].replace(r'http\S+', ' ', regex=True).replace(r'www\S+', '', regex=True)
#sumpprimer les mentions
df['text'] = df['text'].replace(r'@\S+', ' ', regex=True)
# supprmier les hashtags
df['text'] = df['text'].replace(r'(\s)#\w+', ' ', regex=True)
df['text'] = df.text.apply(lambda x: x.lower())
#enlever les punctuations
df['text'] = df.text.str.replace("[^\w\s]", "  ")
# enlever les caractéres spéciaux
df['text'] =  df['text'].apply(lambda x: re.sub(r'[^\x00-\x7F]+',' ', x))
# supprimer les mots d'un seul caractére
df['text'] = df.text.apply(lambda x: ' '.join([word for word in x.split() if len(word)>1]))
df['text'] = df.text.apply(lambda x: ' '.join([word for word in x.split() if len(word)>2]))
df.head()

Unnamed: 0,text
0,today express our deepest gratitude all those who have served our armed forces
1,busy day planned new york will soon making some very important decisions the people who will running our government
2,love the fact that the small groups protesters last night have passion for our great country will all come together and proud
3,just had very open and successful presidential election now professional protesters incited the media are protesting very unfair
4,fantastic day met with president obama for first time really good meeting great chemistry melania liked mrs lot


In [0]:
#remove stop words
stop = stopwords.words('english')
newStopWords = ['youve','be','can','could','must','would','us','year','day','finally','time','also','last','today','mrs','thank'
                'next','two','three','else','even','there','if','so','one','all','got','get','use','too','etc','rt','mr']
stop.extend(newStopWords)
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
df.head()

Unnamed: 0,text
0,express deepest gratitude served armed forces
1,busy planned new york soon making important decisions people running government
2,love fact small groups protesters night passion great country come together proud
3,open successful presidential election professional protesters incited media protesting unfair
4,fantastic met president obama first really good meeting great chemistry melania liked lot


In [0]:
nltk.download('words')
words = set(nltk.corpus.words.words())
# enlever les mots non anglaises
df['text'] = df.text.apply(lambda x: ' '.join([word for word in x.split() if word in words]))
df.head()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


Unnamed: 0,text
0,express gratitude armed
1,busy new york soon making important people running government
2,love fact small night passion great country come together
3,open successful presidential election professional media unfair
4,fantastic met president first really good meeting great chemistry lot


In [0]:
# Conserver les tweets qui contiennent au moins 3 mots
df["nb_words"]=df.text.apply(lambda x: len(x.split()))
df=df[df['nb_words'] >= 3]
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,text,nb_words
0,express gratitude armed,3
1,busy new york soon making important people running government,9
2,love fact small night passion great country come together,9
3,open successful presidential election professional media unfair,7
4,fantastic met president first really good meeting great chemistry lot,10
...,...,...
9634,may number act priority focus tax reform many far greater importance,11
9635,thought felt win big easily fabled knew,7
9636,enough around world without yet another president russia respect far,10
9637,terrible situation spoke governor stay safe,6


### Word Lemmatization

In [0]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()


df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in x.split()]))

In [0]:
df

Unnamed: 0,text,nb_words
0,express gratitude arm,3
1,busy new york soon make important people run government,9
2,love fact small night passion great country come together,9
3,open successful presidential election professional medium unfair,7
4,fantastic met president first really good meeting great chemistry lot,10
...,...,...
9634,may number act priority focus tax reform many far great importance,11
9635,thought felt win big easily fabled knew,7
9636,enough around world without yet another president russia respect far,10
9637,terrible situation spoke governor stay safe,6


### Extract Keywords

In [0]:
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

In [0]:
def extract_topn_from_vector(tfidf_transformer,feature_names, doc, topn=10):
    """get the feature names and tf-idf score of top n items"""
    tf_idf_vector=tfidf_transformer.transform(cv.transform([str(doc)]))
    #sort the tf-idf vectors by descending order of scores
    sorted_items=sort_coo(tf_idf_vector.tocoo())
    #use only topn items from vector
    sorted_items = sorted_items[:topn]
    
    feature_vals = list()
    for idx, score in sorted_items:
       
       feature_vals.append(feature_names[idx]) 
    #print(sorted_items[0][0])  
    #return feature_names[sorted_items[0][0]]
    return ' '.join([x for x in feature_vals])

In [0]:

docs= df['text'].values.tolist()
cv=CountVectorizer(max_df=1,stop_words=stop)
word_count_vector=cv.fit_transform([str(docs)])

In [0]:
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
tfidf_transformer.fit(word_count_vector)

TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)

In [0]:
feature_names=cv.get_feature_names()
df['keywords']= df.text.apply(lambda x: extract_topn_from_vector(tfidf_transformer,feature_names,x,3)) 

In [0]:
df

Unnamed: 0,text,nb_words,keywords
0,express gratitude arm,3,gratitude express arm
1,busy new york soon make important people run government,9,york soon run
2,love fact small night passion great country come together,9,together small passion
3,open successful presidential election professional medium unfair,7,unfair successful professional
4,fantastic met president first really good meeting great chemistry lot,10,really president met
...,...,...,...
9634,may number act priority focus tax reform many far great importance,11,tax reform priority
9635,thought felt win big easily fabled knew,7,win thought knew
9636,enough around world without yet another president russia respect far,10,yet world without
9637,terrible situation spoke governor stay safe,6,terrible stay spoke


In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['keywords'].values.tolist())
#df['sequences'] = df.keywords.apply(lambda x :tokenizer.texts_to_sequences(x))
sequences = tokenizer.texts_to_sequences(df['keywords'].values.tolist())
sequences

[[1252, 1253, 1703],
 [62, 39, 38],
 [31, 162, 647],
 [153, 236, 532],
 [45, 12, 648],
 [2, 228, 830],
 [48, 1002, 133],
 [9, 335, 29],
 [138, 237, 52],
 [7, 9, 90],
 [10, 9, 41],
 [98, 108, 1003],
 [6, 1, 256],
 [9, 14, 36],
 [9, 494, 15],
 [7, 82, 53],
 [6, 208, 2],
 [17, 39, 336],
 [83, 5, 1704],
 [6, 2, 66],
 [9, 26, 101],
 [9, 1004, 831],
 [97, 1254, 831],
 [9, 1, 36],
 [6, 26, 2],
 [39, 18, 99],
 [6, 26, 2],
 [2, 20, 15],
 [2, 18, 48],
 [6, 31, 56],
 [34, 1005, 37],
 [3, 6, 31],
 [50, 590, 101],
 [2, 51, 74],
 [26, 316, 52],
 [381, 102, 832],
 [109, 1006, 197],
 [6, 17, 591],
 [105, 2, 99],
 [6, 2, 20],
 [17, 13, 52],
 [6, 9, 2],
 [26, 52, 382],
 [101, 52],
 [90, 63, 51],
 [6, 9, 29],
 [4, 10, 42],
 [649, 317, 1007],
 [2, 51, 134],
 [1705, 209, 1008],
 [6, 2, 592],
 [265, 1255, 13],
 [82, 7, 2],
 [139, 6, 96],
 [4, 5, 59],
 [4, 32, 134],
 [139, 6, 96],
 [2, 74, 15],
 [7, 35, 66],
 [133, 2, 533],
 [383, 247, 1009],
 [109, 1706, 167],
 [67, 456, 157],
 [7, 82, 2],
 [1256, 74, 416],

In [0]:
len(sequences)

9639

### Create  the new DataFrame

In [0]:
matrice = list()
for i in range(len(sequences)-3):
  row=list()
  for j in range(4):
    for seq in sequences[i+j]:
      row.append(seq)
  matrice.append(row)
data=pd.DataFrame(matrice)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,1252,1253,1703,62,39,38,31,162,647,153,236.0,532.0
1,62,39,38,31,162,647,153,236,532,45,12.0,648.0
2,31,162,647,153,236,532,45,12,648,2,228.0,830.0
3,153,236,532,45,12,648,2,228,830,48,1002.0,133.0
4,45,12,648,2,228,830,48,1002,133,9,335.0,29.0
...,...,...,...,...,...,...,...,...,...,...,...,...
9631,161,744,90,89,1656,16,15,92,60,21,230.0,1686.0
9632,89,1656,16,15,92,60,21,230,1686,7,154.0,859.0
9633,15,92,60,21,230,1686,7,154,859,79,25.0,111.0
9634,21,230,1686,7,154,859,79,25,111,70,166.0,195.0


In [0]:
data.columns=["seq1_1","seq1_2","seq1_3","seq2_1","seq2_2","seq2_3","seq3_1","seq3_2","seq3_3","target1","target2","target3"]
data.head()

Unnamed: 0,seq1_1,seq1_2,seq1_3,seq2_1,seq2_2,seq2_3,seq3_1,seq3_2,seq3_3,target1,target2,target3
0,1252,1253,1703,62,39,38,31,162,647,153,236.0,532.0
1,62,39,38,31,162,647,153,236,532,45,12.0,648.0
2,31,162,647,153,236,532,45,12,648,2,228.0,830.0
3,153,236,532,45,12,648,2,228,830,48,1002.0,133.0
4,45,12,648,2,228,830,48,1002,133,9,335.0,29.0


In [0]:
data.target1.fillna(0, inplace=True)
data.target2.fillna(0, inplace=True)
data.target3.fillna(0, inplace=True)

In [0]:
data.describe()

Unnamed: 0,seq1_1,seq1_2,seq1_3,seq2_1,seq2_2,seq2_3,seq3_1,seq3_2,seq3_3,target1,target2,target3
count,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0,9636.0
mean,232.003113,317.386986,431.082296,232.046596,317.703715,429.749274,232.708593,318.14944,428.979763,232.946243,318.754047,423.37391
std,446.306164,527.458618,613.186475,446.085667,527.971638,612.594174,447.280904,528.289738,612.264696,446.931197,528.771508,611.295782
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,11.0,23.0,43.0,11.0,23.0,43.0,11.0,23.0,43.0,11.0,23.0,41.0
50%,55.0,95.0,162.0,56.0,95.0,161.0,56.0,95.5,161.0,56.0,96.0,154.0
75%,214.0,347.0,541.0,215.25,347.0,538.0,216.0,348.0,535.5,216.25,349.0,527.0
max,2889.0,2894.0,2893.0,2889.0,2894.0,2893.0,2889.0,2894.0,2893.0,2889.0,2894.0,2893.0


### Prepare X and y ad split data

In [0]:
X= data.drop(columns=["target1","target2","target3"])
y=data[["target1","target2","target3"]]
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import OneHotEncoder
#y=to_categorical(y, num_classes=vocab_size)
#enc = OneHotEncoder(sparse=False) 
#y=enc.fit_transform(y.values.reshape((y.shape[0]),3))

In [141]:
y.shape

(9636, 3)

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,  test_size=0.3, random_state=40)

### LSTM Model

In [0]:
vocab_size = len(tokenizer.word_index) + 1

In [0]:
from keras import backend as K
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true))) 

In [185]:

model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=9))
model.add(LSTM(1024,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128,return_sequences=False))
model.add(Dropout(0.2))
#model.add(Flatten())
model.add(Dense(3))
model.compile(loss= root_mean_squared_error , optimizer="adam",metrics=["accuracy"])
model.summary()
#

Model: "sequential_42"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_38 (Embedding)     (None, 9, 50)             144750    
_________________________________________________________________
lstm_97 (LSTM)               (None, 9, 1024)           4403200   
_________________________________________________________________
dropout_3 (Dropout)          (None, 9, 1024)           0         
_________________________________________________________________
lstm_98 (LSTM)               (None, 9, 128)            590336    
_________________________________________________________________
dropout_4 (Dropout)          (None, 9, 128)            0         
_________________________________________________________________
lstm_99 (LSTM)               (None, 9, 128)            131584    
_________________________________________________________________
dropout_5 (Dropout)          (None, 9, 128)          

In [186]:
model.fit(X_train, y_train, batch_size = 100, epochs = 20, validation_data=(X_test, y_test))


Train on 6745 samples, validate on 2891 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7ff9e4619320>