In [2]:
import re
import pandas as pd
pd.set_option("display.max_colwidth", 200)
import numpy as np
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_csv("train_file.csv")
train = train.drop(['IDLink', 'Source', 'PublishDate', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn'], axis=1)
train.head()

Unnamed: 0,Title,Headline,SentimentTitle,SentimentHeadline
0,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemetery. President Barack Obama has laid a wreath at the Tomb of the Unknowns to honor,0.0,-0.0533
1,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit head for fixed income at Gam, discusses the China beige book and the state of the economy.",0.208333,-0.156386
2,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at Roubini Global Economics, explains why the global economy isn't facing the same conditions",-0.42521,0.139754
3,Finland GDP Expands In Q4,"Finland's economy expanded marginally in the three months ended December, after contracting in the previous quarter, preliminary figures from Statistics Finland showed Monday.",0.0,0.026064
4,"Tourism, govt spending buoys Thai economy in January","Tourism and public spending continued to boost the economy in January, in light of contraction in private consumption and exports, according to the Bank of Thailand data.",0.0,0.141084


In [4]:
test = pd.read_csv("test_file.csv")
test = test.drop(['Source', 'PublishDate', 'Topic', 'Facebook', 'GooglePlus', 'LinkedIn'], axis=1)
test.head()

Unnamed: 0,IDLink,Title,Headline
0,tFrqIR6Chj,Sliding Economy: FG fights back with N3trn TSA funds,"With the 2016 budget now passed by the National Assembly and a N3trillion war chest, the government of President Muhammadu Buhari says"
1,DVAaGErjlF,Microsoft shows how HoloLens can bring distant family members ...,A recent Microsoft Research video shows how the $3000 augmented reality system can be used to transmit 3D models of people anywhere in
2,OT9UIZm5M2,"Microsoft’s Twitter Robot Praises Hitler, Trump & Recites Racism","* Microsoft teamed with Bing to create TayTweets, an account for a robot that was designed to learn about “conversational understanding,” by having automated discussions with Twitter users, and mi..."
3,lflGp3q2Fj,Flood of Central Bank Moves Can't Get World Economy Out of Rut,Central bankers have managed to steer the world economy clear of a recession while leaving it stuck in the same rut that led to its troubles in the first place.
4,zDYG0SoovZ,USD/JPY: bears lining up on mixed U.S. economy outlook,"However, this streak of seven-day gains might end here as markets take a step back and ponder in respect of the US economy and its inflation"


In [5]:
train.shape, test.shape

((55932, 4), (37288, 3))

In [6]:
X = train['Title'].values
Y = train['SentimentTitle'].values

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [8]:
tk = Tokenizer(lower = True)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen = 100, padding = 'post')

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, Y, test_size = 0.2, random_state = 0)

In [10]:
f = open('glove.6B.100d.sample.txt')
embedd_index = {}
for line in f:
    val = line.split()
    word = val[0]
    coff = np.asarray(val[1:],dtype = 'float')
    embedd_index[word] = coff

f.close()
print('Found %s word vectors.' % len(embedd_index))

Found 100 word vectors.


In [14]:
index_of_words = tk.word_index
embed_num_dims= 100
embedding_matrix = np.zeros((len(index_of_words) + 1, embed_num_dims))

tokens = []
labels = []

for word,i in index_of_words.items():
    temp = embedd_index.get(word)
    if temp is not None:
        embedding_matrix[i] = temp

In [19]:
from keras.layers import Dense , Flatten ,Embedding,Input
#Embedding layer before the actual BLSTM 
embedd_layer = Embedding(len(index_of_words) + 1 , embed_num_dims , input_length = 100 , weights = [embedding_matrix])

In [20]:
batch_size = 64
X_train1 = X_train[batch_size:]
y_train1 = y_train[batch_size:]
X_valid = X_train[:batch_size]
y_valid = y_train[:batch_size]

In [27]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.layers import Bidirectional, GlobalMaxPooling1D
vocabulary_size = len(tk.word_counts.keys())+1
max_words = 100
embedding_size = 32
model = Sequential()
model.add(embedd_layer)
model.add(Bidirectional(LSTM(30 , return_sequences = True , dropout = 0.1 , recurrent_dropout = 0.1)))
model.add(GlobalMaxPooling1D())
model.add(Dense(30,activation = 'relu'))
model.add(Dropout(0.2))
model.add(Dense(1,activation = 'sigmoid'))

from keras import metrics

model.compile(loss = 'mean_squared_error',
              optimizer = 'sgd',
              metrics=[metrics.mae])

In [28]:
model.fit(X_train1, y_train1, validation_data = (X_valid, y_valid), batch_size = batch_size, epochs = 10)

Train on 44681 samples, validate on 64 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1341439b0>

In [30]:
scores = model.evaluate(X_test, y_test, verbose = 0)

In [31]:
X = test['Title'].values
tk = Tokenizer(lower = True)
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen = 100, padding = 'post')

In [32]:
title_predict = model.predict(X_pad, verbose = 0)
print(title_predict)

[[0.00554429]
 [0.00196987]
 [0.01750842]
 ...
 [0.00491612]
 [0.00194016]
 [0.00328234]]


In [42]:
final = pd.DataFrame()
final['IDLink'] = test['IDLink']
final['SentimentTitle'] = title_predict
final.to_csv('result.csv',index=False)