In [11]:
%load_ext autoreload
%autoreload 2
%pylab inline
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import glob
import common_utils
from numpy import array as _A
from timeit import default_timer
from pathlib import Path

config = common_utils.get_config()
import sklearn

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


In [18]:
# Load train and test data

X_train_file_name = list()
X_test_file_name = list()
Y_train = list()
Y_test = list()
labels = config['labels']
modes = ('train','test')


dataFrameTrain = pd.DataFrame(columns=('text', 'sentiment'))
dataFrameTest  = pd.DataFrame(columns=('text', 'sentiment'))
df_both = {'train': dataFrameTrain, 'test': dataFrameTest}

counters ={mode:{label:0 for label in labels} for mode in modes }

start_time = default_timer()

for mode, paths, x, y in zip(
    modes, 
    (config['train_folders'], config['test_folders']), 
    (X_train_file_name, X_test_file_name),
    (Y_train, Y_test)):
    
    for path_name in paths:
        the_label = path_name.split('/')[-2]
        files_list = list(glob.iglob(path_name+"/*.txt"))
        # print ("Mode %s, Found %d for %s"%(mode, len(files_list), path_name))
        x.extend(files_list)
        y.extend([the_label==labels[0]]*len(files_list))
        for file_path in files_list:
            contents = Path(file_path).read_text(encoding="utf-8")
            df_both[mode] = df_both[mode].append({ 'text':contents, 'sentiment':the_label}, ignore_index=True)
        
        counters[mode][the_label] += len(files_list)
        

    runtime = default_timer() - start_time
    print("Done reading data in %d seconds for paths:"%runtime, paths)
    
Y_test = _A(Y_test)
Y_train = _A(Y_train)

print("X_train_sentences", len(X_train_file_name), "X_test_sentences", len(X_test_file_name))    
print("Y_train", len(Y_train), "Y_test", len(Y_test))    
print("counters", counters)  
df_both['train'].head()

Done reading data in 41 seconds for paths: ['/home/ishay/projects/NLP1/data//conservative/fox', '/home/ishay/projects/NLP1/data//conservative/national_review', '/home/ishay/projects/NLP1/data//liberal/politico_parsed', '/home/ishay/projects/NLP1/data//liberal/think_progress']
Done reading data in 58 seconds for paths: ['/home/ishay/projects/NLP1/data//conservative/westernjournal', '/home/ishay/projects/NLP1/data//liberal/demo', '/home/ishay/projects/NLP1/data//liberal/nytimes']
X_train_sentences 28082 X_test_sentences 11887
Y_train 28082 Y_test 11887
counters {'train': {'liberal': 15567, 'conservative': 12515}, 'test': {'liberal': 7338, 'conservative': 4549}}


Unnamed: 0,text,sentiment
0,Tim Graham | Fox News $TITLE_END$ This materia...,conservative
1,Trump says immigration into Europe has 'change...,conservative
2,"Sanders projected to win Washington, Alaska, H...",conservative
3,FoxNews.com - Breaking News | Latest News | Cu...,conservative
4,"Ex-New York state Sen. Dean Skelos, son convic...",conservative


In [19]:
df_both['test'].head()

Unnamed: 0,text,sentiment
0,Trump's Approval Rating Hits Landmark High Des...,conservative
1,Ron Paul Makes Announcement About Trump... Thi...,conservative
2,"BREAKING: Trump, Sanders Combine Forces, Make ...",conservative
3,"Numbers Don't Lie, People Love Trump's Economy...",conservative
4,160 Things Trump Has Done to Make America Grea...,conservative


In [20]:
data = df_both['train']
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print("Num of conservative records:", data[ data['sentiment'] == 'conservative'].size)
print("Num of liberal records:",      data[ data['sentiment'] == 'liberal'].size)


    


Num of conservative records: 25030
Num of liberal records: 31134


In [21]:
data.head()

Unnamed: 0,text,sentiment
0,tim graham fox news title_end this material m...,conservative
1,trump says immigration into europe has changed...,conservative
2,sanders projected to win washington alaska haw...,conservative
3,foxnewscom breaking news latest news curren...,conservative
4,exnew york state sen dean skelos son convicted...,conservative


In [22]:
#remove site identifiers
for idx,row in data.iterrows():
    row[0] = row[0].replace('fox',' ')
    row[0] = row[0].replace('politico',' ')
    row[0] = row[0].replace('national review',' ')
    row[0] = row[0].replace('westernjournal',' ')
    row[0] = row[0].replace('western journal',' ')
    row[0] = row[0].replace('democracy now',' ')
    row[0] = row[0].replace('the new york times',' ')
    row[0] = row[0].replace('thinkprogress',' ')
    row[0] = row[0].replace('western journal',' ')

In [23]:
from timeit import default_timer

start_time = default_timer()
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
print("Done tokenizer in %d seconds"%(default_timer()-start_time))


Done tokenizer in 24 seconds


In [24]:
X.shape

(28082, 22332)

In [25]:
embed_dim = 128
lstm_out = 96

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 22332, 128)        256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 22332, 128)        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 96)                86400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 194       
Total params: 342,594
Trainable params: 342,594
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
Y = pd.get_dummies(data['sentiment']).values

In [None]:
batch_size = 32
start_time = default_timer()
model.fit(X, Y, epochs = 3, batch_size=batch_size, verbose = 1)
print("Done fit in %d seconds"%(default_timer()-start_time))


Epoch 1/3
  576/28082 [..............................] - ETA: 4:40:19 - loss: 0.6799 - acc: 0.6372

In [13]:
test_data = df_both['train']
X_test = tokenizer.texts_to_sequences(test_data['text'].values)
X_test = pad_sequences(X_test)

Y_test = pd.get_dummies(test_data['sentiment']).values
score, acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.16
acc: 0.94


In [14]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 91.00467289719626 %
neg_acc 96.58385093167702 %
