In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def addDir(dir1, dataFrame1, label):
    from pathlib import Path
    pathlist = Path(dir1).glob('**/*.txt')
    c = 0
    for path in pathlist:
        contents = Path(path).read_text(encoding="utf-8")
        dataFrame1 = dataFrame1.append({ 'text':contents, 'sentiment':label}, ignore_index=True)
        c = c + 1
    print(c)
    return dataFrame1

In [3]:
#def function (thePath, the)
#'D:/classA/dest_parsed_fox/'
import glob
import common_utils
config = common_utils.get_config()

dataFrame = pd.DataFrame(columns=('text', 'sentiment'))
for dirname in glob.glob(config['data_dir']+'/conservative/*'):
    print(dirname)
    dataFrame = addDir(dirname, dataFrame, 'conservative')

for dirname in glob.glob(config['data_dir']+'/liberal/*'):
    print(dirname)    
    dataFrame = addDir(dirname, dataFrame, 'liberal')



/home/ishay/projects/NLP1/data//conservative/national_review
4834
/home/ishay/projects/NLP1/data//conservative/westernjournal
4549
/home/ishay/projects/NLP1/data//conservative/fox
7681
/home/ishay/projects/NLP1/data//liberal/demo
4246
/home/ishay/projects/NLP1/data//liberal/nytimes
3092
/home/ishay/projects/NLP1/data//liberal/politico_parsed
7404
/home/ishay/projects/NLP1/data//liberal/think_progress
8163


In [4]:
dataFrame.head()

Unnamed: 0,text,sentiment
0,Trump's Iran Confrontation & Miscalculation Ri...,conservative
1,Donald Trump’s DACA Decision: Here’s What Repu...,conservative
2,State of the Union Forced Mainstream Media to ...,conservative
3,Trump & Foreign Policy -- Ideology Is Not His ...,conservative
4,Trump Warns North Korea of 'Fire and Fury Like...,conservative


In [5]:
data = dataFrame
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

print("Num of conservative records:", data[ data['sentiment'] == 'conservative'].size)
print("Num of liberal records:",      data[ data['sentiment'] == 'liberal'].size)


    


Num of conservative records: 34128
Num of liberal records: 45810


In [6]:
data.head()

Unnamed: 0,text,sentiment
0,trumps iran confrontation miscalculation risk...,conservative
1,donald trumps daca decision heres what republi...,conservative
2,state of the union forced mainstream media to ...,conservative
3,trump foreign policy ideology is not his gui...,conservative
4,trump warns north korea of fire and fury like ...,conservative


In [7]:
#remove site identifiers
for idx,row in data.iterrows():
    row[0] = row[0].replace('fox',' ')
    row[0] = row[0].replace('politico',' ')
    row[0] = row[0].replace('national review',' ')
    row[0] = row[0].replace('westernjournal',' ')
    row[0] = row[0].replace('western journal',' ')
    row[0] = row[0].replace('democracy now',' ')
    row[0] = row[0].replace('the new york times',' ')
    row[0] = row[0].replace('thinkprogress',' ')
    row[0] = row[0].replace('western journal',' ')

In [8]:
from timeit import default_timer

start_time = default_timer()
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)
print("Done tokenizer in %d seconds"%(default_timer()-start_time))


Done tokenizer in 34 seconds


In [9]:
X.shape

(39969, 22402)

In [10]:
embed_dim = 128
lstm_out = 96

model = Sequential()
model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 22402, 128)        256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 22402, 128)        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 96)                86400     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 194       
Total params: 342,594
Trainable params: 342,594
Non-trainable params: 0
_________________________________________________________________
None


In [11]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(26779, 22402) (26779, 2)
(13190, 22402) (13190, 2)


In [12]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 3, batch_size=batch_size, verbose = 1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f2d771b32b0>

In [13]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.16
acc: 0.94


In [14]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(X_validate)):
    
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]),batch_size=1,verbose = 2)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        if np.argmax(Y_validate[x]) == 0:
            neg_correct += 1
        else:
            pos_correct += 1
       
    if np.argmax(Y_validate[x]) == 0:
        neg_cnt += 1
    else:
        pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%")
print("neg_acc", neg_correct/neg_cnt*100, "%")

pos_acc 91.00467289719626 %
neg_acc 96.58385093167702 %
