#### Sentiment Analysis Deeplearning 1

In [1]:
import pickle
import gensim
import numpy as np
import nltk
import re
import os 
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize,sent_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import math

#nltk.download('stopwords')
#nltk.download("wordnet")

##### 1.Prepare data

In [2]:
labeled_news1 = pd.read_csv('Full-Economic-News-DFE-839861.csv',encoding = 'ISO-8859-1')
train_data1 = labeled_news1.loc[labeled_news1.relevance == 'yes']
train_data1 = train_data1[['text','positivity','positivity:confidence']]
print(len(labeled_news1))
print(len(train_data1))

8000
1420


In [3]:
labeled_news2 = pd.read_csv('us-economic-newspaper.csv',encoding = 'ISO-8859-1')
train_data2 = labeled_news2.loc[labeled_news2.relevance == 'yes']
train_data2 = train_data2[['text','positivity','positivity:confidence']]
print(len(labeled_news2))
print(len(train_data2))

5015
2901


In [4]:
train_data = pd.concat([train_data1, train_data2])
print(len(train_data))

4321


In [5]:
# Drop missing values
train_data.dropna(inplace=True)
print (train_data.shape)

(4319, 3)


In [6]:
# Remove any 'neutral' ratings equal to 5
train_data = train_data.loc[train_data['positivity'] != 5]

# Encode 6-9s as 1 (rated positively)
# Encode 1-4s as 0 (rated poorly)
train_data['Positively Rated'] = np.where(train_data['positivity'] > 5, 1, 0)

In [7]:
#The numbers of positive rating and negative rating are quite even
train_data['Positively Rated'].mean()

0.3888

- clean data

In [12]:
def process_text( raw_news, lemmatizer, stopw ):
    '''
    Function to convert a raw news to a string of words
    The input is a single string (a raw news), and 
    the output is a single string (a preprocessed news)
    '''
    from bs4 import BeautifulSoup  
    
    # Remove HTML
    news_text = BeautifulSoup(raw_news,"lxml").get_text() 

    # Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", news_text) 
    # Tokenize and clean bag of words:
    tokens = word_tokenize(letters_only.lower())
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    #tokens = [t for t in tokens if t not in stopw]
    #tokens = [t for t in tokens if len(t)>1]
    
    return tokens   

In [13]:
from nltk.corpus import stopwords
stopw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#Apply this to the dataframe
train_data['processed_text'] = train_data['text'].apply(lambda t: process_text(t, lemmatizer, stopw))

In [14]:
train_data.head(2)

Unnamed: 0,text,positivity,positivity:confidence,Positively Rated,processed_text
0,NEW YORK -- Yields on most certificates of dep...,3.0,0.64,0,"[new, york, yield, on, most, certificate, of, ..."
4,NEW YORK -- Indecision marked the dollar's ton...,3.0,0.3257,0,"[new, york, indecision, marked, the, dollar, s..."


In [15]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_text'], 
                                                    train_data['Positively Rated'], 
                                                    random_state=4)

#### Word2Vec and Doc2Vec Approach

In [23]:
## use pretrained w2v
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [24]:
use_google = True

if use_google:
    news_w2v = KeyedVectors.load_word2vec_format(os.path.join('pre_trained_w2v','GoogleNews-vectors-negative300.bin'), binary=True)
else:
    news_w2v = Word2Vec.load(os.path.join('pre_trained_w2v','imf_160.w2v'))

In [25]:
news_w2v.wv.most_similar('good',topn=10)

[('great', 0.7291508913040161),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889115571975708),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836091876029968),
 ('excellent', 0.6442928910255432),
 ('fantastic', 0.6407778859138489),
 ('better', 0.6120728850364685),
 ('solid', 0.5806034803390503),
 ('lousy', 0.5764203071594238)]

In [26]:
#Build word vector for training set by using the average value of all word vectors in the news, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += news_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [27]:
from sklearn.preprocessing import scale

n_dim = 300

train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train])
train_vecs = scale(train_vecs)

#Build test news vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test])
test_vecs = scale(test_vecs)

In [28]:
# Try Logistic Regression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

modellr = LogisticRegression()
modellr.fit(train_vecs, y_train)

predictions = modellr.predict_proba(test_vecs)[:,1]
exact_predict = modellr.predict(test_vecs)

print('AUC: ', roc_auc_score(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, exact_predict))

AUC:  0.70302345157
Accuracy:  0.683368869936


## Use keras

In [29]:
import tensorflow as tf 
import keras
print(tf.__version__)

1.3.0


Using TensorFlow backend.


In [30]:
Dense = keras.layers.Dense
Dropout = keras.layers.Dropout

In [31]:
def build_model():
    model = keras.models.Sequential()
    model.add(Dense(units=200,activation='relu',input_dim=300))
    model.add(Dropout(0.3))
    model.add(Dense(units=50,activation='relu',input_dim=200))
    model.add(Dropout(0.3))
    ## output layer
    model.add(Dense(units=2,activation='softmax', input_dim=50))
    model.compile(loss='categorical_crossentropy',
              optimizer='sgd',metrics=['accuracy'])
    
    return model
    

In [32]:
y_train_one_hot = keras.utils.to_categorical(y_train,num_classes=None)

In [33]:
model = build_model()
model.fit(train_vecs, y_train_one_hot, epochs=500, batch_size=32,verbose=True)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 83/500
Epoch 84/500
Epoch 85/500
Epoch 86/500
Epoch 87/500
Epoch 88/500
Epoch 89/500
Epoch 90/500
Epoch 91/500
Epoch 92/500
Epoch 93/500
Epoch 94/500
Epoch 95/500
Epoch 96/500
Epoch 97/500
Epoch 98/500
Epoch 99/500
Epoch 100/500
Epoch 101/500
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155

Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/500
Epoch 169/500
Epoch 170/500
Epoch 171/500
Epoch 172/500
Epoch 173/500
Epoch 174/500
Epoch 175/500
Epoch 176/500
Epoch 177/500
Epoch 178/500
Epoch 179/500
Epoch 180/500
Epoch 181/500
Epoch 182/500
Epoch 183/500
Epoch 184/500
Epoch 185/500
Epoch 186/500
Epoch 187/500
Epoch 188/500
Epoch 189/500
Epoch 190/500
Epoch 191/500
Epoch 192/500
Epoch 193/500
Epoch 194/500
Epoch 195/500
Epoch 196/500
Epoch 197/500
Epoch 198/500
Epoch 199/500
Epoch 200/500
Epoch 201/500
Epoch 202/500
Epoch 203/500
Epoch 204/500
Epoch 205/500
Epoch 206/500
Epoch 207/500
Epoch 208/500
Epoch 209/500
Epoch 210/500
Epoch 211/500
Epoch 212/500
Epoch 213/500
Epoch 214/500
Epoch 215/500
Epoch 216/500
Epoch 217/500
Epoch 218/500
Epoch 219/500
Epoch 220/500
Epoch 221/500
Epoch 222/500
Epoch 223/500
Epoch 224/500
Epoch 225/500
Epoch 226/500
Epoch 227/500
Epoch 228/500
Epoch 229/500
Epoch 230/500
Epoch 231/500
Epoch 232/500
Epoch 233/500
Epoch 

Epoch 243/500
Epoch 244/500
Epoch 245/500
Epoch 246/500
Epoch 247/500
Epoch 248/500
Epoch 249/500
Epoch 250/500
Epoch 251/500
Epoch 252/500
Epoch 253/500
Epoch 254/500
Epoch 255/500
Epoch 256/500
Epoch 257/500
Epoch 258/500
Epoch 259/500
Epoch 260/500
Epoch 261/500
Epoch 262/500
Epoch 263/500
Epoch 264/500
Epoch 265/500
Epoch 266/500
Epoch 267/500
Epoch 268/500
Epoch 269/500
Epoch 270/500
Epoch 271/500
Epoch 272/500
Epoch 273/500
Epoch 274/500
Epoch 275/500
Epoch 276/500
Epoch 277/500
Epoch 278/500
Epoch 279/500
Epoch 280/500
Epoch 281/500
Epoch 282/500
Epoch 283/500
Epoch 284/500
Epoch 285/500
Epoch 286/500
Epoch 287/500
Epoch 288/500
Epoch 289/500
Epoch 290/500
Epoch 291/500
Epoch 292/500
Epoch 293/500
Epoch 294/500
Epoch 295/500
Epoch 296/500
Epoch 297/500
Epoch 298/500
Epoch 299/500
Epoch 300/500
Epoch 301/500
Epoch 302/500
Epoch 303/500
Epoch 304/500
Epoch 305/500
Epoch 306/500
Epoch 307/500
Epoch 308/500
Epoch 309/500
Epoch 310/500
Epoch 311/500
Epoch 312/500
Epoch 313/500
Epoch 

Epoch 323/500
Epoch 324/500
Epoch 325/500
Epoch 326/500
Epoch 327/500
Epoch 328/500
Epoch 329/500
Epoch 330/500
Epoch 331/500
Epoch 332/500
Epoch 333/500
Epoch 334/500
Epoch 335/500
Epoch 336/500
Epoch 337/500
Epoch 338/500
Epoch 339/500
Epoch 340/500
Epoch 341/500
Epoch 342/500
Epoch 343/500
Epoch 344/500
Epoch 345/500
Epoch 346/500
Epoch 347/500
Epoch 348/500
Epoch 349/500
Epoch 350/500
Epoch 351/500
Epoch 352/500
Epoch 353/500
Epoch 354/500
Epoch 355/500
Epoch 356/500
Epoch 357/500
Epoch 358/500
Epoch 359/500
Epoch 360/500
Epoch 361/500
Epoch 362/500
Epoch 363/500
Epoch 364/500
Epoch 365/500
Epoch 366/500
Epoch 367/500
Epoch 368/500
Epoch 369/500
Epoch 370/500
Epoch 371/500
Epoch 372/500
Epoch 373/500
Epoch 374/500
Epoch 375/500
Epoch 376/500
Epoch 377/500
Epoch 378/500
Epoch 379/500
Epoch 380/500
Epoch 381/500
Epoch 382/500
Epoch 383/500
Epoch 384/500
Epoch 385/500
Epoch 386/500
Epoch 387/500
Epoch 388/500
Epoch 389/500
Epoch 390/500
Epoch 391/500
Epoch 392/500
Epoch 393/500
Epoch 

Epoch 403/500
Epoch 404/500
Epoch 405/500
Epoch 406/500
Epoch 407/500
Epoch 408/500
Epoch 409/500
Epoch 410/500
Epoch 411/500
Epoch 412/500
Epoch 413/500
Epoch 414/500
Epoch 415/500
Epoch 416/500
Epoch 417/500
Epoch 418/500
Epoch 419/500
Epoch 420/500
Epoch 421/500
Epoch 422/500
Epoch 423/500
Epoch 424/500
Epoch 425/500
Epoch 426/500
Epoch 427/500
Epoch 428/500
Epoch 429/500
Epoch 430/500
Epoch 431/500
Epoch 432/500
Epoch 433/500
Epoch 434/500
Epoch 435/500
Epoch 436/500
Epoch 437/500
Epoch 438/500
Epoch 439/500
Epoch 440/500
Epoch 441/500
Epoch 442/500
Epoch 443/500
Epoch 444/500
Epoch 445/500
Epoch 446/500
Epoch 447/500
Epoch 448/500
Epoch 449/500
Epoch 450/500
Epoch 451/500
Epoch 452/500
Epoch 453/500
Epoch 454/500
Epoch 455/500
Epoch 456/500
Epoch 457/500
Epoch 458/500
Epoch 459/500
Epoch 460/500
Epoch 461/500
Epoch 462/500
Epoch 463/500
Epoch 464/500
Epoch 465/500
Epoch 466/500
Epoch 467/500
Epoch 468/500
Epoch 469/500
Epoch 470/500
Epoch 471/500
Epoch 472/500
Epoch 473/500
Epoch 

Epoch 483/500
Epoch 484/500
Epoch 485/500
Epoch 486/500
Epoch 487/500
Epoch 488/500
Epoch 489/500
Epoch 490/500
Epoch 491/500
Epoch 492/500
Epoch 493/500
Epoch 494/500
Epoch 495/500
Epoch 496/500
Epoch 497/500
Epoch 498/500
Epoch 499/500
Epoch 500/500


<keras.callbacks.History at 0x7fb84194e390>

In [34]:
y_test_one_hot = keras.utils.to_categorical(y_test,num_classes=None)
loss_and_metrics = model.evaluate(test_vecs, y_test_one_hot,batch_size=938)



In [35]:
loss_and_metrics

[1.94415283203125, 0.69509601593017578]

## using tensorflow

In [37]:
n_input = 300
n_classes = 2 
learning_rate = 0.005
training_epochs = 300
batch_size = 32
keep_rate = 0.4

save_file = 'ckpt/train_model.ckpt' 

In [39]:
tf.reset_default_graph()

## keep probability for drop out
keep_prob = tf.placeholder(tf.float32)

## input layer 
x = tf.placeholder("float",[None,n_input])      ## because out data is in grey scale, so only has 1 channel 
y = tf.placeholder("float",[None,n_classes])


## hiden layer
layer1 = tf.layers.dense(inputs=x, units=200,activation=tf.nn.relu)
layer1 = tf.nn.dropout(layer1,keep_prob) 
layer2 = tf.layers.dense(inputs=layer1, units=100)
layer2 = tf.nn.dropout(layer2,keep_prob) 

## output layer 
logits = tf.layers.dense(inputs=layer2, units=2)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=y))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

## calculate accuracy
pred_probas = tf.nn.softmax(logits)
pred_classes = tf.argmax(logits, axis=1)

correct_prediction = tf.equal(pred_classes,tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))


In [40]:
def get_batches(X, Y, batch_size):
    # Get the batch size and number of batches we can make
    n_batches = len(X)//batch_size     ## only keep the integer number 
    for n in range(0, n_batches):
        # The features
        x = np.array(X[n*batch_size:n*batch_size+batch_size])
        # The targets, shifted by one
        y = np.array(Y[n*batch_size:n*batch_size+batch_size])
        yield x, y
    if len(X)//batch_size>0:
        x = np.array(X[(n_batches-1)*batch_size:])
        y = np.array(Y[(n_batches-1)*batch_size:])
        yield x,y

In [41]:
batches = list(get_batches(train_vecs, y_train_one_hot, batch_size))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
train_acc_list = []
valid_acc_list = []
loss_list = []


with tf.Session() as sess:
    sess.run(init)                           ## run initializer
    ## train cycles 
    epoch = None
    valid_accuracy = None
    #epoch_pbar = tqdm(range(training_epochs), desc='Epoch: {}, Validation Accuracy: {}'.format(epoch, valid_accuracy), unit='epoches')
    for epoch in range(training_epochs):   ## use tqdm for process bar 
        total_batch = len(train_vecs)//batch_size
        ## loop over all batches 
        for i in range(total_batch):
            batch_x, batch_y = batches[i]
            _,loss = sess.run([train_op,cost], feed_dict={x: batch_x, y: batch_y,keep_prob: keep_rate})
    
        # Calculate Training and Validation accuracy
        training_accuracy = sess.run(accuracy,feed_dict={
                    x: train_vecs,
                    y: y_train_one_hot,
                    keep_prob: 1.0})
        
        validation_accuracy = sess.run(accuracy, feed_dict={
                    x: test_vecs,
                    y: y_test_one_hot,
                    keep_prob: 1.0})
        # Log accuracy
        loss_list.append(loss)
        train_acc_list.append(training_accuracy)
        valid_acc_list.append(validation_accuracy)
                               
        ## print status for every 10 epochs 
        if epoch % 50 == 0:
            print('Epoch {:<3} - Cost: {:.6f}, Training Accuracy: {:.3f}, Validation Accuracy: {:.3f}'.format(
                epoch,
                loss,
                training_accuracy,
                validation_accuracy))
            
    ####################################
    ## save the model for future use####
    ####################################
    saver.save(sess, save_file)
    print('Trained Model Saved.')



Epoch 0   - Cost: 0.740906, Training Accuracy: 0.683, Validation Accuracy: 0.679
Epoch 50  - Cost: 0.099261, Training Accuracy: 0.878, Validation Accuracy: 0.670
Epoch 100 - Cost: 0.272123, Training Accuracy: 0.974, Validation Accuracy: 0.677
Epoch 150 - Cost: 0.508899, Training Accuracy: 0.981, Validation Accuracy: 0.701
Epoch 200 - Cost: 0.428218, Training Accuracy: 0.991, Validation Accuracy: 0.686
Epoch 250 - Cost: 0.015576, Training Accuracy: 0.995, Validation Accuracy: 0.684
Trained Model Saved.


### reload graph for inference 

In [42]:
#graph = tf.Graph()
sess = tf.Session() 
new_saver = tf.train.import_meta_graph(save_file+'.meta')
new_saver.restore(sess, save_file)

INFO:tensorflow:Restoring parameters from ckpt/train_model.ckpt


INFO:tensorflow:Restoring parameters from ckpt/train_model.ckpt


In [43]:
acc = sess.run(accuracy,feed_dict={
            x: train_vecs,
            y: y_train_one_hot,
            keep_prob: 1.0})

In [44]:
acc

0.99253196

In [45]:
acc = sess.run(accuracy,feed_dict={
            x: test_vecs,
            y: y_test_one_hot,
            keep_prob: 1.0})

In [46]:
acc

0.68230277