#### Sentiment Analysis Deeplearning 1

In [1]:
import pickle
import gensim
import numpy as np
import nltk
import re
import os 
import pandas as pd
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize,sent_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import math

#nltk.download('stopwords')
#nltk.download("wordnet")



##### 1.Prepare data

In [2]:
labeled_news1 = pd.read_csv('Full-Economic-News-DFE-839861.csv',encoding = 'ISO-8859-1')
train_data1 = labeled_news1.loc[labeled_news1.relevance == 'yes']
train_data1 = train_data1[['text','positivity','positivity:confidence']]
print(len(labeled_news1))
print(len(train_data1))

8000
1420


In [3]:
labeled_news2 = pd.read_csv('us-economic-newspaper.csv',encoding = 'ISO-8859-1')
train_data2 = labeled_news2.loc[labeled_news2.relevance == 'yes']
train_data2 = train_data2[['text','positivity','positivity:confidence']]
print(len(labeled_news2))
print(len(train_data2))

5015
2901


In [4]:
train_data = pd.concat([train_data1, train_data2])
print(len(train_data))

4321


In [5]:
# Drop missing values
train_data.dropna(inplace=True)
print (train_data.shape)

(4319, 3)


In [6]:
# Remove any 'neutral' ratings equal to 5
train_data = train_data.loc[train_data['positivity'] != 5]

# Encode 6-9s as 1 (rated positively)
# Encode 1-4s as 0 (rated poorly)
train_data['Positively Rated'] = np.where(train_data['positivity'] > 5, 1, 0)

In [7]:
#The numbers of positive rating and negative rating are quite even
train_data['Positively Rated'].mean()

0.3888

- clean data

In [8]:
def process_text( raw_news, lemmatizer, stopw ):
    '''
    Function to convert a raw news to a string of words
    The input is a single string (a raw news), and 
    the output is a single string (a preprocessed news)
    '''
    from bs4 import BeautifulSoup  
    
    # Remove HTML
    news_text = BeautifulSoup(raw_news,"lxml").get_text() 

    # Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", news_text) 
    # Tokenize and clean bag of words:
    tokens = word_tokenize(letters_only.lower())
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    #tokens = [t for t in tokens if t not in stopw]
    #tokens = [t for t in tokens if len(t)>1]
    
    return tokens   

In [9]:
from nltk.corpus import stopwords
stopw = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

#Apply this to the dataframe
train_data['processed_text'] = train_data['text'].apply(lambda t: process_text(t, lemmatizer, stopw))

In [10]:
train_data.head(2)

Unnamed: 0,text,positivity,positivity:confidence,Positively Rated,processed_text
0,NEW YORK -- Yields on most certificates of dep...,3.0,0.64,0,"[new, york, yield, on, most, certificate, of, ..."
4,NEW YORK -- Indecision marked the dollar's ton...,3.0,0.3257,0,"[new, york, indecision, marked, the, dollar, s..."


In [11]:
from sklearn.model_selection import train_test_split

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_text'], 
                                                    train_data['Positively Rated'], 
                                                    random_state=4)

#### Word2Vec and Doc2Vec Approach

In [12]:
## use pretrained w2v
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors

In [13]:
use_google = True

if use_google:
    news_w2v = KeyedVectors.load_word2vec_format(os.path.join('pre_trained_w2v','GoogleNews-vectors-negative300.bin'), binary=True)
else:
    news_w2v = Word2Vec.load(os.path.join('pre_trained_w2v','imf_160.w2v'))

In [14]:
news_w2v.wv.most_similar('good',topn=10)

[('great', 0.7291510105133057),
 ('bad', 0.7190051078796387),
 ('terrific', 0.6889116168022156),
 ('decent', 0.6837348341941833),
 ('nice', 0.6836092472076416),
 ('excellent', 0.644292950630188),
 ('fantastic', 0.6407778263092041),
 ('better', 0.6120728254318237),
 ('solid', 0.5806034803390503),
 ('lousy', 0.5764201879501343)]

In [15]:
#Build word vector for training set by using the average value of all word vectors in the news, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += news_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [16]:
from sklearn.preprocessing import scale

n_dim = 300

train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_train])
train_vecs = scale(train_vecs)

#Build test news vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in X_test])
test_vecs = scale(test_vecs)

In [17]:
# Try Logistic Regression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

modellr = LogisticRegression()
modellr.fit(train_vecs, y_train)

predictions = modellr.predict_proba(test_vecs)[:,1]
exact_predict = modellr.predict(test_vecs)

print('AUC: ', roc_auc_score(y_test, predictions))
print('Accuracy: ', accuracy_score(y_test, exact_predict))

AUC:  0.70302345157
Accuracy:  0.683368869936


## Use keras

In [18]:
import tensorflow as tf 
import keras

print(tf.__version__)

1.3.0


Using TensorFlow backend.


In [19]:
Dense = keras.layers.Dense
Dropout = keras.layers.Dropout

In [20]:
def build_model():
    model = keras.models.Sequential()
    model.add(Dense(units=200,activation='sigmoid',input_dim=300))
    model.add(Dropout(0.5))
    model.add(Dense(units=100,activation='sigmoid'))
    model.add(Dropout(0.5))
    ## output layer
    model.add(Dense(units=2,activation='softmax'))
    model.compile(loss='categorical_crossentropy',
              optimizer='sgd',metrics=['accuracy'])
    
    return model
    

In [21]:
y_train_one_hot = keras.utils.to_categorical(y_train,num_classes=None)

In [22]:
model = build_model()
model.fit(train_vecs, y_train_one_hot, epochs=400, batch_size=32,verbose=False)

<keras.callbacks.History at 0x4a203255f8>

In [23]:
loss_and_metrics = model.evaluate(train_vecs, y_train_one_hot)
loss_and_metrics



[0.54573713571553883, 0.71941678495190053]

In [24]:
y_test_one_hot = keras.utils.to_categorical(y_test,num_classes=None)
loss_and_metrics = model.evaluate(test_vecs, y_test_one_hot)
loss_and_metrics



[0.58264108520072655, 0.70149253744052165]

## using tensorflow

In [25]:
n_input = 300
n_classes = 2 
learning_rate = 0.001
training_epochs = 500
batch_size = 32
keep_rate = 0.1

save_file = 'ckpt/train_model.ckpt' 

In [26]:
tf.reset_default_graph()

## keep probability for drop out
keep_prob = tf.placeholder(tf.float32)

## input layer 
x = tf.placeholder("float",[None,n_input])      ## because out data is in grey scale, so only has 1 channel 
y = tf.placeholder("float",[None,n_classes])


## hiden layer
layer1 = tf.layers.dense(inputs=x, units=128,activation=tf.nn.sigmoid)
layer1 = tf.nn.dropout(layer1,keep_prob) 
layer2 = tf.layers.dense(inputs=layer1, units=32,activation=tf.nn.sigmoid)
layer2 = tf.nn.dropout(layer2,keep_prob) 

## output layer 
logits = tf.layers.dense(inputs=layer2, units=2)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    logits=logits, labels=y))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(cost)

## calculate accuracy
pred_probas = tf.nn.softmax(logits)
pred_classes = tf.argmax(logits, axis=1)

correct_prediction = tf.equal(pred_classes,tf.argmax(y,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))


In [27]:
def get_batches(X, Y, batch_size):
    # Get the batch size and number of batches we can make
    n_batches = len(X)//batch_size     ## only keep the integer number 
    for n in range(0, n_batches):
        # The features
        x = np.array(X[n*batch_size:n*batch_size+batch_size])
        # The targets, shifted by one
        y = np.array(Y[n*batch_size:n*batch_size+batch_size])
        yield x, y
    if len(X)//batch_size>0:
        x = np.array(X[(n_batches-1)*batch_size:])
        y = np.array(Y[(n_batches-1)*batch_size:])
        yield x,y

In [28]:
batches = list(get_batches(train_vecs, y_train_one_hot, batch_size))
init = tf.global_variables_initializer()
saver = tf.train.Saver()
train_acc_list = []
valid_acc_list = []
loss_list = []


with tf.Session() as sess:
    sess.run(init)                           ## run initializer
    ## train cycles 
    epoch = None
    valid_accuracy = None
    #epoch_pbar = tqdm(range(training_epochs), desc='Epoch: {}, Validation Accuracy: {}'.format(epoch, valid_accuracy), unit='epoches')
    for epoch in range(training_epochs):   ## use tqdm for process bar 
        total_batch = len(train_vecs)//batch_size
        ## loop over all batches 
        for i in range(total_batch):
            batch_x, batch_y = batches[i]
            _,loss = sess.run([train_op,cost], feed_dict={x: batch_x, y: batch_y,keep_prob: keep_rate})
    
        # Calculate Training and Validation accuracy
        training_accuracy = sess.run(accuracy,feed_dict={
                    x: train_vecs,
                    y: y_train_one_hot,
                    keep_prob: 1.0})
        
        validation_accuracy = sess.run(accuracy, feed_dict={
                    x: test_vecs,
                    y: y_test_one_hot,
                    keep_prob: 1.0})
        # Log accuracy
        loss_list.append(loss)
        train_acc_list.append(training_accuracy)
        valid_acc_list.append(validation_accuracy)
                               
        ## print status for every 10 epochs 
        if epoch % 50 == 0:
            print('Epoch {:<3} - Cost: {:.6f}, Training Accuracy: {:.3f}, Validation Accuracy: {:.3f}'.format(
                epoch,
                loss,
                training_accuracy,
                validation_accuracy))
            
    ####################################
    ## save the model for future use####
    ####################################
    saver.save(sess, save_file)
    print('Trained Model Saved.')



Epoch 0   - Cost: 1.273625, Training Accuracy: 0.604, Validation Accuracy: 0.633
Epoch 50  - Cost: 0.618965, Training Accuracy: 0.604, Validation Accuracy: 0.633
Epoch 100 - Cost: 0.548313, Training Accuracy: 0.729, Validation Accuracy: 0.716
Epoch 150 - Cost: 0.541274, Training Accuracy: 0.744, Validation Accuracy: 0.713
Epoch 200 - Cost: 0.554368, Training Accuracy: 0.762, Validation Accuracy: 0.714
Epoch 250 - Cost: 0.534694, Training Accuracy: 0.768, Validation Accuracy: 0.714
Epoch 300 - Cost: 0.560570, Training Accuracy: 0.781, Validation Accuracy: 0.710
Epoch 350 - Cost: 0.635540, Training Accuracy: 0.789, Validation Accuracy: 0.710
Epoch 400 - Cost: 0.548678, Training Accuracy: 0.793, Validation Accuracy: 0.707
Epoch 450 - Cost: 0.448937, Training Accuracy: 0.798, Validation Accuracy: 0.705
Trained Model Saved.


### reload graph for inference 

In [34]:
#graph = tf.Graph()
sess = tf.Session() 
new_saver = tf.train.import_meta_graph(save_file+'.meta')
new_saver.restore(sess, save_file)

INFO:tensorflow:Restoring parameters from ckpt/train_model.ckpt


In [35]:
acc = sess.run(accuracy,feed_dict={
            x: train_vecs,
            y: y_train_one_hot,
            keep_prob: 1.0})

In [36]:
acc

0.80192035

In [37]:
acc = sess.run(accuracy,feed_dict={
            x: test_vecs,
            y: y_test_one_hot,
            keep_prob: 1.0})

In [38]:
acc

0.70682305