In [1]:
## data processsing
import pandas as pd
import numpy as np
import re
import pickle
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import *

from nltk import ngrams

In [2]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
np.random.seed(7)

import nltk

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

def remove_stopwords(X):
    word_tokens = word_tokenize(X)
    stop_words = pd.read_csv("data/stop_words.csv")["stop word"].tolist()
    X = [w for w in word_tokens if not w in stop_words]
    X = ' '.join(X)
    return X
        
def stemming(X):
    stemmer = PorterStemmer()
    word_tokens = word_tokenize(X)
    X = [stemmer.stem(w) for w in word_tokens]
    X = ' '.join(X)
    return X

def remove_noun(X):
    X = nltk.tag.pos_tag(X.split())
    X = [word for word, tag in X if tag != 'NNP' and tag != 'NNPS' and tag != 'NN']
    X = ' '.join(X)
    return X

def cleaning(X):    
    X = [re.sub(r'[^\x00-\x7F]+','', twit) for twit in X]
    X = [re.sub(r"^\d+\s|\s\d+\s|\s\d+$",'', twit) for twit in X]
    X = [re.sub(r'http\S+', '', twit) for twit in X]
    X = [re.sub(r'#', '', twit) for twit in X]
    X = [re.sub(r'@', '', twit) for twit in X]
    X = [twit.lower() for twit in X]
    X = [remove_stopwords(twit) for twit in X]
    return X

def two_grams(twit):
    n = 2
    tokens = ngrams(twit.split(), n)
    X = [token for token in tokens]
    return X
def unit_grams(twit):
    n = 1
    tokens = ngrams(twit.split(), n)
    X = [token for token in tokens]
    return X
def ngrams_features(X):
    X_1 = [unit_grams(twit) for twit in X] 
    X_2 = [two_grams(twit) for twit in X]
    return X_1, X_2

def find_dict(X, top_words):
    ### use NLTK ######
    # Tokenize
    # every element in X will be transfered to a feature vector, 
    # each element in the feature vector represents the occrence of a word
    # the dimenson of the vector equals to the top_word.
    all_words = []
    for words in X:
        all_words = all_words + words
    all_words = nltk.FreqDist(all_words)
    print("the size of the vocab is: ", len(all_words))
    word_features = list(all_words.keys())[:top_words]
    dict_words = {}
    for count, word in enumerate(word_features):
        dict_words[word] = count
    return dict_words

def find_features(twit, dict_words):
    features = []
    for word in twit:
        if word in dict_words:
            features.append(dict_words[word])
    return features

def text2features(X_raw):
    X_text = cleaning(X_raw)
    X_1, X_2 = ngrams_features(X_text)
    X_train = X_1
    dict_words = find_dict(X_train, vocab_size)
    print("the dictionary is", take(10, dict_words.items()))
    X_train = [find_features(twit, dict_words) for twit in X_train] 
    i = 10
    print(f"the {i+1}th sample is: ", X_train[i])
    return X_train, dict_words

def discrete_sentiment(y):
    y_d = []
    for label in y:
        if label < -0.20:
            y_d.append(0)
        elif label < 0.25:
            y_d.append(1)
        else:
            y_d.append(2)
    return y_d

Using TensorFlow backend.


In [3]:
##############################################
##############################################
#####    prepare the data ##########
##############################################
##############################################

import json
with open('data/Headline_Trainingdata.json') as f:
    data = json.load(f)
vocab_size = 2500
df_raw = pd.DataFrame(data=data)
df_raw.sample(frac=1)
company_name = df_raw["company"].tolist()
X_raw = df_raw["title"].tolist()
y_raw = df_raw["sentiment"].tolist()
X_raw = [twit.replace(company_name[i],  '') for i, twit in enumerate(X_raw)]
X_train = list(text2features(X_raw)[0])
y_train = discrete_sentiment(y_raw)
from collections import Counter
print("training labels:", Counter(y_train))

y_encoded = np.zeros((len(y_train), 3),  dtype=int)
y_encoded[np.asarray(y_train) == 0, 0] = 1
y_encoded[np.asarray(y_train) == 1, 1] = 1
y_encoded[np.asarray(y_train) == 2, 2] = 1

max_review_length = 14
num_test = 150
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)

X_test = X_train[:num_test]
X = X_train[num_test:]
y_test = y_encoded[:num_test]
y = y_encoded[num_test:]

the size of the vocab is:  2647
the dictionary is [(('book',), 0), (('second',), 1), (('consecutive',), 2), (('quarter',), 3), (('sales',), 4), (('growth',), 5), (('posts',), 6), (('drop',), 7), (('first-quarter',), 8), (('organic',), 9)]
the 11th sample is:  [19, 62, 30, 63, 64, 65, 66, 67, 68]
training labels: Counter({2: 401, 1: 399, 0: 342})


In [4]:
from keras.layers import Input, Dense, concatenate, Activation, Dropout
from keras.models import Model
from keras.layers import MaxPooling1D, Conv1D, GlobalMaxPooling1D
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score

tweet_input = Input(shape=(max_review_length,), dtype='int32')
tweet_encoder = Embedding(vocab_size, 100, input_length=max_review_length)(tweet_input)

unigram_branch = Conv1D(filters=50, kernel_size=1, padding='valid', activation='relu', strides=1)(tweet_encoder)
unigram_branch = GlobalMaxPooling1D()(unigram_branch)
# unigram_branch = Dropout(0.9)(unigram_branch)
bigram_branch = Conv1D(filters=20, kernel_size=2, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
# bigram_branch = Dropout(0.9)(bigram_branch)
trigram_branch = Conv1D(filters=15, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
# trigram_branch = Dropout(0.9)(trigram_branch)
fourgram_branch = Conv1D(filters=10, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
# fourgram_branch = Dropout(0.9)(fourgram_branch)
fivegram_branch = Conv1D(filters=5, kernel_size=5, padding='valid', activation='relu', strides=1)(tweet_encoder)
fivegram_branch = GlobalMaxPooling1D()(fivegram_branch)
# fivegram_branch = Dropout(0.9)(fivegram_branch)

merged = concatenate([
    unigram_branch, 
    bigram_branch, 
    trigram_branch, 
#    fourgram_branch, 
#    fivegram_branch, 
#    sixgram_branch
], axis=1)

merged = Dense(64, activation='relu')(merged)
merged = Dropout(0.5)(merged)
merged = Dense(3)(merged)
output = Activation('softmax')(merged)
model = Model(inputs=[tweet_input], outputs=[output])
model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 14)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 14, 100)      250000      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 14, 50)       5050        embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 13, 20)       4020        embedding_1[0][0]                
____________________________________________________________________________________________

In [6]:
num_train = len(X_raw) - num_test
training_size = [
#        200, 
#        400, 
#        600, 
#        800, 
        num_train]
acc_list = []
for size in training_size:
    model.fit(X[:size], y[:size], batch_size=32, epochs=10, validation_split=0.1)
    y_predic = model.predict(X_test)
    predic = np.argmax(y_predic, 1)
    label = np.argmax(y_test, 1)

    results = (predic == label)
    accuracy = sum(results)/num_test
    print("test labels:", Counter(label))
    #print("train labels:", Counter(np.argmax(y, 1)))
    #print("all labels:", Counter(np.argmax(y_encoded, 1)))
    df_results = pd.DataFrame(data={"news":X_raw[:num_test], "label":label, "prediction":predic, "results":results})
    df_results.to_csv("results/sentiment_results_1031.csv")
    print("the accuracy for the testing data set is: ", accuracy)
    acc_list.append(accuracy)
    cm = confusion_matrix(label, predic)
    print(cm)

Train on 892 samples, validate on 100 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
test labels: Counter({1: 57, 0: 48, 2: 45})
the accuracy for the testing data set is:  0.64
[[29  8 11]
 [12 38  7]
 [ 3 13 29]]
