# Sentiment classification for social media - He Tianyou

In [224]:
# Import packages
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.cross_validation import train_test_split
from sklearn import linear_model, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import numpy as np
from tensorflow import keras, argmax
from tensorflow.keras.preprocessing import text, sequence
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer
#from sklearn.metrics import roc_auc_score

import re

Classifier part one

In [225]:
# Read training data
train_data = pd.read_csv('semeval-tweets/twitter-training-data.txt', sep='\\t', names=['tweet_id','sentiment','tweet_text'])
# Read dev data
dev_data = pd.read_csv('semeval-tweets/twitter-dev-data.txt', sep='\\t', names=['tweet_id','sentiment','tweet_text'])

  
  after removing the cwd from sys.path.


Text preprocessing

In [226]:
# Remove URLs. Note that URLs may appear in different forms
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'(((https?:\/\/)|(w{3}.))[\S]*)|([\w\d\/\.]*\.(com|cn|co|net|org|edu|uk|int|js|html))', '')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'(((https?:\/\/)|(w{3}.))[\S]*)|([\w\d\/\.]*\.(com|cn|co|net|org|edu|uk|int|js|html))', '')
# change all n't or n' suffixes to not e.g "won't",'wouldn',"wouldn't" into wo not
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'(n\'t|n\')', ' not ')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'(n\'t|n\')', ' not ')
# change all happy emojis to happy e.g :) :') ;) :D ;D :'D xD (: (':')):3 c: C: c; C; c': C':
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r"\:\)|\:\'\)|\;\)|\:D|xD|\:3|\(\:|\('\:|\;D|\:\'D|c\:|C\:|c\;|C\;|c\'\:|C\'\:", ' happy ')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r"\:\)|\:\'\)|\;\)|\:D|xD|\:3|\(\:|\('\:|\;D|\:\'D|c\:|C\:|c\;|C\;|c\'\:|C\'\:", ' happy ')
# change all sad emojis to sad e.g D; D: ): ); :'( D': Dx :( :'(' :c :C ;c ;C:'c:'C
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r"D\;|D\:|\)\:|\)\;|\:\'\(|D\'\:|Dx|\:\(|\:c|\:C|\;c|\;C|\:\'c|\:\'C", ' sad ')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r"D\;|D\:|\)\:|\)\;|\:\'\(|D\'\:|Dx|\:\(|\:c|\:C|\;c|\;C|\:\'c|\:\'C", ' sad ')
# remove twitter handles
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\@[\S]*', '')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\@[\S]*', '')
# Remove numbers that are fully made of digits
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\b\d+\b','')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\b\d+\b','')
# Remove words with only 1 character. 
train_data['tweet_text'] = train_data['tweet_text'].str.replace(r'\b\w\b','')
dev_data['tweet_text'] = dev_data['tweet_text'].str.replace(r'\b\w\b','')
train_data.head()

Unnamed: 0,tweet_id,sentiment,tweet_text
0,335104872099066692,positive,Felt privileged to play Foo Fighters songs on ...
1,796528524030124618,positive,""" Pakistan may be an Islamic country, but der ..."
2,760964834217238632,positive,Happy Birthday to the coolest golfer in Bali! ...
3,147713180324524046,negative,TMILLS is going to Tucson! But the 29th and i...
4,732302280474120023,negative,Hmmmmm where are the #BlackLivesMatter when ma...


In [227]:
# TFIDF Vectorizer
stopset = set(stopwords.words('english')) # remove stopwords
stopset.remove('not') # we keep not in our data to maintain negation
stopset.add('wo') # to account for won't -> wo not after regex
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=stopset)

In [228]:
stopset

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 're',
 's'

In [229]:
# sentiment becomes dependent variable
train_y = np.array(train_data['sentiment'])
dev_y = np.array(dev_data['sentiment'])

train_y

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'neutral'], dtype=object)

In [230]:
train_X = vectorizer.fit_transform(train_data.tweet_text)
dev_X = vectorizer.transform(dev_data.tweet_text)

In [231]:
print(train_y.shape)
print(train_X.shape)
print(dev_y.shape)
print(dev_X.shape)

(45101,)
(45101, 42076)
(2000,)
(2000, 42076)


In [232]:
# train MaxEnt classifier (Logisitic regression)
clf = linear_model.LogisticRegression(multi_class = 'ovr')
clf.fit(train_X, train_y)

# test model accuracy
pred_y_mxe = clf.predict(dev_X)
acc_score_mxe = accuracy_score(dev_y, pred_y_mxe)
conf_mat_mxe = confusion_matrix(dev_y, pred_y_mxe, labels = ["positive", "neutral", "negative"])

print(acc_score_mxe)
print(conf_mat_mxe)



0.655
[[439 252  12]
 [152 714  53]
 [ 36 185 157]]


In [233]:
# train naive bayes classifier
clf2 = naive_bayes.MultinomialNB()
clf2.fit(train_X, train_y)

# test model accuracy
pred_y_nb = clf2.predict(dev_X)
acc_score_nb = accuracy_score(dev_y, pred_y_nb)
conf_mat_nb = confusion_matrix(dev_y, pred_y_nb, labels = ["positive", "neutral", "negative"])

print(acc_score_nb)
print(conf_mat_nb)

0.5995
[[395 307   1]
 [146 763  10]
 [ 40 297  41]]


In [234]:
# train svm
#clf3 = svm.SVC(gamma='auto')
clf3 = linear_model.SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)
clf3.fit(train_X, train_y)

# test model accuracy
pred_y_svm = clf3.predict(dev_X)
acc_score_svm = accuracy_score(dev_y, pred_y_svm)
conf_mat_svm = confusion_matrix(dev_y, pred_y_svm, labels = ["positive", "neutral", "negative"])

print(acc_score_svm)
print(conf_mat_svm)

0.5875
[[359 342   2]
 [119 794   6]
 [ 44 312  22]]


LSTM classifier with GLOVE

In [235]:
# sentiment becomes dependent variable
#train_y = np.array(train_data['sentiment'])
#dev_y = np.array(dev_data['sentiment'])
#label_encoder = LabelEncoder()
#labels_train_y = label_encoder.fit_transform(train_data.sentiment)
#labels_dev_y = label_encoder.fit_transform(dev_data.sentiment)
# Labels integer-encoded into: positive = 2, neutral = 1, negative = 0
#one_hot = OneHotEncoder(sparse=False)
#train_y = one_hot.fit_transform(labels_train_y.reshape(len(labels_train_y), 1))
#dev_y = one_hot.fit_transform(labels_dev_y.reshape(len(labels_dev_y), 1))

# One Hot Encoded: Positive = [0,0,1], Neutral = [0,1,0], Negative = [1,0,0]
lb = LabelBinarizer(sparse_output=False)
train_y = lb.fit_transform(train_data.sentiment)
dev_y = lb.fit_transform(dev_data.sentiment)
#train_y = lb_train_y.reshape(len(lb_train_y), 1)
#dev_y = lb_dev_y.reshape(len(lb_dev_y), 1)
train_y

array([[0, 0, 1],
       [0, 0, 1],
       [0, 0, 1],
       ...,
       [0, 0, 1],
       [0, 0, 1],
       [0, 1, 0]])

In [236]:
#label = argmax(dev_y, axis = 1)
#multilabel_confusion_matrix(label, label)
#train_y.argmax(axis=1)[:15]

In [237]:
# load the pre-trained word-embedding vectors 
embeddings_dict = {}
with open("data/glove.6B.100d.txt", 'r') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        embeddings_dict[word] = vector

In [238]:
embeddings_dict['were']

array([ 0.26874 ,  0.17994 , -0.29083 , -0.72304 , -0.05883 ,  0.37211 ,
        0.39979 ,  0.47827 , -0.41014 , -0.089043,  0.68457 ,  0.29088 ,
        0.9661  ,  0.43289 ,  0.44254 , -1.1529  ,  0.15147 , -0.02307 ,
       -1.2467  , -0.037292,  0.94212 ,  0.37771 ,  1.2369  ,  0.12327 ,
       -0.33831 , -0.98651 ,  0.44322 ,  0.083459, -0.11953 , -0.057447,
        0.6761  , -0.59646 , -0.3251  ,  0.53957 ,  0.66822 ,  0.082015,
        0.42181 ,  0.62666 ,  0.038678,  0.089652, -0.53395 , -0.40426 ,
       -0.060807,  0.14335 ,  0.53841 , -0.12983 ,  0.43699 , -0.077531,
        0.20441 , -0.9894  , -0.080389, -0.13893 ,  0.046432,  1.6775  ,
       -0.34565 , -1.7503  , -0.25442 , -0.28207 ,  1.2024  ,  1.0927  ,
       -0.55076 ,  1.3852  , -0.74759 ,  0.96273 ,  0.69044 , -0.41462 ,
        0.55676 ,  0.39588 ,  0.053647, -0.35503 , -0.3909  , -0.48323 ,
       -0.048448, -0.37728 , -0.51204 ,  0.50097 ,  0.16188 ,  0.91052 ,
       -1.6308  , -0.31484 ,  0.51824 , -0.078027, 

In [239]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('data/glove.6B.100d.txt')):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(train_data['tweet_text'])
word_index = token.word_index

# find maximum length of each document
train_lens = [len(x.split()) for x in train_data['tweet_text']]
print("Max word count in training data: ", max(train_lens))
dev_lens = [len(x.split()) for x in dev_data['tweet_text']]
print("Max word count in development data: ", max(dev_lens))

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_data.tweet_text), maxlen=33) #avg word count of a tweet is 55 words
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(dev_data.tweet_text), maxlen=33)

# create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

Max word count in training data:  33
Max word count in development data:  30


In [240]:
#index = [i for i in range(len(lengths)) if lengths[i] > 32]
#print(train_data['tweet_text'][index])

embeddings_dict is the same as embeddings_index

In [241]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=1)
        
    dev_y2 = dev_y.argmax(axis=1)
        
    print(confusion_matrix(dev_y2, predictions, labels = [2, 1, 0]))
    
    return accuracy_score(predictions, dev_y2)

In [242]:
def create_rnn_lstm():
    # Add an Input Layer
    input_layer = keras.layers.Input((33, ))

    # Add the word embedding Layer
    embedding_layer = keras.layers.Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = keras.layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = keras.layers.LSTM(100)(embedding_layer)

    # Add the output Layers
    output_layer1 = keras.layers.Dense(33, activation="relu")(lstm_layer)
    output_layer1 = keras.layers.Dropout(0.25)(output_layer1)
    output_layer2 = keras.layers.Dense(3, activation="softmax")(output_layer1)

    # Compile the model
    model = keras.models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=keras.optimizers.RMSprop(), 
                  loss='categorical_crossentropy', 
                  metrics=['accuracy','categorical_accuracy'])
    
    return model

classifier = create_rnn_lstm()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
print("RNN-LSTM, Word Embeddings",  accuracy)

[[310 381  12]
 [104 782  33]
 [ 18 261  99]]
RNN-LSTM, Word Embeddings 0.5955
