In [2]:
import csv
import re
import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression


Using TensorFlow backend.


In [5]:
def preprocess_tweets(tweets):
    """Lowercase tweet texts and anonymise user mentions.

    Parameters
    ----------
    tweets : iterable of indexable pairs
        Each item holds the tweet text at index 0 and its label at index 1.

    Returns
    -------
    list of [str, label]
        A new list of two-element lists; the input is not modified. Every
        user mention in the lowercased text is replaced by ``@someuser``.
    """
    # A mention is an '@' that does NOT directly follow a letter, digit,
    # '-', '_' or '.' (so e-mail addresses such as me@example.com are left
    # alone), followed by one or more handle characters. Handles may be a
    # single character long and may start with a digit (e.g. @50cent).
    mention = re.compile(r'(?<![A-Za-z0-9\-_.])@[A-Za-z0-9_]+')

    return [[mention.sub('@someuser', tweet[0].lower()), tweet[1]]
            for tweet in tweets]


## Possible further preprocessing

- Several emoticons are replaced by the tokens <smile>, <sadface>, <lolface> or <neutralface>. (There are some libraries for this)
- Elongated words are shortened, e.g. “sooooo” is replaced by “soo”
- Remove the hash in front of the hashtags and convert them to normal words
- Better splitting of the tweets when calculating document vectors (currently they are only split on whitespace)

In [5]:
def create_datasets(filename):
    """Read a tab-separated tweet file and return preprocessed samples.

    Every row must have exactly four columns. The second column is used as
    the tweet text and the part of the fourth column before the first ':'
    is used as the label.

    Parameters
    ----------
    filename : str
        Path of the tab-separated file to read.

    Returns
    -------
    list
        The result of ``preprocess_tweets`` applied to the
        ``[text, label]`` pairs read from the file.

    Raises
    ------
    AssertionError
        If a row does not have exactly four columns.
    """
    tweets = []

    # newline='' is what the csv module asks for when it is handed a file
    # object; the explicit encoding keeps decoding independent of the
    # platform's locale default.
    with open(filename, newline='', encoding='utf-8') as csvfile:
        # NOTE(review): no row is skipped, so if the file starts with a
        # header line it is ingested as a sample — confirm against the data.
        # NOTE(review): the default csv dialect treats '"' as a quote
        # character; if the file is raw (unquoted) TSV, consider
        # quoting=csv.QUOTE_NONE — confirm against the data.
        reader = csv.reader(csvfile, delimiter='\t')
        for row in reader:
            assert len(row) == 4, (
                "%s, line %d: expected 4 tab-separated columns, got %d"
                % (filename, reader.line_num, len(row)))
            tweets.append([row[1], row[3].split(':')[0]])

    return preprocess_tweets(tweets)


In [6]:
# Load the training and development splits through the same pipeline.
train_set, dev_set = [
    create_datasets(path)
    for path in ('2018-Valence-oc-En-train.txt', '2018-Valence-oc-En-dev.txt')
]


In [7]:
def calculate_document_vector(document, word_vectors):
    """Average the word vectors of all known words in a document.

    The document is split on whitespace; tokens that are not contained in
    ``word_vectors`` are ignored. The dimensionality of the result is taken
    from the looked-up vectors, so embeddings of any size are supported.

    Parameters
    ----------
    document : str
        The text to embed.
    word_vectors : mapping-like
        Must support ``word in word_vectors`` and ``word_vectors[word]``,
        the latter returning a 1-D numeric vector.

    Returns
    -------
    numpy.ndarray or None
        The float64 mean vector, or ``None`` when the document contains no
        known word.
    """
    known_vectors = [word_vectors[word]
                     for word in document.split()
                     if word in word_vectors]

    if not known_vectors:
        return None

    # Stack to shape (num_known_words, dim) and average over the words;
    # float64 matches the precision of a float64 accumulator.
    return np.asarray(known_vectors, dtype=np.float64).mean(axis=0)


In [8]:
def extract_features(data_set, word_vectors):
    """Turn ``[text, label]`` samples into a feature matrix and label list.

    Samples for which no document vector can be computed (the helper
    returns ``None``) are dropped, together with their label.

    Parameters
    ----------
    data_set : sequence
        Items holding the text at index 0 and the label at index 1.
    word_vectors : mapping-like
        Passed through to ``calculate_document_vector``.

    Returns
    -------
    (numpy.ndarray, list)
        The stacked document vectors and the labels of the kept samples,
        in input order.
    """
    kept_vectors, kept_labels = [], []

    for sample in data_set:
        vector = calculate_document_vector(sample[0], word_vectors)
        if vector is None:
            # No usable word in this sample: skip it entirely.
            continue
        kept_vectors.append(vector)
        kept_labels.append(sample[1])

    return np.asarray(kept_vectors), kept_labels


In [9]:
def train_model(train_set, test_set, word_vectors_path="glove.50d"):
    """Train a multinomial logistic regression on averaged word vectors.

    Word vectors are loaded from ``word_vectors_path`` in word2vec text
    format, both data sets are converted to document vectors, the model is
    fitted on the training features, and the accuracy on both sets is
    printed.

    Parameters
    ----------
    train_set : sequence
        ``[text, label]`` samples used for fitting.
    test_set : sequence
        ``[text, label]`` samples used for evaluation only.
    word_vectors_path : str, optional
        File with the word vectors in word2vec text format. Defaults to
        ``"glove.50d"``.

    Returns
    -------
    sklearn.linear_model.LogisticRegression
        The fitted model.
    """
    word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
        word_vectors_path, binary=False)

    train_X, train_Y = extract_features(train_set, word_vectors)
    test_X, test_Y = extract_features(test_set, word_vectors)

    # Guard against NaNs sneaking into the features before fitting.
    assert not (np.any(np.isnan(train_X)))
    assert not (np.any(np.isnan(test_X)))

    print("Start training")

    model = LogisticRegression(solver='lbfgs', multi_class='multinomial', verbose=10)

    model.fit(train_X, train_Y)

    print("Start prediction")

    # Accuracy = fraction of samples whose predicted label equals the true one.
    train_predict = model.predict(train_X)
    train_acc = np.mean(train_predict == train_Y)
    print("Train set accuracy", train_acc)

    test_predict = model.predict(test_X)
    test_acc = np.mean(test_predict == test_Y)
    print("Test set accuracy", test_acc)

    return model


In [11]:
# Fit on the training split and evaluate on the development split.
model = train_model(train_set=train_set, test_set=dev_set)

Start training
Start prediction
Train set accuracy 0.398116438356
Test set accuracy 0.266331658291


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
