In [1]:
import pickle
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from preprocessing.tweet import Tweet
from nltk.data import load


In [2]:
train = np.asarray(pickle.load(open("preprocessing/train_set", "rb")))
test = np.asarray(pickle.load(open("preprocessing/test_set", "rb")))

print(train.shape, test.shape)

(1174,) (200,)


In [3]:
# TODO: remove workaround

for i, tweet in enumerate(train):
    if ":" in tweet.valence:
        train[i].valence = int(str.split(tweet.valence, ":")[0])


In [4]:
# TODO: remove workaround

for i, tweet in enumerate(test):
    if ":" in tweet.valence:
        test[i].valence = int(str.split(tweet.valence, ":")[0])


In [5]:
y_train = np.asarray([int(tweet.valence) for tweet in train])
y_test = np.asarray([int(tweet.valence) for tweet in test])


## Feature Selection


In [6]:
def get_num_of_hashtags(data_set):
    X = np.zeros((len(data_set), 1))
    
    for i, tweet in enumerate(data_set):
        X[i, 0] = np.asarray(len(tweet.hashtags))
        
    return X


In [7]:
def get_num_of_mult_punctuation(data_set):
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = len(re.findall(r"[?!]{2,}", tweet.raw))
        
    return X


In [8]:
def get_num_pos_tags(train, test):
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=lambda doc: doc, lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
   
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())



In [9]:
def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    vectorizer = CountVectorizer(ngram_range=(lower, upper), tokenizer=TweetTokenizer().tokenize, 
                                 analyzer=method)
    
    raw_train = [tweet.raw for tweet in train]
    raw_test = [tweet.raw for tweet in test]
    
    train_vector = vectorizer.fit_transform(raw_train)
    test_vector = vectorizer.transform(raw_test)

    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())


In [10]:
def extract_features(train, test):
    word_ngram_upper_bound = 4
    char_ngram_upper_bound = 5
    char_ngram_lower_bound = 3

    pos_vec_train, pos_vec_test = get_num_pos_tags(train, test)
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)
    
    train_features = np.concatenate([get_num_of_hashtags(train),
                                     get_num_of_mult_punctuation(train),
                                     pos_vec_train,
                                     train_word_vectors,
                                     train_char_vectors], axis=1)

    test_features = np.concatenate([get_num_of_hashtags(test),
                                    get_num_of_mult_punctuation(test),
                                    test_word_vectors,
                                    test_char_vectors,
                                    pos_vec_test], axis=1)
    
    
    return train_features, test_features


In [11]:
X_train, X_test = extract_features(train, test)


## Baseline

In [12]:
def transform_to_pos_neg(np_array):
    np_array[np_array < 0] = -1
    np_array[np_array > 0] = 1
    
    return np_array


In [13]:
def evaluate_model(model, X, y):
    prediction = model.predict(X)

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)

    acc_interval = acc + metrics.accuracy_score(y + 1, prediction) \
                   + metrics.accuracy_score(y - 1, prediction)
    print("Accuracy for small interval:", acc_interval)  # Accuracy if predicted value is +-1

    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y), transform_to_pos_neg(prediction))
    print("Accuracy for right direction:", acc_dir)


In [17]:
# TODO: find out why first run yields much higher accuracy

lin_svm = svm.LinearSVC()
lin_svm.fit(X_train, y_train)
evaluate_model(lin_svm, X_test, y_test)

Accuracy: 0.07
Accuracy for small interval: 0.505
Accuracy for right direction: 0.41


## Training with Grid Search

In [18]:
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
evaluate_model(grid_search, X_test, y_test)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.001 .........................................................


[CV] .......................... C=0.001, score=0.338983, total=   1.0s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.4s remaining:    0.0s


[CV] .......................... C=0.001, score=0.372881, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.7s remaining:    0.0s


[CV] .......................... C=0.001, score=0.323404, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.8s remaining:    0.0s


[CV] .......................... C=0.001, score=0.350427, total=   0.7s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.8s remaining:    0.0s


[CV] .......................... C=0.001, score=0.313305, total=   0.7s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.8s remaining:    0.0s


[CV] ........................... C=0.01, score=0.313559, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.2s remaining:    0.0s


[CV] ........................... C=0.01, score=0.322034, total=   1.5s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    9.1s remaining:    0.0s


[CV] ........................... C=0.01, score=0.344681, total=   1.7s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.2s remaining:    0.0s


[CV] ........................... C=0.01, score=0.333333, total=   1.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   13.4s remaining:    0.0s


[CV] ........................... C=0.01, score=0.296137, total=   1.5s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.300847, total=   3.7s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.313559, total=   3.2s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.327660, total=   3.7s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.316239, total=   3.2s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.287554, total=   3.1s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.313559, total=   3.0s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.322034, total=   2.6s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.323404, total=   3.2s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.307692, total=   2.3s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.278970, total=   2.4s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.313559, total=   2.7s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.317797, total=   2.4s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.323404, total=   3.2s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.303419, total=   2.3s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.283262, total=   2.8s


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.1min finished


{'C': 0.001}
Accuracy: 0.05
Accuracy for small interval: 0.365
Accuracy for right direction: 0.415


## Feature Selection


### Variance Threshold

In [19]:
# TODO: use different values for threshold!

variance_selector = VarianceThreshold(0.05)
X_train_selected = variance_selector.fit_transform(X_train)
X_test_selected = variance_selector.transform(X_test)


In [20]:
lin_svm = svm.LinearSVC()
lin_svm.fit(X_train_selected, y_train)
evaluate_model(lin_svm, X_test_selected, y_test)


Accuracy: 0.105
Accuracy for small interval: 0.51
Accuracy for right direction: 0.34


### Selection of kBest

In [21]:
# TODO: also try with different values for k

kbest_selector = SelectKBest(chi2, k=100) #50 and 200 are worse
X_train_selected = kbest_selector.fit_transform(X_train, y_train)
X_test_selected = kbest_selector.transform(X_test)


In [22]:
lin_svm = svm.LinearSVC()
lin_svm.fit(X_train_selected, y_train)
evaluate_model(lin_svm, X_test_selected, y_test)


Accuracy: 0.215
Accuracy for small interval: 0.935
Accuracy for right direction: 0.235
