In [19]:
import pickle
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from nltk.data import load


In [2]:
train = np.asarray(pickle.load(open("train_set", "rb")))
test = np.asarray(pickle.load(open("test_set", "rb")))

print(train.shape, test.shape)

(1181,) (449,)


In [3]:
# Extracting valence from the dataset

y_train = np.asarray([int(tweet.valence) for tweet in train])
y_test = np.asarray([int(tweet.valence) for tweet in test])


## Feature Generation


In [4]:
def get_num_of_hashtags(data_set):
    # This oddly shape array is needed, because the features need to be in a column-like shape
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = np.asarray(len(tweet.hashtags))

    return X


In [5]:
def get_num_of_mult_punctuation(data_set):
    # multiple punctuation means sth. like ../!!!
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = len(re.findall(r"[?!]{2,}", tweet.raw))
        
    return X


In [6]:
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this vector 
    # represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=lambda doc: doc, lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
   
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())



In [7]:
def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    # This method either extracts word (method="word)" or char ngrams (method="char").
    # Again, they need to be transformed into one hot encoded vectors
    vectorizer = CountVectorizer(ngram_range=(lower, upper), tokenizer=TweetTokenizer().tokenize, 
                                 analyzer=method)
    
    raw_train = [tweet.raw for tweet in train]
    raw_test = [tweet.raw for tweet in test]
    
    train_vector = vectorizer.fit_transform(raw_train)
    test_vector = vectorizer.transform(raw_test)

    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())


In [8]:
def extract_features(train, test):
    word_ngram_upper_bound = 4
    char_ngram_upper_bound = 5
    char_ngram_lower_bound = 3

    pos_vec_train, pos_vec_test = get_num_pos_tags(train, test)
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)

    # Put all the features together. Each feature is represented as a column in the resulting array
    train_features = np.concatenate([get_num_of_hashtags(train),
                                     get_num_of_mult_punctuation(train),
                                     pos_vec_train,
                                     train_word_vectors,
                                     train_char_vectors], axis=1)

    test_features = np.concatenate([get_num_of_hashtags(test),
                                    get_num_of_mult_punctuation(test),
                                    test_word_vectors,
                                    test_char_vectors,
                                    pos_vec_test], axis=1)

    return train_features, test_features


In [9]:
X_train, X_test = extract_features(train, test)


## Baseline

In [10]:
def transform_to_pos_neg(np_array):
    np_array[np_array < 0] = -1
    np_array[np_array > 0] = 1
    
    return np_array


In [11]:
from collections import Counter

def evaluate_model(model, X, y):
    # To make sure that the original y is not modified when applying the transform_to... function
    y_copy = np.copy(y)  
    prediction = model.predict(X)
    
    print("Real values:", Counter(y))
    print("Predicted values:", Counter(prediction), "\n")

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)

    # Accuracy if predicted value is +-1
    acc_interval = acc + metrics.accuracy_score(y_copy + 1, prediction) \
                   + metrics.accuracy_score(y_copy - 1, prediction)
    print("Accuracy for small interval:", acc_interval)

    # checks if at least the predicted direction is the right one
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y_copy), transform_to_pos_neg(prediction))
    print("Accuracy for right direction:", acc_dir, "\n")
    
    print("Classwise evaluation:")
    print(classification_report(y, prediction))


In [12]:
def fit_and_eval_svm(X_train, y_train, X_test, y_test):
    lin_svm = svm.LinearSVC()
    lin_svm.fit(X_train, y_train)
    evaluate_model(lin_svm, X_test, y_test)

In [20]:
fit_and_eval_svm(X_train, y_train, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 238, 3: 92, 0: 90, 1: 16, -3: 7, -1: 3, 2: 3}) 

Accuracy: 0.224944320713
Accuracy for small interval: 0.445434298441
Accuracy for right direction: 0.412026726058 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.07      0.53      0.13        34
          0       0.28      0.24      0.26       105
          1       0.12      0.22      0.15        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.09      0.12      0.09       449



  'precision', 'predicted', average, warn_for)


## Non Linear SVM (comparison)

In [14]:
non_lin_svm = SVC()
non_lin_svm.fit(X_train, y_train)
evaluate_model(non_lin_svm, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 449}) 

Accuracy: 0.233853006682
Accuracy for small interval: 0.438752783964
Accuracy for right direction: 0.233853006682 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.23      1.00      0.38       105
          1       0.00      0.00      0.00        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.05      0.23      0.09       449



  'precision', 'predicted', average, warn_for)


## Naive Bayes (comparison)

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
evaluate_model(gnb, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 308, -2: 96, 3: 45}) 

Accuracy: 0.216035634744
Accuracy for small interval: 0.412026726058
Accuracy for right direction: 0.293986636971 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.06      0.18      0.09        34
          0       0.23      0.68      0.34       105
          1       0.18      0.14      0.16        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.08      0.19      0.11       449



  'precision', 'predicted', average, warn_for)


In [16]:
from sklearn.naive_bayes import MultinomialNB 

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
evaluate_model(mnb, X_test, y_test)


Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-1: 158, 2: 147, 3: 64, -3: 42, -2: 21, 1: 17}) 

Accuracy: 0.102449888641
Accuracy for small interval: 0.407572383073
Accuracy for right direction: 0.43429844098 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.08      0.50      0.13        34
          0       0.00      0.00      0.00       105
          1       0.14      0.55      0.22        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.02      0.11      0.04       449



  'precision', 'predicted', average, warn_for)


## Training with Grid Search

In [21]:
# Determining the hyperparameter C through grid search with cross validation
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
evaluate_model(grid_search, X_test, y_test)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.001 .........................................................
[CV] .......................... C=0.001, score=0.347280, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .......................... C=0.001, score=0.348739, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.3s remaining:    0.0s


[CV] .......................... C=0.001, score=0.338983, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.5s remaining:    0.0s


[CV] .......................... C=0.001, score=0.348936, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.6s remaining:    0.0s


[CV] .......................... C=0.001, score=0.334764, total=   0.9s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.7s remaining:    0.0s


[CV] ........................... C=0.01, score=0.380753, total=   1.2s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.2s remaining:    0.0s


[CV] ........................... C=0.01, score=0.315126, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.6s remaining:    0.0s


[CV] ........................... C=0.01, score=0.292373, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   10.0s remaining:    0.0s


[CV] ........................... C=0.01, score=0.327660, total=   1.3s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   11.6s remaining:    0.0s


[CV] ........................... C=0.01, score=0.334764, total=   1.1s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.364017, total=   2.2s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.281513, total=   1.9s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.275424, total=   2.4s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.319149, total=   2.2s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.300429, total=   1.8s
[CV] C=1 .............................................................
[CV] .............................. C=1, score=0.351464, total=   3.4s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:  1.0min finished


{'C': 0.001}
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 176, 3: 167, 0: 83, 1: 13, -3: 7, 2: 2, -1: 1}) 

Accuracy: 0.211581291759
Accuracy for small interval: 0.380846325167
Accuracy for right direction: 0.403118040089 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.09      0.47      0.15        34
          0       0.29      0.23      0.26       105
          1       0.16      0.50      0.24        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.09      0.15      0.10       449



  'precision', 'predicted', average, warn_for)


## Feature Selection


### Variance Threshold

In [22]:
# Variance threshold removes all the features which variance is smaller than a certain percentage
thresholds = [0, 0.01, 0.05, 0.1, 0.5]

for threshold in thresholds:
    variance_selector = VarianceThreshold(threshold)
    X_train_selected = variance_selector.fit_transform(X_train)
    X_test_selected = variance_selector.transform(X_test)

    print("Threshhold:", threshold)
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


Threshhold: 0
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 238, 3: 92, 0: 90, 1: 16, -3: 7, -1: 3, 2: 3}) 

Accuracy: 0.224944320713
Accuracy for small interval: 0.445434298441
Accuracy for right direction: 0.412026726058 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.07      0.53      0.13        34
          0       0.28      0.24      0.26       105
          1       0.12      0.22      0.15        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.09      0.12      0.09       449



  'precision', 'predicted', average, warn_for)


Threshhold: 0.01
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 248, 0: 108, 3: 53, -1: 23, 1: 9, -3: 5, 2: 3}) 

Accuracy: 0.207126948775
Accuracy for small interval: 0.423162583519
Accuracy for right direction: 0.369710467706 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.07      0.53      0.12        34
          0       0.23      0.24      0.23       105
          1       0.08      0.09      0.08        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.07      0.11      0.07       449

Threshhold: 0.05
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 199, 0: 134, -1: 66, 3: 17, 1: 13, 2: 10, -3: 10}) 

Accuracy: 0.198218262806
Accura

### Selection of kBest

In [23]:
# Uses chi square to find significance of features and only selects the top k features
ks = [10, 50, 100, 200, 500, 1000, 5000]

for k in ks:
    kbest_selector = SelectKBest(chi2, k=k)
    X_train_selected = kbest_selector.fit_transform(X_train, y_train)
    X_test_selected = kbest_selector.transform(X_test)
    
    print("k:", k)
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


k: 10
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 411, 3: 34, -2: 4}) 

Accuracy: 0.213808463252
Accuracy for small interval: 0.41425389755
Accuracy for right direction: 0.227171492205 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.22      0.85      0.34       105
          1       0.06      0.03      0.04        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.06      0.20      0.09       449



  'precision', 'predicted', average, warn_for)


k: 50
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 400, -2: 36, 1: 7, 3: 3, 2: 2, -1: 1}) 

Accuracy: 0.202672605791
Accuracy for small interval: 0.405345211581
Accuracy for right direction: 0.213808463252 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.08      0.09      0.08        34
          0       0.21      0.81      0.34       105
          1       0.08      0.02      0.03        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.07      0.20      0.09       449

k: 100
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 402, -2: 21, -1: 7, 1: 7, 3: 6, -3: 4, 2: 2}) 

Accuracy: 0.224944320713
Accuracy for small interval: 0.438752783964

### Recursive Feature Elimination with Cross Validation


In [None]:
# Another form of feature selection

selector = RFECV(svm.LinearSVC(), step=500, cv=5, verbose=10)
selector.fit(X_train, y_train)
evaluate_model(selector, X_test, y_test)
