In [1]:
import pickle
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score


from nltk.data import load
from collections import Counter


In [2]:
train = np.asarray(pickle.load(open("train_set", "rb")))
test = np.asarray(pickle.load(open("test_set", "rb")))

print(train.shape, test.shape)

(1181,) (449,)


In [3]:
# Extracting valence from the dataset

y_train = np.asarray([int(tweet.valence) for tweet in train])
y_test = np.asarray([int(tweet.valence) for tweet in test])


## Feature Generation


In [4]:
def get_num_of_hashtags(data_set):
    # This oddly shape array is needed, because the features need to be in a column-like shape
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = np.asarray(len(tweet.hashtags))

    return X


In [5]:
def get_num_of_mult_punctuation(data_set):
    # multiple punctuation means sth. like ../!!!
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = len(re.findall(r"[?!]{2,}", tweet.raw))
        
    return X


In [6]:
def get_valences(data_set):
    X = np.zeros((len(data_set), 9))
    
    for i, tweet in enumerate(data_set):
        valences = [[float(valence) for valence in tweet.emoticons_valence],
                    [float(valence) for valence in tweet.hashtags_valence],
                    [float(valence) for valence in tweet.tokens_valence]]

        for j, valence_list in enumerate(valences):
            if len(valence_list) is not 0:
                X[i, 3 * j] = sum(valence_list)
                X[i, 3 * j + 1] = max(valence_list, key=abs)
                X[i, 3 * j + 2] = valence_list[-1]

    return X


In [7]:
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this vector 
    # represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=lambda doc: doc, lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
   
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())



In [8]:
def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    # This method either extracts word (method="word)" or char ngrams (method="char").
    # Again, they need to be transformed into one hot encoded vectors
    vectorizer = CountVectorizer(ngram_range=(lower, upper), tokenizer=TweetTokenizer().tokenize, 
                                 analyzer=method)
    
    raw_train = [tweet.raw for tweet in train]
    raw_test = [tweet.raw for tweet in test]
    
    train_vector = vectorizer.fit_transform(raw_train)
    test_vector = vectorizer.transform(raw_test)

    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())


In [9]:
from sklearn.preprocessing import normalize

def extract_features(train, test):
    word_ngram_upper_bound = 4
    char_ngram_upper_bound = 5
    char_ngram_lower_bound = 3

    pos_vec_train, pos_vec_test = get_num_pos_tags(train, test)
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)
    
    # Put all the features together. Each feature is represented as a column in the resulting array
    train_features = np.concatenate([get_num_of_hashtags(train),
                                     get_num_of_mult_punctuation(train),
                                     pos_vec_train,
                                     train_word_vectors,
                                     train_char_vectors,
                                     get_valences(train)], axis=1)

    test_features = np.concatenate([get_num_of_hashtags(test),
                                    get_num_of_mult_punctuation(test),
                                    test_word_vectors,
                                    test_char_vectors,
                                    pos_vec_test,
                                    get_valences(test)], axis=1)

    return train_features, test_features


In [10]:
def normalise_features(train, test):
    train = normalize(train)
    test = normalize(test)
    
    return train, test
    

In [11]:
# Extract non normalized features
X_train_nn, X_test_nn = extract_features(train, test)
# remove all constant features and normalise
X_train, X_test = normalise_features(X_train_nn, X_test_nn)


## Baseline

In [12]:
def transform_to_pos_neg(np_array):
    np_array[np_array < 0] = -1
    np_array[np_array > 0] = 1
    
    return np_array


In [13]:
results = []

def evaluate_model(model, X, y):
    # To make sure that the original y is not modified when applying the transform_to... function
    y_copy = np.copy(y)  
    prediction = model.predict(X)
    
    print("Real values:", Counter(y))
    print("Predicted values:", Counter(prediction), "\n")

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)
    
    precision = precision_score(y, prediction, average="weighted")
    recall = recall_score(y, prediction, average="weighted")
    f1 = f1_score(y, prediction, average="weighted")

    # Accuracy if predicted value is +-1
    acc_interval = acc + metrics.accuracy_score(y_copy + 1, prediction) \
                   + metrics.accuracy_score(y_copy - 1, prediction)
    print("Accuracy for small interval:", acc_interval)

    # checks if at least the predicted direction is the right one
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y_copy), transform_to_pos_neg(prediction))
    print("Accuracy for right direction:", acc_dir, "\n")
    
    print("Classwise evaluation:")
    print(classification_report(y, prediction))
    
    class_scores = np.asarray(precision_recall_fscore_support(y, prediction))
    
    results.append([acc, acc_interval, acc_dir, precision, recall, f1])


In [14]:
def fit_and_eval_svm(X_train, y_train, X_test, y_test):
    lin_svm = svm.LinearSVC()
    lin_svm.fit(X_train, y_train)
    evaluate_model(lin_svm, X_test, y_test)

In [15]:
fit_and_eval_svm(X_train, y_train, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 247, 0: 106, 3: 43, 1: 40, -3: 11, 2: 2}) 

Accuracy: 0.3429844098
Accuracy for small interval: 0.650334075724
Accuracy for right direction: 0.610244988864 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.09      0.71      0.16        34
          0       0.33      0.33      0.33       105
          1       0.21      0.31      0.25        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.11      0.17      0.12       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Radial Linear SVM (comparison)

In [16]:
non_lin_svm = SVC()
non_lin_svm.fit(X_train, y_train)
evaluate_model(non_lin_svm, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 449}) 

Accuracy: 0.233853006682
Accuracy for small interval: 0.438752783964
Accuracy for right direction: 0.233853006682 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.23      1.00      0.38       105
          1       0.00      0.00      0.00        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.05      0.23      0.09       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Naive Bayes (comparison)

In [17]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_nn, y_train)
evaluate_model(gnb, X_test_nn, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 308, -2: 96, 3: 45}) 

Accuracy: 0.216035634744
Accuracy for small interval: 0.412026726058
Accuracy for right direction: 0.293986636971 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.06      0.18      0.09        34
          0       0.23      0.68      0.34       105
          1       0.18      0.14      0.16        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.08      0.19      0.11       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [18]:
# Not suitable anymore because of nature of the data
# from sklearn.naive_bayes import MultinomialNB 

# mnb = MultinomialNB()
# mnb.fit(X_train, y_train)
# evaluate_model(mnb, X_test, y_test)


## Training with Grid Search

In [19]:
# Determining the hyperparameter C through grid search with cross validation
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
evaluate_model(grid_search, X_test, y_test)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.001 .........................................................
[CV] .......................... C=0.001, score=0.288703, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s


[CV] .......................... C=0.001, score=0.285714, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.1s remaining:    0.0s


[CV] .......................... C=0.001, score=0.288136, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.2s remaining:    0.0s


[CV] .......................... C=0.001, score=0.289362, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.3s remaining:    0.0s


[CV] .......................... C=0.001, score=0.291845, total=   0.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.3s remaining:    0.0s


[CV] ........................... C=0.01, score=0.330544, total=   0.7s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    6.4s remaining:    0.0s


[CV] ........................... C=0.01, score=0.327731, total=   0.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    7.4s remaining:    0.0s


[CV] ........................... C=0.01, score=0.309322, total=   0.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    8.5s remaining:    0.0s


[CV] ........................... C=0.01, score=0.297872, total=   0.7s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    9.5s remaining:    0.0s


[CV] ........................... C=0.01, score=0.326180, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.414226, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.378151, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.377119, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.370213, total=   0.8s
[CV] C=0.1 ...........................................................
[CV] ............................ C=0.1, score=0.386266, total=   0.8s
[CV] C=1 .............................................................
[CV] .............................. C=1, score=0.397490, total=   0.9s
[CV] C=1 .............................................................
[CV] .

[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   33.2s finished


{'C': 0.1}
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 214, 0: 173, 3: 43, 1: 17, -3: 2}) 

Accuracy: 0.305122494432
Accuracy for small interval: 0.641425389755
Accuracy for right direction: 0.550111358575 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.10      0.65      0.18        34
          0       0.28      0.47      0.35       105
          1       0.13      0.14      0.14        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.09      0.18      0.11       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Feature Selection


### Variance Threshold

In [20]:
# Variance threshold removes all the features which variance is smaller than a certain percentage
thresholds = [0, 0.01, 0.05, 0.1, 0.5]

for threshold in thresholds:
    print("Threshhold:", threshold)
    
    variance_selector = VarianceThreshold(threshold)
    X_train_selected = variance_selector.fit_transform(X_train_nn)
    X_test_selected = variance_selector.transform(X_test_nn)
    
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


Threshhold: 0
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 164, 3: 106, -3: 62, 0: 55, 1: 45, 2: 14, -1: 3}) 

Accuracy: 0.354120267261
Accuracy for small interval: 0.608017817372
Accuracy for right direction: 0.648106904232 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.10      0.68      0.17        34
          0       0.36      0.19      0.25       105
          1       0.25      0.71      0.37        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.12      0.19      0.12       449

Threshhold: 0.01


  'precision', 'predicted', average, warn_for)


Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 194, 0: 90, 2: 55, 3: 53, -3: 29, 1: 15, -1: 13}) 

Accuracy: 0.307349665924
Accuracy for small interval: 0.619153674833
Accuracy for right direction: 0.587973273942 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.10      0.68      0.17        34
          0       0.29      0.25      0.27       105
          1       0.18      0.38      0.24        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.10      0.16      0.11       449

Threshhold: 0.05
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 118, 0: 103, -3: 62, 1: 62, -1: 50, 3: 45, 2: 9}) 

Accuracy: 0.340757238307
Accuracy for small int

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### Selection of kBest

In [21]:
from sklearn.feature_selection import f_classif

# Uses chi square to find significance of features and only selects the top k features
ks = [10, 50, 100, 200, 500, 1000, 5000]

for k in ks:
    print("k:", k)
    
    kbest_selector = SelectKBest(f_classif, k=k)
    X_train_selected = kbest_selector.fit_transform(X_train, y_train)
    X_test_selected = kbest_selector.transform(X_test)
    
    support = kbest_selector.get_support()
    print("Num hashtags:", support[0])
    print("Num mult punct:", support[1])
    print("Pos tags", Counter(support[2:48]))
    print("Word vectors", Counter(support[48:41580]))
    print("Char vectors", Counter(support[41580:-9]))
    print("Emoticon sum:", support[-9])
    print("Emoticon max:", support[-8])
    print("Emoticon last:", support[-7])
    print("Hashtag sum:", support[-6])
    print("Hashtag max:", support[-5])
    print("Hashtag last:", support[-4])
    print("Token sum:", support[-3])
    print("Token max:", support[-2])
    print("Token last:", support[-1], "\n")
    
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


k: 10


  f = msb / msw


Num hashtags: False
Num mult punct: False
Pos tags Counter({False: 46})
Word vectors Counter({False: 41532})
Char vectors Counter({False: 79869, True: 4})
Emoticon sum: False
Emoticon max: False
Emoticon last: False
Hashtag sum: True
Hashtag max: True
Hashtag last: True
Token sum: True
Token max: True
Token last: True 

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 220, -2: 147, 3: 46, -3: 31, 1: 5}) 

Accuracy: 0.371937639198
Accuracy for small interval: 0.663697104677
Accuracy for right direction: 0.552338530067 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.09      0.47      0.15        34
          0       0.31      0.66      0.42       105
          1       0.16      0.14      0.15        58
          2       0.00      0.00      0.00        35
          3       0.00  

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Num hashtags: True
Num mult punct: False
Pos tags Counter({False: 46})
Word vectors Counter({False: 41532})
Char vectors Counter({False: 79830, True: 43})
Emoticon sum: False
Emoticon max: False
Emoticon last: False
Hashtag sum: True
Hashtag max: True
Hashtag last: True
Token sum: True
Token max: True
Token last: True 

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 241, -2: 155, 3: 30, -3: 21, 1: 2}) 

Accuracy: 0.325167037862
Accuracy for small interval: 0.619153674833
Accuracy for right direction: 0.487750556793 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.10      0.50      0.16        34
          0       0.27      0.62      0.38       105
          1       0.09      0.05      0.07        58
          2       0.00      0.00      0.00        35
          3       0.00  

In [22]:
print(results)

[[0.34298440979955458, 0.65033407572383073, 0.61024498886414258, 0.38275938536416942, 0.34298440979955458, 0.28619360819738771], [0.23385300668151449, 0.43875278396436523, 0.23385300668151449, 0.054687228733984454, 0.23385300668151449, 0.088644641521873707], [0.21603563474387527, 0.41202672605790647, 0.29398663697104677, 0.11372578794627793, 0.21603563474387527, 0.13916877236466987], [0.30512249443207129, 0.64142538975501107, 0.55011135857461024, 0.35111462470286053, 0.30512249443207129, 0.23143988901447315], [0.35412026726057905, 0.60801781737193772, 0.64810690423162587, 0.32488280883762405, 0.35412026726057905, 0.31876701171969679], [0.30734966592427615, 0.61915367483296213, 0.58797327394209353, 0.31003466552931841, 0.30734966592427615, 0.27763493352054686], [0.34075723830734966, 0.63919821826280621, 0.5835189309576837, 0.33643365767514954, 0.34075723830734966, 0.33481332125343549], [0.36525612472160357, 0.65924276169265039, 0.59242761692650336, 0.35576139133066503, 0.365256124721603

### Recursive Feature Elimination with Cross Validation


In [23]:
# Another form of feature selection

# selector = RFECV(svm.LinearSVC(), step=500, cv=5, verbose=10)
# selector.fit(X_train, y_train)
# evaluate_model(selector, X_test, y_test)
