In [2]:
import pickle
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from nltk.data import load


In [5]:
train = np.asarray(pickle.load(open("train_set", "rb")))
test = np.asarray(pickle.load(open("test_set", "rb")))

print(train.shape, test.shape)

(1181,) (449,)


In [6]:
# Extracting valence from the dataset

y_train = np.asarray([int(tweet.valence) for tweet in train])
y_test = np.asarray([int(tweet.valence) for tweet in test])


## Feature Generation


In [7]:
def get_num_of_hashtags(data_set):
    # This oddly shape array is needed, because the features need to be in a column-like shape
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = np.asarray(len(tweet.hashtags))

    return X


In [8]:
def get_num_of_mult_punctuation(data_set):
    # multiple punctuation means sth. like ../!!!
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = len(re.findall(r"[?!]{2,}", tweet.raw))
        
    return X


In [9]:
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this vector 
    # represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=lambda doc: doc, lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
   
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())



In [10]:
def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    # This method either extracts word (method="word)" or char ngrams (method="char").
    # Again, they need to be transformed into one hot encoded vectors
    vectorizer = CountVectorizer(ngram_range=(lower, upper), tokenizer=TweetTokenizer().tokenize, 
                                 analyzer=method)
    
    raw_train = [tweet.raw for tweet in train]
    raw_test = [tweet.raw for tweet in test]
    
    train_vector = vectorizer.fit_transform(raw_train)
    test_vector = vectorizer.transform(raw_test)

    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())


In [11]:
def extract_features(train, test):
    word_ngram_upper_bound = 4
    char_ngram_upper_bound = 5
    char_ngram_lower_bound = 3

    pos_vec_train, pos_vec_test = get_num_pos_tags(train, test)
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)

    # Put all the features together. Each feature is represented as a column in the resulting array
    train_features = np.concatenate([get_num_of_hashtags(train),
                                     get_num_of_mult_punctuation(train),
                                     pos_vec_train,
                                     train_word_vectors,
                                     train_char_vectors], axis=1)

    test_features = np.concatenate([get_num_of_hashtags(test),
                                    get_num_of_mult_punctuation(test),
                                    test_word_vectors,
                                    test_char_vectors,
                                    pos_vec_test], axis=1)

    return train_features, test_features


In [12]:
X_train, X_test = extract_features(train, test)


## Baseline

In [13]:
def transform_to_pos_neg(np_array):
    np_array[np_array < 0] = -1
    np_array[np_array > 0] = 1
    
    return np_array


In [14]:
def evaluate_model(model, X, y):
    # To make sure that the original y is not modified when applying the transform_to... function
    y_copy = np.copy(y)  
    prediction = model.predict(X)

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)

    # Accuracy if predicted value is +-1
    acc_interval = acc + metrics.accuracy_score(y_copy + 1, prediction) \
                   + metrics.accuracy_score(y_copy - 1, prediction)
    print("Accuracy for small interval:", acc_interval)

    # checks if at least the predicted direction is the right one
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y_copy), transform_to_pos_neg(prediction))
    print("Accuracy for right direction:", acc_dir, "\n")


In [15]:
def fit_and_eval_svm(X_train, y_train, X_test, y_test):
    lin_svm = svm.LinearSVC()
    lin_svm.fit(X_train, y_train)
    evaluate_model(lin_svm, X_test, y_test)


In [16]:
fit_and_eval_svm(X_train, y_train, X_test, y_test)

Accuracy: 0.224944320713
Accuracy for small interval: 0.445434298441
Accuracy for right direction: 0.412026726058 



## Training with Grid Search

In [18]:
# Determining the hyperparameter C through grid search with cross validation
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
evaluate_model(grid_search, X_test, y_test)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.001 .........................................................


[CV] .......................... C=0.001, score=0.338983, total=   1.3s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV] .......................... C=0.001, score=0.372881, total=   1.3s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.4s remaining:    0.0s


[CV] .......................... C=0.001, score=0.323404, total=   1.1s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    4.9s remaining:    0.0s


[CV] .......................... C=0.001, score=0.350427, total=   1.1s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    6.3s remaining:    0.0s


[CV] .......................... C=0.001, score=0.313305, total=   0.8s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    7.4s remaining:    0.0s


[CV] ........................... C=0.01, score=0.313559, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    8.7s remaining:    0.0s


[CV] ........................... C=0.01, score=0.322034, total=   1.0s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   10.0s remaining:    0.0s


[CV] ........................... C=0.01, score=0.344681, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   11.4s remaining:    0.0s


[CV] ........................... C=0.01, score=0.333333, total=   1.1s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   12.8s remaining:    0.0s


[CV] ........................... C=0.01, score=0.296137, total=   1.1s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.300847, total=   2.1s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.313559, total=   1.8s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.327660, total=   3.1s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.316239, total=   2.1s
[CV] C=0.1 ...........................................................


[CV] ............................ C=0.1, score=0.287554, total=   1.9s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.313559, total=   2.8s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.322034, total=   2.5s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.323404, total=   3.1s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.307692, total=   2.5s
[CV] C=1 .............................................................


[CV] .............................. C=1, score=0.278970, total=   2.6s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.313559, total=   3.0s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.317797, total=   2.8s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.323404, total=   3.4s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.303419, total=   2.8s
[CV] C=10 ............................................................


[CV] ............................. C=10, score=0.283262, total=   2.9s


[Parallel(n_jobs=1)]: Done  25 out of  25 | elapsed:   58.2s finished


{'C': 0.001}
Accuracy: 0.185
Accuracy for small interval: 0.375
Accuracy for right direction: 0.415 



## Feature Selection


### Variance Threshold

In [19]:
# Variance threshold removes all the features which variance is smaller than a certain percentage
thresholds = [0, 0.01, 0.05, 0.1, 0.5]

for threshold in thresholds:
    variance_selector = VarianceThreshold(threshold)
    X_train_selected = variance_selector.fit_transform(X_train)
    X_test_selected = variance_selector.transform(X_test)

    print("Threshhold:", threshold)
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


Threshhold: 0


Accuracy: 0.225
Accuracy for small interval: 0.465
Accuracy for right direction: 0.41 



Threshhold: 0.01


Accuracy: 0.175
Accuracy for small interval: 0.41
Accuracy for right direction: 0.33 



Threshhold: 0.05


Accuracy: 0.135
Accuracy for small interval: 0.425
Accuracy for right direction: 0.335 



Threshhold: 0.1


Accuracy: 0.155
Accuracy for small interval: 0.36
Accuracy for right direction: 0.36 



Threshhold: 0.5


Accuracy: 0.195
Accuracy for small interval: 0.41
Accuracy for right direction: 0.235 



### Selection of kBest

In [20]:
# Uses chi square to find significance of features and only selects the top k features
ks = [10, 50, 100, 200, 500, 1000, 5000]

for k in ks:
    kbest_selector = SelectKBest(chi2, k=k)
    X_train_selected = kbest_selector.fit_transform(X_train, y_train)
    X_test_selected = kbest_selector.transform(X_test)
    
    print("k:", k)
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


k: 10


Accuracy: 0.215
Accuracy for small interval: 0.44
Accuracy for right direction: 0.22 



k: 50


Accuracy: 0.19
Accuracy for small interval: 0.405
Accuracy for right direction: 0.2 



k: 100


Accuracy: 0.21
Accuracy for small interval: 0.445
Accuracy for right direction: 0.235 



k: 200


Accuracy: 0.195
Accuracy for small interval: 0.44
Accuracy for right direction: 0.27 



k: 500


Accuracy: 0.195
Accuracy for small interval: 0.425
Accuracy for right direction: 0.28 



k: 1000


Accuracy: 0.22
Accuracy for small interval: 0.47
Accuracy for right direction: 0.32 



k: 5000


Accuracy: 0.22
Accuracy for small interval: 0.47
Accuracy for right direction: 0.345 



### Recursive Feature Elimination with Cross Validation


In [22]:
# Another form of feature selection

selector = RFECV(svm.LinearSVC(), step=500, cv=5, verbose=10)
selector.fit(X_train, y_train)
evaluate_model(selector, X_test, y_test)


Fitting estimator with 121131 features.


Fitting estimator with 120631 features.


Fitting estimator with 120131 features.


Fitting estimator with 119631 features.


Fitting estimator with 119131 features.


Fitting estimator with 118631 features.


Fitting estimator with 118131 features.


Fitting estimator with 117631 features.


Fitting estimator with 117131 features.


Fitting estimator with 116631 features.


Fitting estimator with 116131 features.


Fitting estimator with 115631 features.


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2881, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-22-1adb49fe5d6d>", line 4, in <module>
    selector.fit(X_train, y_train)
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 430, in fit
    for train, test in cv.split(X, y))


  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 430, in <genexpr>
    for train, test in cv.split(X, y))
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 30, in _rfe_single_fit
    X_train, y_train, lambda estimator, features:
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/feature_selection/rfe.py", line 169, in _fit
    estimator.fit(X[:, features], y)
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/svm/classes.py", line 207, in fit
    dtype=np.float64, order="C")
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py", line 521, in check_X_y
    ensure_min_features, warn_on_dtype, estimator)
  File "/home/hendaet/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py", line 382, in check_array
    array = np.array(array, dtype=dtype, order=order, copy=copy)
KeyboardInterrupt

During handling of the

KeyboardInterrupt: 

## Naive Bayes (comparison)

In [18]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
evaluate_model(gnb, X_test, y_test)

Accuracy: 0.216035634744
Accuracy for small interval: 0.412026726058
Accuracy for right direction: 0.293986636971 



In [19]:
from sklearn.naive_bayes import MultinomialNB 

mnb = MultinomialNB()
mnb.fit(X_train, y_train)
evaluate_model(mnb, X_test, y_test)


Accuracy: 0.102449888641
Accuracy for small interval: 0.407572383073
Accuracy for right direction: 0.43429844098 



In [20]:
from sklearn.svm import SVC

svm = SVC()
svm.fit(X_train, y_train)
evaluate_model(svm, X_test, y_test)

Accuracy: 0.233853006682
Accuracy for small interval: 0.438752783964
Accuracy for right direction: 0.233853006682 

