In [1]:
import pickle
import re
import numpy as np
from nltk.tokenize import TweetTokenizer
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFECV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

from nltk.data import load
from collections import Counter


In [2]:
train = np.asarray(pickle.load(open("train_set", "rb")))
test = np.asarray(pickle.load(open("test_set", "rb")))

print(train.shape, test.shape)

(1181,) (449,)


In [3]:
# Extracting valence from the dataset

y_train = np.asarray([int(tweet.valence) for tweet in train])
y_test = np.asarray([int(tweet.valence) for tweet in test])


## Feature Generation


In [4]:
def get_num_of_hashtags(data_set):
    # This oddly shape array is needed, because the features need to be in a column-like shape
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = np.asarray(len(tweet.hashtags))

    return X


In [5]:
def get_num_of_mult_punctuation(data_set):
    # multiple punctuation means sth. like ../!!!
    X = np.zeros((len(data_set), 1))

    for i, tweet in enumerate(data_set):
        X[i, 0] = len(re.findall(r"[?!]{2,}", tweet.raw))
        
    return X


In [6]:
def get_num_pos_tags(train, test):
    # First find out which POS tags are possible
    vocabulary = list(load('help/tagsets/upenn_tagset.pickle'))
    pos_tags_train = [tweet.pos_tags for tweet in train]
    pos_tags_test = [tweet.pos_tags for tweet in test]

    # CountVectorizer is used to create a vector for each tweet. Each number in this vector 
    # represents the number of occurrences for a specific POS tag.
    # All those vectors have the same length, which is needed to use them for the SVM.
    vectorizer = CountVectorizer(vocabulary=vocabulary, tokenizer=lambda doc: doc, lowercase=False)

    train_vector = vectorizer.transform(pos_tags_train)
    test_vector = vectorizer.transform(pos_tags_test)
   
    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())



In [7]:
def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    # This method either extracts word (method="word)" or char ngrams (method="char").
    # Again, they need to be transformed into one hot encoded vectors
    vectorizer = CountVectorizer(ngram_range=(lower, upper), tokenizer=TweetTokenizer().tokenize, 
                                 analyzer=method)
    
    raw_train = [tweet.raw for tweet in train]
    raw_test = [tweet.raw for tweet in test]
    
    train_vector = vectorizer.fit_transform(raw_train)
    test_vector = vectorizer.transform(raw_test)

    return np.asarray(train_vector.toarray()), np.asarray(test_vector.toarray())


In [8]:
from sklearn.preprocessing import normalize

def extract_features(train, test):
    word_ngram_upper_bound = 4
    char_ngram_upper_bound = 5
    char_ngram_lower_bound = 3

    pos_vec_train, pos_vec_test = get_num_pos_tags(train, test)
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)
    
    print(pos_vec_train.shape, train_word_vectors.shape, train_char_vectors.shape)

    # Put all the features together. Each feature is represented as a column in the resulting array
    train_features = np.concatenate([get_num_of_hashtags(train),
                                     get_num_of_mult_punctuation(train),
                                     pos_vec_train,
                                     train_word_vectors,
                                     train_char_vectors], axis=1)

    test_features = np.concatenate([get_num_of_hashtags(test),
                                    get_num_of_mult_punctuation(test),
                                    test_word_vectors,
                                    test_char_vectors,
                                    pos_vec_test], axis=1)

    return train_features, test_features


In [9]:
# Extract non normalized features
X_train_nn, X_test_nn = extract_features(train, test)
X_train = normalize(X_train_nn)
X_test = normalize(X_test_nn)

(1181, 45) (1181, 41531) (1181, 79875)


## Baseline

In [10]:
def transform_to_pos_neg(np_array):
    np_array[np_array < 0] = -1
    np_array[np_array > 0] = 1
    
    return np_array


In [11]:
from collections import Counter
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

results = []

def evaluate_model(model, X, y):
    # To make sure that the original y is not modified when applying the transform_to... function
    y_copy = np.copy(y)  
    prediction = model.predict(X)
    
    print("Real values:", Counter(y))
    print("Predicted values:", Counter(prediction), "\n")

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)
    
    precision = precision_score(y, prediction, average="weighted")
    recall = recall_score(y, prediction, average="weighted")
    f1 = f1_score(y, prediction, average="weighted")

    # Accuracy if predicted value is +-1
    acc_interval = acc + metrics.accuracy_score(y_copy + 1, prediction) \
                   + metrics.accuracy_score(y_copy - 1, prediction)
    print("Accuracy for small interval:", acc_interval)

    # checks if at least the predicted direction is the right one
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y_copy), transform_to_pos_neg(prediction))
    print("Accuracy for right direction:", acc_dir, "\n")
    
    print("Classwise evaluation:")
    print(classification_report(y, prediction))
    
    class_scores = np.asarray(precision_recall_fscore_support(y, prediction))
    
    results.append([acc, acc_interval, acc_dir, precision, recall, f1])


In [12]:
def fit_and_eval_svm(X_train, y_train, X_test, y_test):
    lin_svm = svm.LinearSVC()
    lin_svm.fit(X_train, y_train)
    evaluate_model(lin_svm, X_test, y_test)

In [13]:
fit_and_eval_svm(X_train, y_train, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 317, 0: 124, 3: 8}) 

Accuracy: 0.238307349666
Accuracy for small interval: 0.45434298441
Accuracy for right direction: 0.400890868597 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.07      0.65      0.13        34
          0       0.28      0.33      0.31       105
          1       0.00      0.00      0.00        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.07      0.13      0.08       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Non Linear SVM (comparison)

In [14]:
non_lin_svm = SVC()
non_lin_svm.fit(X_train, y_train)
evaluate_model(non_lin_svm, X_test, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 449}) 

Accuracy: 0.233853006682
Accuracy for small interval: 0.438752783964
Accuracy for right direction: 0.233853006682 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.23      1.00      0.38       105
          1       0.00      0.00      0.00        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.05      0.23      0.09       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Naive Bayes (comparison)

In [15]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_nn, y_train)
evaluate_model(gnb, X_test_nn, y_test)

Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 308, -2: 96, 3: 45}) 

Accuracy: 0.216035634744
Accuracy for small interval: 0.412026726058
Accuracy for right direction: 0.293986636971 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.06      0.18      0.09        34
          0       0.23      0.68      0.34       105
          1       0.18      0.14      0.16        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.08      0.19      0.11       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [16]:
from sklearn.naive_bayes import MultinomialNB 

mnb = MultinomialNB()
mnb.fit(X_train_nn, y_train)
evaluate_model(mnb, X_test_nn, y_test)


Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-1: 158, 2: 147, 3: 64, -3: 42, -2: 21, 1: 17}) 

Accuracy: 0.102449888641
Accuracy for small interval: 0.407572383073
Accuracy for right direction: 0.43429844098 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.08      0.50      0.13        34
          0       0.00      0.00      0.00       105
          1       0.14      0.55      0.22        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.02      0.11      0.04       449



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


## Training with Grid Search

In [None]:
# Determining the hyperparameter C through grid search with cross validation
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
print(grid_search.best_params_)
evaluate_model(grid_search, X_test, y_test)


## Feature Selection


### Variance Threshold

In [17]:
# Variance threshold removes all the features which variance is smaller than a certain percentage
thresholds = [0, 0.01, 0.05, 0.1, 0.5]

for threshold in thresholds:
    print("Threshhold:", threshold)
    
    variance_selector = VarianceThreshold(threshold)
    X_train_selected = normalize(variance_selector.fit_transform(X_train))
    X_test_selected = normalize(variance_selector.transform(X_test))
    
    # Returns an array which indicates if number of hastags/number of multiple punctuation was selected as features
    print(variance_selector.get_support()[0:2])
    
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


Threshhold: 0
[ True  True]
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({-2: 317, 0: 124, 3: 8}) 

Accuracy: 0.238307349666


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy for small interval: 0.45434298441
Accuracy for right direction: 0.400890868597 

Classwise evaluation:


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.07      0.65      0.13        34
          0       0.28      0.33      0.31       105
          1       0.00      0.00      0.00        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.07      0.13      0.08       449

Threshhold: 0.01
[False False]
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 449}) 

Accuracy: 0.233853006682
Accuracy for small interval: 0.438752783964
Accuracy for right direction: 0.233853006682 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.23  

ValueError: No feature in X meets the variance threshold 0.05000

### Selection of kBest

In [22]:
# Uses chi square to find significance of features and only selects the top k features
ks = [10, 50, 100, 200, 500, 1000, 5000]

for k in ks:
    print("k:", k)
    
    kbest_selector = SelectKBest(chi2, k=k)
    X_train_selected = kbest_selector.fit_transform(X_train, y_train)
    X_test_selected = kbest_selector.transform(X_test)
    
    support = kbest_selector.get_support()
    print("Num hashtags:", support[0])
    print("Num mult punct:", support[1])
    print("Pos tags", Counter(support[2:48]))
    print("Word vectors", Counter(support[48:41580]))
    print("Char vectors", Counter(support[41580:]))
    
    fit_and_eval_svm(X_train_selected, y_train, X_test_selected, y_test)


k: 10
Num ashtags: True
Num mult punct: False
Pos tags Counter({False: 46})
Word vectors Counter({False: 41532})
Char vectors Counter({False: 79864, True: 9})
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 384, 3: 65}) 

Accuracy: 0.207126948775
Accuracy for small interval: 0.398663697105
Accuracy for right direction: 0.236080178174

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


 

Classwise evaluation:
             precision    recall  f1-score   support

         -3       0.00      0.00      0.00        69
         -2       0.00      0.00      0.00        95
         -1       0.00      0.00      0.00        34
          0       0.21      0.78      0.34       105
          1       0.11      0.12      0.11        58
          2       0.00      0.00      0.00        35
          3       0.00      0.00      0.00        53

avg / total       0.06      0.20      0.09       449

k: 50
Num ashtags: True
Num mult punct: False
Pos tags Counter({False: 45, True: 1})
Word vectors Counter({False: 41530, True: 2})
Char vectors Counter({False: 79827, True: 46})
Real values: Counter({0: 105, -2: 95, -3: 69, 1: 58, 3: 53, 2: 35, -1: 34})
Predicted values: Counter({0: 416, 3: 18, -2: 15}) 

Accuracy: 0.218262806236
Accuracy for small interval: 0.420935412027
Accuracy for right direction: 0.227171492205 

Classwise evaluation:
             precision    recall  f1-score   suppo

In [None]:
print(results)

### Recursive Feature Elimination with Cross Validation


In [None]:
# Another form of feature selection

selector = RFECV(svm.LinearSVC(), step=500, cv=5, verbose=10)
selector.fit(X_train, y_train)
evaluate_model(selector, X_test, y_test)
