In [1]:
import pickle
import re
from nltk.tokenize import TweetTokenizer
import numpy as np
from sklearn import svm
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV 


In [2]:

# Load the pickled train/test splits and convert them to numpy arrays.
# Each file is opened in a `with` block so its handle is closed
# deterministically instead of being left to the garbage collector.
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling;
# only load files that come from a trusted source.
with open("train_set.p", "rb") as train_file:
    train_set = np.asarray(pickle.load(train_file))
with open("test_set.p", "rb") as test_file:
    test_set = np.asarray(pickle.load(test_file))


In [3]:
# Column 0 of each split is used downstream as the raw tweet text.
train, test = train_set[:, 0], test_set[:, 0]

# Column 1 of each split is the target label, cast to numpy's default integer type.
y_train, y_test = (split[:, 1].astype(np.int_) for split in (train_set, test_set))


In [4]:
def get_num_of_hashtags(data_set):
    """Count the "#" characters in every tweet.

    Args:
        data_set: Sized iterable of strings, one tweet per element.

    Returns:
        Float array of shape (len(data_set), 1); row i holds the number of
        "#" characters found in tweet i.
    """
    hashtag_counts = [text.count("#") for text in data_set]
    # Reshape into a single feature column so it can be concatenated with
    # other per-tweet feature matrices.
    return np.array(hashtag_counts, dtype=np.float64).reshape(len(data_set), 1)


In [5]:
def get_num_of_mult_punctuation(data_set):
    """Count runs of repeated "?"/"!" punctuation in every tweet.

    A run is two or more consecutive characters drawn from "?" and "!"
    (for example "!!", "??" or "?!?"); each maximal run counts once.

    Args:
        data_set: Sized iterable of strings, one tweet per element.

    Returns:
        Float array of shape (len(data_set), 1); row i holds the number of
        such runs in tweet i.
    """
    run_pattern = re.compile(r"[?!]{2,}")
    run_counts = [len(run_pattern.findall(text)) for text in data_set]
    return np.array(run_counts, dtype=np.float64).reshape(len(data_set), 1)



In [6]:
# TODO: also return the fitted vectorizer (train and test are already vectorized separately below)


def get_ngram_vectors(train, test, method="word", upper=3, lower=1):
    """Build dense n-gram count matrices for the train and test tweets.

    The vocabulary is learned from ``train`` only; ``test`` is projected onto
    that same vocabulary, so both results have the same number of columns.

    Args:
        train: Iterable of tweet strings used to fit the vocabulary.
        test: Iterable of tweet strings transformed with the fitted vocabulary.
        method: ``analyzer`` passed to CountVectorizer (e.g. "word" or "char").
        upper: Largest n-gram size (inclusive).
        lower: Smallest n-gram size (inclusive).

    Returns:
        Tuple ``(train_matrix, test_matrix)`` of dense numpy arrays with one
        row per tweet and one column per n-gram.
    """
    vectorizer_kwargs = {"ngram_range": (lower, upper), "analyzer": method}
    # A custom tokenizer is only honoured by the word analyzer; passing it
    # with any other analyzer has no effect and makes scikit-learn emit an
    # "unused parameter" warning, so only supply it when it is actually used.
    if method == "word":
        vectorizer_kwargs["tokenizer"] = TweetTokenizer().tokenize

    vectorizer = CountVectorizer(**vectorizer_kwargs)
    train_vector = vectorizer.fit_transform(train)
    test_vector = vectorizer.transform(test)

    # toarray() already yields a dense ndarray. NOTE: densifying can be very
    # memory hungry for large vocabularies.
    return train_vector.toarray(), test_vector.toarray()




In [7]:
def extract_features(train, test, word_ngram_upper_bound=4,
                     char_ngram_upper_bound=5, char_ngram_lower_bound=3):
    """Assemble the full feature matrices for the train and test tweets.

    Columns, in order: hashtag count, repeated-punctuation count, word
    n-gram counts, character n-gram counts. The n-gram vocabularies are
    fitted on ``train`` only (see ``get_ngram_vectors``).

    Args:
        train: Sized iterable of training tweet strings.
        test: Sized iterable of test tweet strings.
        word_ngram_upper_bound: Largest word n-gram size; the smallest is
            the default of ``get_ngram_vectors``.
        char_ngram_upper_bound: Largest character n-gram size.
        char_ngram_lower_bound: Smallest character n-gram size.

    Returns:
        Tuple ``(train_features, test_features)`` of dense 2-D numpy arrays
        with one row per tweet.
    """
    train_word_vectors, test_word_vectors = get_ngram_vectors(
        train, test, "word", word_ngram_upper_bound)
    train_char_vectors, test_char_vectors = get_ngram_vectors(
        train, test, "char", char_ngram_upper_bound, char_ngram_lower_bound)

    def _assemble(tweets, word_vectors, char_vectors):
        # Single place that fixes the column order so train and test can
        # never drift apart.
        return np.concatenate([get_num_of_hashtags(tweets),
                               get_num_of_mult_punctuation(tweets),
                               word_vectors,
                               char_vectors], axis=1)

    train_features = _assemble(train, train_word_vectors, train_char_vectors)
    test_features = _assemble(test, test_word_vectors, test_char_vectors)

    return train_features, test_features



In [8]:
def transform_to_pos_neg(np_array):
    """Collapse values to their sign: negatives -> -1, positives -> 1, zero -> 0.

    The result is a new array; the input is left untouched. (Overwriting the
    input in place would silently corrupt the caller's data, e.g. the test
    labels handed to an evaluation routine.)

    Args:
        np_array: Numeric array-like.

    Returns:
        numpy array of the same shape and dtype holding -1, 0 or 1.
    """
    return np.sign(np.asarray(np_array))



In [9]:
def evaluate_model(model, X, y):
    """Print three accuracy figures for ``model`` on the data ``(X, y)``.

    Printed metrics:
      * exact accuracy;
      * accuracy when a prediction that is off by exactly one still counts;
      * accuracy of the predicted sign (negative / zero / positive).

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: Integer labels (array-like). It is never modified.

    Returns:
        None; the results are printed.
    """
    y = np.asarray(y)
    prediction = model.predict(X)

    acc = metrics.accuracy_score(y, prediction)
    print("Accuracy:", acc)

    # prediction == y, prediction == y + 1 and prediction == y - 1 are
    # mutually exclusive, so the three accuracies simply add up.
    acc_interval = acc + metrics.accuracy_score(y + 1, prediction) \
                   + metrics.accuracy_score(y - 1, prediction)
    print("Accuracy for small interval:", acc_interval)  # Accuracy if predicted value is +-1

    # Hand copies to transform_to_pos_neg so that the caller's labels (and the
    # prediction array) can never be overwritten by the sign transformation.
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(y.copy()),
                                     transform_to_pos_neg(np.array(prediction)))
    print("Accuracy for right direction:", acc_dir)


In [10]:
# Build the dense feature matrices (hashtag count, repeated-punctuation count,
# word n-grams, character n-grams) for the train and test tweets.
X_train, X_test = extract_features(train, test)


## Training without Grid Search

In [11]:
# TODO: normalization?

# Baseline: linear SVM with scikit-learn's default hyper-parameters, fitted on
# the unscaled count features.
# NOTE(review): no random_state is set, so repeated runs may differ slightly.
lin_svm = svm.LinearSVC()
lin_svm.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [12]:
# Print exact, off-by-one and sign accuracy of the baseline model on the test split.
evaluate_model(lin_svm, X_test, y_test)

Accuracy: 0.285
Accuracy for small interval: 0.54
Accuracy for right direction: 0.495


## Training with Grid Search

In [14]:
# Candidate values for LinearSVC's regularisation parameter C.
cs = [0.001, 0.01, 0.1, 1, 10]
grid = {'C': cs}

# 5-fold cross-validated search over C on the training data; verbose=10 makes
# scikit-learn log every individual fit.
grid_search = GridSearchCV(svm.LinearSVC(), grid, cv=5, verbose=10)
grid_search.fit(X_train, y_train)
# Show the C value that achieved the best mean cross-validation score.
print(grid_search.best_params_)

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] C=0.001 .........................................................


[CV] .......................... C=0.001, score=0.309322, total=   1.0s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV] .......................... C=0.001, score=0.360169, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.3s remaining:    0.0s


[CV] .......................... C=0.001, score=0.340426, total=   0.9s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.5s remaining:    0.0s


[CV] .......................... C=0.001, score=0.354701, total=   0.8s
[CV] C=0.001 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.6s remaining:    0.0s


[CV] .......................... C=0.001, score=0.334764, total=   0.9s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    5.8s remaining:    0.0s


[CV] ........................... C=0.01, score=0.296610, total=   1.3s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    7.3s remaining:    0.0s


[CV] ........................... C=0.01, score=0.381356, total=   1.2s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    8.8s remaining:    0.0s


[CV] ........................... C=0.01, score=0.331915, total=   1.2s
[CV] C=0.01 ..........................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   10.3s remaining:    0.0s


In [42]:
# Print the same three accuracy figures for the grid-searched model on the test split.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above; re-run the notebook top to bottom before trusting its recorded output.
evaluate_model(grid_search, X_test, y_test)


0.14499999999999999