In [1]:
import pickle
import re
from nltk.tokenize import TweetTokenizer
from nltk import ngrams
import numpy as np
from sklearn import svm
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer


In [2]:

# Load the pickled train/test sets. The `with` blocks close each file handle as
# soon as its content has been read, so no descriptor is left open in the kernel.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that come from a trusted source.
with open("train_set.p", "rb") as train_file:
    train_set = np.asarray(pickle.load(train_file))

with open("test_set.p", "rb") as test_file:
    test_set = np.asarray(pickle.load(test_file))


In [3]:
# Split each set column-wise: column 0 is the input later handed to the feature
# extractors, column 1 is the target, cast to NumPy's default integer type.
train, y_train = train_set[:, 0], train_set[:, 1].astype(np.int_)
test, y_test = test_set[:, 0], test_set[:, 1].astype(np.int_)


In [5]:
def get_num_of_hastags(data_set):
    """Count the ``#`` characters in every item of ``data_set``.

    Args:
        data_set: Sized iterable of strings (each item must offer ``.count``).

    Returns:
        A float64 array of shape ``(len(data_set), 1)``; row ``i`` holds the
        number of ``"#"`` occurrences in item ``i``.
    """
    hash_counts = [text.count("#") for text in data_set]
    # Reshape to a single-column matrix so it can be concatenated with other features.
    return np.asarray(hash_counts, dtype=np.float64).reshape(len(data_set), 1)


In [6]:
def get_num_of_mult_punctuation(data_set):
    """Count runs of repeated ``?``/``!`` characters in every item of ``data_set``.

    A run is two or more consecutive characters drawn from ``?`` and ``!``
    (mixed runs such as ``"?!"`` count as well).

    Args:
        data_set: Sized iterable of strings.

    Returns:
        A float64 array of shape ``(len(data_set), 1)``; row ``i`` holds the
        number of such runs found in item ``i``.
    """
    repeated_marks = re.compile(r"[?!]{2,}")
    run_counts = [len(repeated_marks.findall(text)) for text in data_set]
    # The extra axis turns the flat counts into a single feature column.
    return np.asarray(run_counts, dtype=np.float64)[:, np.newaxis]



In [7]:
def get_ngram_vectors(train, test, n):
    """Build dense n-gram count matrices for the train and test texts.

    Texts are tokenized with NLTK's ``TweetTokenizer`` and counted with a
    ``CountVectorizer`` covering every n-gram length from 1 up to ``n``. The
    vocabulary is fitted on ``train`` only and then applied to ``test``.

    Args:
        train: Iterable of training texts used to fit the vocabulary.
        test: Iterable of texts transformed with the fitted vocabulary.
        n: Upper bound of the n-gram range (the lower bound is fixed at 1).

    Returns:
        Tuple ``(train_matrix, test_matrix)`` of dense NumPy arrays.
    """
    tweet_tokenize = TweetTokenizer().tokenize
    count_vectorizer = CountVectorizer(tokenizer=tweet_tokenize, ngram_range=(1, n))

    train_counts = count_vectorizer.fit_transform(train)
    test_counts = count_vectorizer.transform(test)

    # Densify the sparse count matrices before handing them back.
    dense_train = np.asarray(train_counts.toarray())
    dense_test = np.asarray(test_counts.toarray())
    return dense_train, dense_test




In [8]:
def extract_features(train, test, n=3):
    """Assemble the full feature matrices for the train and test texts.

    Each row consists of, in order: the ``#`` count, the count of repeated
    ``?``/``!`` runs, and the n-gram count vector of the text.

    Args:
        train: Training texts; also used to fit the n-gram vocabulary.
        test: Test texts, vectorized with the vocabulary fitted on ``train``.
        n: Largest n-gram length passed to ``get_ngram_vectors``. Defaults to 3.

    Returns:
        Tuple ``(train_features, test_features)`` of 2-D NumPy arrays.
    """
    train_vectors, test_vectors = get_ngram_vectors(train, test, n)

    def _assemble(data_set, ngram_vectors):
        # Stack the two hand-crafted columns in front of the n-gram counts.
        return np.concatenate([get_num_of_hastags(data_set),
                               get_num_of_mult_punctuation(data_set),
                               ngram_vectors], axis=1)

    return _assemble(train, train_vectors), _assemble(test, test_vectors)



In [9]:
def transform_to_pos_neg(np_array):
    """Collapse values to their direction: negative -> -1, positive -> 1, zero -> 0.

    The input is left untouched; the mapping is applied to a copy, so callers
    can keep using their original array afterwards.

    Args:
        np_array: Numeric NumPy array (or array-like) to convert.

    Returns:
        A new array with the same shape and dtype as the input, holding only
        -1, 0 and 1.
    """
    # np.array copies by default, which keeps the caller's data intact.
    directions = np.array(np_array)
    directions[directions < 0] = -1
    directions[directions > 0] = 1

    return directions



In [10]:
def eveluate_model(model, X_test, y_true=None):
    """Print three accuracy figures for ``model`` on ``X_test``.

    Reported figures:
      * exact accuracy,
      * accuracy when a prediction off by exactly one also counts,
      * accuracy of the direction (sign) of the prediction.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_true: True labels for ``X_test``. When omitted, the notebook-level
            ``y_test`` is used.

    Returns:
        None. The figures are printed.
    """
    if y_true is None:
        y_true = y_test
    y_true = np.asarray(y_true)

    prediction = model.predict(X_test)

    acc = metrics.accuracy_score(y_true, prediction)
    print("Accuracy:", acc)

    # A prediction cannot equal y, y + 1 and y - 1 at once, so the three
    # accuracies add up to the share of predictions within +-1 of the label.
    acc_interval = acc + metrics.accuracy_score(y_true + 1, prediction) \
                   + metrics.accuracy_score(y_true - 1, prediction)
    print("Accuracy for small interval:", acc_interval)  # Accuracy if predicted value is +-1

    # Hand copies to the sign transform so that neither the labels nor the
    # predictions are altered; repeated calls then report the same figures.
    acc_dir = metrics.accuracy_score(transform_to_pos_neg(np.array(y_true)),
                                     transform_to_pos_neg(np.array(prediction)))
    print("Accuracy for right direction:", acc_dir)


In [11]:
# Build the feature matrices for the train and test texts.
X_train, X_test = extract_features(train, test)


In [12]:
# Fit a linear SVM on the training features using the library's default settings.
# TODO: implement cross validation
# TODO: normalization?
# TODO: find out about C and "y" (presumably gamma; LinearSVC itself only
#       exposes C) - grid search maybe
# NOTE(review): no random_state is passed, so results may differ between runs -
#       confirm and consider pinning a seed.

lin_svm = svm.LinearSVC()
lin_svm.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [13]:
# Print the accuracy figures of the fitted linear SVM on the test features.
eveluate_model(lin_svm, X_test)

Accuracy: 0.26
Accuracy for small interval: 0.505
Accuracy for right direction: 0.45
