In [186]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import nltk
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [67]:
df = pd.read_csv('data/data_no_tweet.tsv', sep='\t')
df['target_text'] = df.apply(lambda x: x.text + ' [SEP] ' + x.target, axis=1)
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

# building tf-idf matrix

In [68]:
count_vect = CountVectorizer()
word_idx = count_vect.fit_transform(df_train.target_text.values)
word_idx.shape

(4865, 5009)

In [69]:
count_vect.vocabulary_.get('pizza')

3281

In [70]:
tfidf_transformer = TfidfTransformer().fit(word_idx)
train_tfidf = tfidf_transformer.transform(word_idx)
train_tfidf.shape

(4865, 5009)

In [71]:
train_tfidf

<4865x5009 sparse matrix of type '<class 'numpy.float64'>'
	with 81469 stored elements in Compressed Sparse Row format>

In [72]:
# df = df.sample(frac=0.1)

In [10]:
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

In [73]:
X_train = tfidf_transformer.transform(count_vect.transform(df_train.text.values))

# Fitting NB classifier

In [8]:
nb = MultinomialNB().fit(train_tfidf, df_train.sentiment)

In [17]:
X_test = tfidf_transformer.transform(count_vect.transform(df_test.text.values))
predicted = nb.predict(X_test)

In [18]:
accuracy_score(df_test.sentiment, predicted), f1_score(df_test.sentiment, predicted, average='macro')

(0.7345932621199671, 0.508865100630436)

In [19]:
confusion_matrix(df_test.sentiment, predicted)

array([[ 89,   2, 171],
       [ 31,  21, 103],
       [ 13,   3, 784]], dtype=int64)

In [14]:
predicted = nb.predict(train_tfidf)

In [15]:
accuracy_score(df_train.sentiment, predicted), f1_score(df_train.sentiment, predicted, average='macro')

(0.7677286742034943, 0.6070861723998013)

In [20]:
df.sentiment.value_counts()

 1    3816
-1    1464
 0     802
Name: sentiment, dtype: int64

# SVM

In [111]:
sgd = SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-4, random_state=42,
                          max_iter=1000, tol=None)

In [112]:
sgd.fit(train_tfidf, df_train.sentiment)

SGDClassifier(random_state=42, tol=None)

In [113]:
X_test = tfidf_transformer.transform(count_vect.transform(df_test.target_text.values))
predicted = sgd.predict(X_test)

In [114]:
accuracy_score(df_test.sentiment, predicted), f1_score(df_test.sentiment, predicted, average='macro')

(0.7830731306491372, 0.6742810967187066)

In [79]:
confusion_matrix(df_test.sentiment, predicted)

array([[165,  20,  77],
       [ 39,  71,  45],
       [ 49,  34, 717]], dtype=int64)

In [80]:
predicted = sgd.predict(train_tfidf)

In [81]:
accuracy_score(df_train.sentiment, predicted), f1_score(df_train.sentiment, predicted, average='macro')

(0.9085303186022611, 0.8661016725465255)

In [40]:
svm = SVC(kernel='rbf', C=10).fit(train_tfidf, df_train.sentiment)

In [41]:
predicted = svm.predict(X_test)
accuracy_score(df_test.sentiment, predicted), f1_score(df_test.sentiment, predicted, average='macro')

(0.7584223500410846, 0.6341398506885992)

# Extracting the sentence containing the target

In [88]:
def get_sent(row):
    sents = nltk.sent_tokenize(row.text)
    for sent in sents:
        if row.target.lower() in sent.lower():
            return sent
    return row.text

df['sent'] = df.apply(get_sent, axis=1)

In [92]:
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

In [93]:
sgd = SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-4, random_state=42,
                          max_iter=1000, tol=None)

sgd.fit(train_tfidf, df_train.sentiment)

X_test = tfidf_transformer.transform(count_vect.transform(df_test.sent.values))
predicted = sgd.predict(X_test)

accuracy_score(df_test.sentiment, predicted), f1_score(df_test.sentiment, predicted, average='macro')

(0.7871815940838126, 0.6805891682930723)

In [94]:
confusion_matrix(df_test.sentiment, predicted)

array([[171,  17,  74],
       [ 40,  71,  44],
       [ 48,  36, 716]], dtype=int64)

# Fine tune   

In [2]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

C:\Users\wojtek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
C:\Users\wojtek\anaconda3\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll


In [211]:
def prepare_data_splits(df, vectorizer_params):
    df_train, df_test = train_test_split(
      df,
      test_size=0.2,
      random_state=42
    )

    count_vect = CountVectorizer(**vectorizer_params)
    word_idx = count_vect.fit_transform(df_train.target_text.values)

    tfidf_transformer = TfidfTransformer().fit(word_idx)

    x_train = tfidf_transformer.transform(word_idx)
    x_test = tfidf_transformer.transform(count_vect.transform(df_test.target_text.values))
    
    return x_train, x_test, df_train.sentiment, df_test.sentiment, count_vect, tfidf_transformer

In [208]:
def hyperopt_search(x_train, y_train, search_space):
    def objective(params):
        sgd = SGDClassifier(max_iter=1000, **params)
        kfold = StratifiedKFold(n_splits=5, shuffle=True)
        neg_acc = -cross_val_score(sgd, x_train, y_train, cv=kfold, scoring='accuracy', verbose=False).mean() 
        return {'loss': neg_acc, 'status': STATUS_OK}
    
    loss = ['hinge', 'log', 'perceptron', 'modified_huber']
    penalty = ['l1', 'l2']

    search_space = {
        'loss': hp.choice('loss', loss),
        'alpha': hp.loguniform('alpha', -8, -1),
        'penalty': hp.choice('penalty', penalty),
        'tol': hp.loguniform('tol', -4, 0)
    }

    best_result = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=1000,
        trials=Trials()
    )

    best_result['loss'] = loss[best_result['loss']]
    best_result['penalty'] = penalty[best_result['penalty']]

    return best_result

In [202]:
params = hyperopt_search(x_train, df_train.sentiment, search_space)

100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:58<00:00, 17.03trial/s, best loss: -0.7882836587872559]


In [203]:
params

{'alpha': 0.00033562385583315647,
 'loss': 'modified_huber',
 'penalty': 'l2',
 'tol': 0.07672722709490018}

In [212]:
def train_best_model(x_train, y_train, x_test, y_test, params):
    sgd = SGDClassifier(max_iter=1000, **params)
    sgd.fit(x_train, y_train)
    predicted = sgd.predict(x_test)

    return accuracy_score(y_test, predicted), f1_score(y_test, predicted, average='macro')

In [205]:
train_best_model(x_train, df_train.sentiment, x_test, df_test.sentiment, params)

(0.7888249794576828, 0.6851497684823272)

In [216]:
vectorizer_params = dict(ngram_range=(1, 2), max_df=0.8)
x_train, x_test, y_train, y_test, _, _ = prepare_data_splits(df, vectorizer_params)
params = hyperopt_search(x_train, y_train, search_space)
train_best_model(x_train, y_train, x_test, y_test, params)

100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:27<00:00, 11.39trial/s, best loss: -0.7843782117163413]


(0.7871815940838126, 0.6812598597825724)

In [217]:
vectorizer_params = dict(ngram_range=(2, 2), max_df=0.8)
x_train, x_test, y_train, y_test, _, _ = prepare_data_splits(df, vectorizer_params)
params = hyperopt_search(x_train, y_train, search_space)
train_best_model(x_train, y_train, x_test, y_test, params)

100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:04<00:00, 15.62trial/s, best loss: -0.7683453237410072]


(0.7551355792933443, 0.6593593665186033)

In [218]:
vectorizer_params = dict(ngram_range=(1, 3), max_df=0.8)
x_train, x_test, y_train, y_test, _, _ = prepare_data_splits(df, vectorizer_params)
params = hyperopt_search(x_train, y_train, search_space)
train_best_model(x_train, y_train, x_test, y_test, params)

100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [01:43<00:00,  9.67trial/s, best loss: -0.7757451181911614]


(0.7691043549712407, 0.6608638355645332)

In [219]:
vectorizer_params = dict(ngram_range=(1, 1), max_df=0.8)
x_train, x_test, y_train, y_test, _, _ = prepare_data_splits(df, vectorizer_params)
params = hyperopt_search(x_train, y_train, search_space)
train_best_model(x_train, y_train, x_test, y_test, params)

100%|████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:57<00:00, 17.36trial/s, best loss: -0.7868448098663926]


(0.7822514379622021, 0.6697735030238526)

In [214]:
params

{'alpha': 0.0005918613003051993,
 'loss': 'modified_huber',
 'penalty': 'l2',
 'tol': 0.3204223406108451}