In [1]:
import os
import sys

sys.path.append('../')

from experiments.microblog_deep import *
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVR, LinearSVR

import numpy as np
import pandas as pd

Using TensorFlow backend.


..


In [2]:
# Names of the pre-computed feature sets to use. The loader output of the
# data-loading cell below reports exactly these three being read, in this order.
config.features_to_use_mb = ['google_word_emb', 'unigram', 'bigram']

### Features in use:
* `'google_word_emb'`: average of the Google word-embedding vectors
* `'unigram'`: one-hot encoding of words
* `'bigram'`: one-hot encoding of 2-gram words

### Features to try:
* `rf_unigram`, `rf_bigram`: unigram and bigram features scaled by the weight
$$w = \max\big(\ln(2 + \frac{c_+}{\max(1, c_-)}), \ln(2 + \frac{c_-}{\max(1, c_+)})\big)$$

In [3]:
# Load the headline ('hl') data: the sequence matrix X (fed to the neural
# model), the concatenated feature matrix (fed to the SVR) and the target
# scores y.
# Both pickle paths are built with os.path.join so they resolve correctly
# whether or not the configured directory ends with a path separator.
X = joblib.load(os.path.join(config.DUMPED_VECTOR_DIR_HL, 'hl_sequences.pkl'))
print('shape of X:', np.shape(X))
features, fmap = get_features('hl')
print('shape of features:', np.shape(features), end='\n\n')
y = joblib.load(os.path.join(config.DATA_DIR, 'vectors_hl_new', 'hl_scores.pkl'))
print('shape of y:', np.shape(y))

# The train/test split slices all three objects by the same row positions, so
# they must describe the same samples; fail early if the row counts disagree.
row_counts = [np.shape(arr)[0] for arr in (X, features, y)]
assert len(set(row_counts)) == 1, 'X, features and y have different row counts: %s' % row_counts

shape of X: (1633, 18)

---------------------------------------------
Loading google_word_emb from  /home/niyan/SemEval17-05-kar/data/vectors_hl_new/hl_google_word_emb.pkl
Shape =  (1633, 300), type = <class 'numpy.ndarray'>
---------------------------------------------

---------------------------------------------
Loading unigram from  /home/niyan/SemEval17-05-kar/data/vectors_hl_new/hl_unigram.pkl
Shape =  (1633, 2236), type = <class 'numpy.ndarray'>
---------------------------------------------

---------------------------------------------
Loading bigram from  /home/niyan/SemEval17-05-kar/data/vectors_hl_new/hl_bigram.pkl
Shape =  (1633, 6651), type = <class 'numpy.ndarray'>
---------------------------------------------
shape of features: (1633, 9187)

shape of y: (1633, 1)


In [4]:
# Positional split: the first n_train rows are used for training and the
# remaining rows for testing.
# NOTE(review): every *_dev name is bound to the complete data set (train and
# test rows included), not to a separate hold-out -- confirm this is intended.
n_train = 1142
X_train, X_test = X[:n_train], X[n_train:]
Y_train, Y_test = y[:n_train], y[n_train:]
ft_train, ft_test = features[:n_train], features[n_train:]
X_dev, Y_dev, ft_dev = X, y, features

In [5]:
def cos_scorer(y, y_pred):
    """Cosine similarity between the target vector and the prediction vector.

    Both inputs are flattened into a single row of shape (1, n) before being
    handed to ``cosine_similarity`` (a name provided by the star imports,
    presumably scikit-learn's pairwise implementation -- TODO confirm).

    Args:
        y: true target values, any shape that reshapes to one row.
        y_pred: predicted values with the same number of elements as ``y``.

    Returns:
        Whatever ``cosine_similarity`` returns for two single-row inputs; the
        recorded runs in this notebook print it as a 1x1 array ``[[score]]``.
    """
    truth_row = np.reshape(y, (1, -1))
    predicted_row = np.reshape(y_pred, (1, -1))
    return cosine_similarity(truth_row, predicted_row)

### experiment for SVM model:

In [None]:
# Randomised hyper-parameter search for SVR, scored by cosine similarity.
# The grid holds 6 * 6 * 9 = 324 combinations; n_iter=200 evaluates a random
# subset of them with 6-fold cross-validation.
p_grid = {
    'C': [0.5, 1.0, 2.0, 5.0, 10.0, 20.0],
    'epsilon': [1e-3, 5e-3, 1e-2, 5e-2, 0.1, 0.5],
    'gamma': [1e-4, 5e-4, 1e-3, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0],
}
model = SVR()
best_model = RandomizedSearchCV(
    model,
    param_distributions=p_grid,
    n_iter=200,
    cv=6,
    scoring=make_scorer(cos_scorer),
    # Fixed seed so the same candidates are sampled on every
    # Restart-and-Run-All; without it the reported best parameters can differ
    # between runs.
    random_state=0,
    n_jobs=6,
    verbose=1,
)
# Y_train is a column vector of shape (n, 1); SVR expects a 1-D target and
# would otherwise emit a DataConversionWarning on every single fit.
_ = best_model.fit(ft_train, np.ravel(Y_train))

In [None]:
# Report the winning hyper-parameters and their mean cross-validated score,
# then show every sampled candidate ordered by mean test score (ascending, so
# the best candidates are at the bottom of the table).
print("best param: ", best_model.best_params_)
print("best score: ", best_model.best_score_)
cv_table = pd.DataFrame(best_model.cv_results_)
cv_table.sort_values(by='mean_test_score')

* SVM Result: 
    * best param:  `{'C': 2.0, 'gamma': 0.1, 'epsilon': 0.01}`
    * best cv score:  0.683590272578
    * test score: 0.695948522005

In [None]:
# Score the tuned SVR on the held-out test rows. cos_scorer performs the same
# reshape-to-one-row + cosine_similarity computation and returns a 1x1 array.
Y_pred = best_model.predict(ft_test)
score_ts = cos_scorer(Y_test, Y_pred)
print("cosine score on test data: ", score_ts[0][0])

### experiment for Deep Learning Model:

In [6]:
from experiments.headline_deep import *
from keras.wrappers.scikit_learn import KerasRegressor

In [7]:
def nn_model():
    """Build and compile the convolutional regressor for the headline data.

    Layers, in order: a frozen ``Embedding`` initialised from the pickled
    matrix ``hl_voc_embeddings_prs.pkl`` found under
    ``config.DUMPED_VECTOR_DIR``, a width-3 ``Conv1D`` with 256 filters, max
    pooling, flatten, dropout, a 50-unit tanh ``Dense`` layer, dropout, and a
    single tanh output unit. The model is compiled with mean-squared-error
    loss and the Adam optimizer.

    The function takes no arguments because it is used as a ``build_fn``; it
    relies on names defined outside this cell (``config``, ``joblib``,
    ``max_features``, ``embedding_dims``, ``max_len`` and the Keras classes),
    presumably supplied by the star imports.

    Returns:
        The compiled ``Sequential`` model.
    """
    # os.path.join works whether or not the configured directory ends with a
    # path separator.
    embedding_path = os.path.join(config.DUMPED_VECTOR_DIR, 'hl_voc_embeddings_prs.pkl')
    embedding_weights = joblib.load(embedding_path)
    print(embedding_weights.shape)

    model = Sequential()
    # Pre-trained embeddings are kept fixed during training.
    model.add(Embedding(max_features,
                        embedding_dims,
                        input_length=max_len,
                        weights=[embedding_weights],
                        trainable=False))
    model.add(Conv1D(256, 3, activation='relu'))
    # In the recorded run the convolution output has 16 time steps, so a pool
    # of 16 collapses the sequence to a single step (one max per filter).
    model.add(MaxPooling1D(pool_size=16))
    model.add(Flatten())
    model.add(Dropout(rate=0.8))
    model.add(Dense(50, activation='tanh'))
    model.add(Dropout(rate=0.5))
    # Keras 2 spelling of the initializer argument; `init` is the Keras 1 name.
    model.add(Dense(1, kernel_initializer='normal', activation='tanh'))

    # Alternatives tried earlier and left disabled: a 'cosine_proximity' loss
    # with rmsprop, and a custom compile_cos_sim_theano loss/metric with adam.
    model.compile(loss='mean_squared_error', optimizer='adam')

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would add a stray "None" line to the output.
    model.summary()

    return model

In [14]:
# Point the generic vector directory at the headline dump, which is where
# nn_model() looks for its embedding matrix, then wrap the model builder in a
# scikit-learn style regressor.
config.DUMPED_VECTOR_DIR = config.DUMPED_VECTOR_DIR_HL
model = KerasRegressor(
    build_fn=nn_model,
    epochs=100,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [15]:
# Train the network on the training sequences, then report the cosine score
# on the training rows and on the held-out test rows.
model.fit(X_train, Y_train)
train_score = cos_scorer(Y_train, model.predict(X_train))
print('Train Score: ', train_score)
test_score = cos_scorer(Y_test, model.predict(X_test))
print('Test Score: ', test_score)

(3301, 300)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 18, 300)           990300    
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 16, 256)           230656    
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 1, 256)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 256)               0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 50)                12850     
_________________________________________________________________
dropout_8 (Dropout)          (None, 50)                0        

Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train Score:  [[ 0.96526624]]
Test Score:  [[ 0.73031797]]
