In [1]:
# Load IPython's autoreload extension (configured in the next cell).
%load_ext autoreload

In [2]:
# Mode 2: re-import modules before running each cell, so edits to the
# project's .py files are picked up without restarting the kernel.
%autoreload 2

In [6]:
import functools
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfVectorizer

In [19]:
import stats
import senticnet
import features

In [5]:
import read_data
# Load every dataset at once. The rest of the notebook uses `data` as a mapping:
# it iterates `data.keys()` and reads `data[name]['text']` / `data[name]['label']`.
# NOTE(review): presumably a dict of pandas DataFrames keyed by dataset name — confirm in read_data.py.
data = read_data.read_all_datasets()

# Extract features

### Load some resources first

In [80]:
# load some things before, plz
affectivespace, affectivespace_vocab = senticnet.load_affectivespace()
senticnet5, senticnet5_vocab, senticnet5_full = senticnet.load_senticnet()


feats = {data_name: dict() for data_name in data.keys()}

In [20]:
# Load the embedding model; it is later passed as `embedding_model` to
# features.simon_feat_extractor.
emb = features.load_embeddings()

In [83]:
# Per-dataset intermediate resources, each keyed by dataset name.
# Every comprehension keeps its own progress bar over the datasets.

# n-grams extracted from the raw text, with range (1, 5).
ngrams = {
    name: senticnet.extract_ngrams(data[name]['text'], (1,5))
    for name in tqdm_notebook(data.keys())
}

# Lexicon generated from the dataset itself.
custom_lexicon = {
    name: features.generate_custom_lexicon(data[name])
    for name in tqdm_notebook(data.keys())
}

# Lexicon generated from the dataset together with the full SenticNet 5 resource.
senticnet5_lexicon = {
    name: features.generate_senticnet5_lexicon(data[name], senticnet5_full)
    for name in tqdm_notebook(data.keys())
}

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




### Extract the features

In [86]:
# Base feature sets. Each one is stored as feats[dataset][feature_name];
# one progress bar per feature set, in the order the sets are computed.

# AffectiveSpace features from the dataset's n-grams.
for data_name in tqdm_notebook(data.keys()):
    dataset_ngrams = ngrams[data_name]
    feats[data_name]['affectivespace'] = senticnet.extract_affectivespace_features(
        dataset_ngrams,
        affectivespace=affectivespace,
        affectivespace_vocab=affectivespace_vocab,
    )

# SenticNet 5 features from the same n-grams.
for data_name in tqdm_notebook(data.keys()):
    dataset_ngrams = ngrams[data_name]
    feats[data_name]['senticnet5'] = senticnet.extract_senticnet_features(
        dataset_ngrams,
        senticnet=senticnet5,
        senticnet_vocab=senticnet5_vocab,
    )

# TF-IDF over unigrams, capped at 5000 terms; a fresh vectorizer is fitted per dataset.
for data_name in tqdm_notebook(data.keys()):
    texts = data[data_name]['text'].values
    tfidf = TfidfVectorizer(ngram_range=(1,1), max_features=5000)
    feats[data_name]['tfidf'] = tfidf.fit_transform(texts)

# SIMON features, once with the custom lexicon and once with the senticnet5
# lexicon; both variants share every other extractor setting.
simon_variants = (
    ('simon-custom', custom_lexicon),
    ('simon-senticnet5', senticnet5_lexicon),
)
for simon_name, simon_lexicons in simon_variants:
    for data_name in tqdm_notebook(data.keys()):
        feats[data_name][simon_name] = features.simon_feat_extractor(
            dataset=data[data_name],
            lexicon=simon_lexicons[data_name],
            embedding_model=emb,
            n_lexicon_words=200,
            percentage=100,
        )

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

  f = msb / msw





HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




In [88]:
# Combined feature sets. Each name lists, joined by '+', the base feature sets
# that are stacked column-wise in exactly that order. The order of this tuple is
# also the order in which the combined keys are added to feats[dataset].
COMBINED_FEATURE_NAMES = (
    'affectivespace+senticnet5',
    'tfidf+affectivespace',
    'tfidf+senticnet5',
    'tfidf+affectivespace+senticnet5',
    'simon-custom+affectivespace',
    'simon-custom+senticnet5',
    'simon-custom+affectivespace+senticnet5',
    'simon-senticnet5+affectivespace',
    'simon-senticnet5+senticnet5',
    'simon-senticnet5+affectivespace+senticnet5',
)


def as_dense(block):
    """Return `block` unchanged, or its dense array if it exposes `.toarray()`.

    The TF-IDF features are produced by `TfidfVectorizer.fit_transform` and must
    be densified before `np.concatenate` can stack them with the other blocks.
    """
    return block.toarray() if hasattr(block, 'toarray') else block


for data_name in tqdm_notebook(data.keys()):
    dataset_feats = feats[data_name]
    # Cache of densified base blocks so that e.g. tfidf is converted only once
    # per dataset instead of once per combination that uses it.
    dense_blocks = {}
    for combined_name in COMBINED_FEATURE_NAMES:
        part_names = combined_name.split('+')
        for part_name in part_names:
            if part_name not in dense_blocks:
                dense_blocks[part_name] = as_dense(dataset_feats[part_name])
        dataset_feats[combined_name] = np.concatenate(
            [dense_blocks[part_name] for part_name in part_names],
            axis=1,
        )


HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




# Evaluate

In [None]:
# NOTE(review): star import; this cell relies on it for select_classifier,
# experiment_code and predict_evaluate — confirm nothing else is shadowed.
from evaluation import *

classifiers = ('LogR', 'LinSVM')

# results[experiment code] = whatever predict_evaluate returns for that
# (dataset, feature set, classifier) combination.
results = {}
for data_name in tqdm_notebook(data.keys(), desc='dataset'):
    # The labels depend only on the dataset, so fetch them once per dataset.
    y = data[data_name]['label'].values
    for clf_name in tqdm_notebook(classifiers, desc='clf'):
        clf = select_classifier(clf_name=clf_name)
        for feats_name, X in feats[data_name].items():
            exp_code = experiment_code(data_name, feats_name, clf_name)
            results[exp_code] = predict_evaluate(clf, X, y)

HBox(children=(IntProgress(value=0, description='dataset', max=5, style=ProgressStyle(description_width='initi…

HBox(children=(IntProgress(value=0, description='clf', max=2, style=ProgressStyle(description_width='initial')…





HBox(children=(IntProgress(value=0, description='clf', max=2, style=ProgressStyle(description_width='initial')…





In [None]:
# Hand the collected evaluation results to save_results.
# NOTE(review): save_results is not defined in this notebook; presumably it is
# provided by `from evaluation import *` — confirm, and check where it writes.
save_results(results)

In [118]:
# Row order of the summary table: base feature sets first, then combinations.
features_order = (
    'tfidf', 'affectivespace', 'senticnet5', 'simon-custom', 'simon-senticnet5',
    'affectivespace+senticnet5', 'tfidf+affectivespace', 'tfidf+senticnet5',
    'simon-custom+affectivespace', 'simon-custom+senticnet5', 'simon-custom+affectivespace+senticnet5', 
    'simon-senticnet5+affectivespace', 'simon-senticnet5+senticnet5',
    'tfidf+affectivespace+senticnet5', 'simon-senticnet5+affectivespace+senticnet5',
)

def get_f1score(results, key):
    """Return the weighted-average F1 score stored for one experiment.

    Parameters
    ----------
    results : mapping
        Maps an experiment code to a table that can be indexed with
        ``.loc['weighted avg', 'f1-score']``.
    key : str
        Experiment code to look up.

    Raises
    ------
    KeyError
        If `key` is not present in `results`.
    """
    return results[key].loc['weighted avg', 'f1-score']

# These datasets keep the leading column positions; every other dataset found in
# `data` follows in `data` order. Declaring all columns up front (as floats)
# avoids relying on `.loc` to add missing columns while the table is filled.
leading_datasets = ('pro-neu', 'pro-anti', 'magazines')
dataset_columns = list(leading_datasets) + [
    name for name in data.keys() if name not in leading_datasets
]

# One row per (classifier, feature set) pair.
iterables = [list(classifiers), list(features_order)]
mi = pd.MultiIndex.from_product(iterables, names=['classifier', 'features'])
view = pd.DataFrame(index=mi, columns=dataset_columns, dtype=float)

for data_name in data.keys():
    print(data_name)
    for clf_name in classifiers:
        for feats_name in features_order:
            # Must match the keys under which `results` was filled.
            exp_code = "{}_{}_{}".format(data_name, feats_name, clf_name)
            view.loc[(clf_name, feats_name), data_name] = get_f1score(results, exp_code)
view

semeval19hate
davidson
pro-neu
magazines
pro-anti


Unnamed: 0_level_0,Unnamed: 1_level_0,pro-neu,pro-anti,magazines,semeval19hate,davidson
classifier,features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
LogR,tfidf,86.61,84.63,88.89,75.4,89.33
LogR,affectivespace,87.05,72.53,67.52,64.49,81.45
LogR,senticnet5,70.98,71.64,70.3,65.15,80.61
LogR,simon-custom,97.77,86.66,94.02,72.74,90.54
LogR,simon-senticnet5,96.43,82.77,88.25,71.23,89.9
LogR,affectivespace+senticnet5,70.98,72.0,70.3,67.31,83.19
LogR,tfidf+affectivespace,87.05,84.81,89.1,75.65,89.48
LogR,tfidf+senticnet5,90.62,86.75,86.75,75.07,89.02
LogR,simon-custom+affectivespace,97.77,86.66,94.02,72.95,90.49
LogR,simon-custom+senticnet5,97.32,86.66,94.23,72.86,90.51


In [119]:
# Turn the (classifier, features) index levels into ordinary columns; the next
# cell passes this frame to stats.main.
x = view.reset_index()
# Optional filter (disabled): drop every row whose feature set contains 'simon-custom'.
# x = x[~x['features'].str.contains('simon-custom')]

In [120]:
# Run the project's stats.main on the flattened results table. Only the second
# return value is kept and displayed; the first is discarded.
# NOTE(review): judging by the printed output this is a Friedman test and the
# second value ranks each classifier/feature combination — confirm in stats.py.
_, friedman_list = stats.main(x)
friedman_list

k: 30       N: 5       a: 0.01
chi2:  91.68000000000005
Friedman's F:  6.877719429857475
F(29,116)|0.01:  1.8784961968012859


Unnamed: 0,0
LinSVM simon-custom+senticnet5,6.8
LinSVM simon-custom+affectivespace+senticnet5,7.2
LinSVM simon-custom,7.3
LinSVM simon-custom+affectivespace,8.0
LogR simon-custom+affectivespace+senticnet5,8.7
LogR simon-custom+affectivespace,8.8
LogR simon-custom+senticnet5,9.1
LogR simon-custom,9.2
LinSVM tfidf+affectivespace+senticnet5,9.4
LinSVM tfidf+senticnet5,10.0


In [None]:
# Run the external trainingbot.sh script through the shell with the message "SenticReady!".
# NOTE(review): presumably a "run finished" notification; the absolute
# /notebooks/... path ties this cell to one environment — confirm before reuse.
!/notebooks/scripts/trainingbot.sh "SenticReady!"

In [None]:
# Saving the summary table is opt-in. With the default, this cell writes nothing,
# so "Run All" cannot overwrite an existing pickle. This replaces a bare `raise`,
# which aborted the cell with "RuntimeError: No active exception to reraise".
# Set SAVE_VIEW = True and re-run this cell to write the file.
SAVE_VIEW = False
if SAVE_VIEW:
    view.to_pickle("../results/2020-05-20/view.pck")