In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS so that elements with the `container` class span the full width.
display(HTML("<style>.container { width:100% !important; }</style>"))

import sys
import numpy as np
from os.path import basename
from typing import Dict, Callable

from pathlib import Path
import pickle

import pandas as pd

from collections import namedtuple
from tqdm import tqdm_notebook
from operator import itemgetter

%matplotlib inline

sys.path.append('/home/laugustyniak/github/phd/sentiment-backend/')
from aspects.analysis.nlp_architect import get_metrics, get_models_params_from_name, filter_datasets
from aspects.analysis.statistics_dataset import get_unique_words_from_corpus, get_uni_and_multigram_aspects_stats

In [11]:
# Name fragments of models that are left out of the results: skip_char_models
# matches them as substrings of each model file path.
models_to_skip = ['char-bilstm', 'char-lstm']

In [12]:
def skip_char_models(model_name):
    """Filter predicate: True when the model path should be kept.

    A path is rejected when its POSIX string contains any fragment listed in
    `models_to_skip`. The match is a plain substring test, so e.g.
    'char-word-bilstm' is not rejected by the 'char-bilstm' fragment.

    :param model_name: path-like object exposing `as_posix()`.
    :return: False if any skipped fragment occurs in the path, True otherwise.
    """
    return all(
        fragment not in model_name.as_posix()
        for fragment in models_to_skip
    )

In [13]:
list(Path('../models/glove.840B.300d/').glob('*'))

[PosixPath('../models/glove.840B.300d/model-info-char-word-lstm-crf-10epochs-Restaurants_poria-train.conll.info'),
 PosixPath('../models/glove.840B.300d/model-info-char-word-bilstm-crf-10epochs-Restaurants_poria-train.conll.info'),
 PosixPath('../models/glove.840B.300d/model-info-char-word-lstm-crf-10epochs-Laptops_poria-train.conll.info'),
 PosixPath('../models/glove.840B.300d/model-info-char-word-bilstm-crf-10epochs-Laptops_poria-train.conll.info')]

In [14]:
# Row order of the F1 results table (used as the `reindex` target).
reindex_results_order = [
    'word bilstm',
    'char word bilstm',
    'word bilstm crf',
    'char word bilstm crf',
    'word lstm',
    'char word lstm',
    'word lstm crf',
    'char word lstm crf',
]

In [15]:
def get_models_f1_metric(all_models_path=Path('../models-2/')):
    """Build a table of F1 scores: one row per model, one column per word embedding.

    Every entry of `all_models_path` is treated as the directory of one word
    embedding. Its model files (minus those rejected by `skip_char_models`) are
    passed to `get_metrics`, and the `f1` attribute of each returned metric
    object is stored under the name produced by `get_models_params_from_name`.

    Parameters
    ----------
    all_models_path : pathlib.Path
        Directory holding one sub-directory per word embedding.

    Returns
    -------
    pandas.DataFrame
        F1 values rounded to two decimals. Columns are the sub-directory names,
        rows follow `reindex_results_order` (models without a score are NaN).
        Sub-directories that yield no scores are left out.
    """
    model_f1_by_word_embedding = {}

    for word_embedding_models_path in all_models_path.glob('*'):
        models_paths = filter(skip_char_models, word_embedding_models_path.glob('*'))
        models_f1 = {
            get_models_params_from_name(model_name): model_metrics.f1
            for model_name, model_metrics in get_metrics(models_paths).items()
            if model_name not in models_to_skip
        }

        if models_f1:
            # Use `.name`, not `.stem`: embedding directories contain dots
            # (e.g. 'glove.840B.300d'), and `.stem` would cut the last dotted part,
            # making 'glove.6B.50d' and 'glove.6B.100d' collide on 'glove.6B'.
            model_f1_by_word_embedding[word_embedding_models_path.name] = models_f1

    f1_table = pd.DataFrame.from_dict(model_f1_by_word_embedding)
    return f1_table.round(2).reindex(reindex_results_order)

In [16]:
# Build the F1 table from the default models directory, print it as LaTeX and
# display it.
# NOTE(review): the recorded output below is an empty table; the function's
# default directory ('../models-2/') differs from the '../models/...' directory
# listed in an earlier cell - confirm which path is intended.
df = get_models_f1_metric()
print(df.to_latex())
df

\begin{tabular}{l}
\toprule
Empty DataFrame
Columns: Index([], dtype='object')
Index: Index(['word bilstm', 'char word bilstm', 'word bilstm crf',
       'char word bilstm crf', 'word lstm', 'char word lstm', 'word lstm crf',
       'char word lstm crf'],
      dtype='object') \\
\bottomrule
\end{tabular}



word bilstm
char word bilstm
word bilstm crf
char word bilstm crf
word lstm
char word lstm
word lstm crf
char word lstm crf


In [17]:
# df.to_excel('/home/laugustyniak/luk.augustyniak@gmail.com/Projects/PRELUDIUM/Artykuły/aspect-based sentiment 2018/laptops-aspects.xlsx')

In [18]:
# def draw_metrics(models_metrics: Dict, title=None):
#     pd.DataFrame.from_dict({
#         get_models_params_from_name(k): v
#         for k, v
#         in list(models_metrics.items())
#     }, orient='index').sort_index(axis=0).plot(kind='bar', figsize=(25,12), title=title)

In [19]:
# draw_metrics(models_metrics)

In [20]:
# list(models_metrics.items())

# draw_metrics(dict(models_f1))

In [21]:
# for we_name, we_f1 in model_f1_by_word_embedding.items():
#     draw_metrics(dict(we_f1), title=we_name)

In [22]:
# NOTE(review): this loop only rebinds `models_f1` and `models_paths` for every
# entry of '../models'; neither name is read in the visible cells that follow,
# so this looks like an unfinished experiment - confirm whether it can be removed.
for word_embedding_models_path in list(Path('../models').glob('*')):
    models_f1 = []
    
    models_paths = word_embedding_models_path.glob('*10epoch*')

# Check coverage of words in embeddings for aspect datasets

In [23]:
# Later cells index `corpus_words` by dataset name and take set differences of
# its values, so it is presumably a {dataset name: set of words} mapping -
# TODO confirm against get_unique_words_from_corpus.
corpus_words = get_unique_words_from_corpus()

Corpus iterator: 49475it [00:00, 1103036.70it/s]
Corpus iterator: 53781it [00:00, 634647.03it/s]
Corpus iterator: 12470it [00:00, 613978.32it/s]
Corpus iterator: 13257it [00:00, 631173.81it/s]


In [24]:
def load_word_embeddings(file_path):
    """Collect the vocabulary of a text-format word-embedding file.

    The first whitespace-separated token of every accepted line is taken as the
    word. A line is ignored when it has fewer than five fields (this drops e.g.
    a "<count> <dim>" header line) or when it starts with a space (presumably
    because the word of such an entry is itself whitespace).

    Parameters
    ----------
    file_path : str or os.PathLike
        Location of the embedding file, one "<word> <v1> <v2> ..." entry per line.

    Returns
    -------
    set of str
        Unique words found in the file.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. FileNotFoundError).
    """
    # Accept pathlib.Path as well: the progress-bar label below needs a str.
    file_path = str(file_path)
    words = set()
    # errors='replace': an undecodable byte becomes U+FFFD instead of raising
    # UnicodeDecodeError. Previously that error was swallowed and silently cut
    # the vocabulary off at the first bad byte.
    with open(file_path, encoding='utf-8', errors='replace') as fp:
        for line in tqdm_notebook(fp, desc=file_path + ': embedding loading'):
            line_fields = line.split()
            if len(line_fields) < 5 or line[0] == ' ':
                continue
            words.add(line_fields[0])
    return words

In [27]:
# Directory that holds the embedding files listed in EMBEDDINGS.
# NOTE(review): absolute, machine-specific path - the notebook only runs where
# this directory exists.
EMBEDDINGS_PATH = Path('/home/laugustyniak/data/embeddings/')

# File names of the embeddings to analyse; each one is joined with
# EMBEDDINGS_PATH before loading.
# NOTE(review): the recorded run of the loading cell stops with
# FileNotFoundError for 'wiki-news-300d-1M-subword.vec'.
EMBEDDINGS = [
    'GoogleNews-vectors-negative300.txt',
    'glove.6B.50d.txt',
    'glove.6B.100d.txt',
    'glove.6B.200d.txt',
    'glove.6B.300d.txt',
    'glove.twitter.27B.200d.txt',
    'glove.42B.300d.txt',
    'glove.840B.300d.txt',
    'numberbatch-en.txt',
    'crawl-300d-2M.vec',
    'wiki-news-300d-1M-subword.vec',
    'wiki-news-300d-1M.vec',
    'bow2.words',
    'bow2.contexts',
    'bow5.words',
    'bow5.contexts',
    'deps.words',
#     'deps.contexts',
    'sota-google.txt',
    'sota-retrofit-600.txt',
    'sota-sswe-50.txt',
    'sota-wiki-600.txt',
    'sentic2vec.txt',
    'lexvec.commoncrawl.ngramsubwords.300d.W.pos.vectors',
    'lexvec.enwiki+newscrawl.300d.W.pos.vectors',    
]

In [28]:
def get_word_embeddings_vocabularies(word_embedding_names):
    """Load the vocabulary of every listed embedding file.

    Each name is resolved against `EMBEDDINGS_PATH` and handed to
    `load_word_embeddings` as a POSIX path string; progress is shown with
    `tqdm_notebook`.

    :param word_embedding_names: iterable of embedding file names.
    :return: dict mapping each file name to whatever `load_word_embeddings`
        returns for it.
    """
    vocabularies = {}
    for embedding_file in tqdm_notebook(word_embedding_names):
        embedding_location = EMBEDDINGS_PATH / embedding_file
        vocabularies[embedding_file] = load_word_embeddings(embedding_location.as_posix())
    return vocabularies
   
# Load every vocabulary up front. No file is skipped on error: the recorded run
# below aborts with FileNotFoundError at the first missing file.
# NOTE(review): the variable name is misspelled ("vocabulalaries"), but later
# cells refer to it under this exact name.
word_embedding_vocabulalaries = get_word_embeddings_vocabularies(EMBEDDINGS)

HBox(children=(IntProgress(value=0, max=24), HTML(value='')))

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/GoogleNe…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.6B…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.6B…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.6B…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.6B…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.tw…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.42…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/glove.84…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/numberba…

HBox(children=(IntProgress(value=1, bar_style='info', description='/home/laugustyniak/data/embeddings/crawl-30…

FileNotFoundError: [Errno 2] No such file or directory: '/home/laugustyniak/data/embeddings/wiki-news-300d-1M-subword.vec'

In [None]:
def get_datasets_coverage_with_word_embedding(corpus_words, word_embedding_vocabulalaries):
    """Compute, per embedding and dataset, the fraction of dataset words missing from the embedding.

    Despite the name, the value is the share of words that are NOT covered:
    ``len(words not in vocabulary) / len(words)``.

    :param corpus_words: mapping of dataset name to the set of its words.
    :param word_embedding_vocabulalaries: mapping of embedding name to its vocabulary.
    :return: nested dict ``{embedding name: {dataset name: missing fraction}}``.
    """
    def missing_share(dataset_words, embedding_vocabulary):
        # Words of the dataset for which the embedding has no vector.
        not_covered = dataset_words.difference(embedding_vocabulary)
        return len(not_covered) / len(dataset_words)

    return {
        embedding_name: {
            dataset_name: missing_share(dataset_words, embedding_vocabulary)
            for dataset_name, dataset_words in corpus_words.items()
        }
        for embedding_name, embedding_vocabulary in word_embedding_vocabulalaries.items()
    }

In [None]:
# Number of entries stored for each key of corpus_words.
{k: len(v) for k, v in  corpus_words.items()}

In [18]:
# Count of Restaurants test-set entries that do not occur in the Restaurants train set.
len(corpus_words['Restaurants_poria-test'] - corpus_words['Restaurants_poria-train'])

747

In [19]:
# Count of Laptops test-set entries that do not occur in the Laptops train set.
len(corpus_words['Laptops_poria-test'] - corpus_words['Laptops_poria-train'])

497

In [20]:
# Vocabulary size of every loaded embedding.
{word_embedding_name: len(vocabulary) for word_embedding_name, vocabulary in word_embedding_vocabulalaries.items()}

{'glove.6B.50d.txt': 400000,
 'glove.6B.100d.txt': 400000,
 'glove.6B.300d.txt': 400000,
 'glove.twitter.27B.200d.txt': 1193515,
 'glove.42B.300d.txt': 1917494,
 'glove.840B.300d.txt': 2195884,
 'numberbatch-en.txt': 417194,
 'crawl-300d-2M.vec': 1999995,
 'bow2.words': 183870,
 'bow2.contexts': 183870,
 'bow5.words': 183870,
 'bow5.contexts': 183870,
 'deps.words': 174015,
 'deps.contexts': 561754,
 'sota-google.txt': 60349,
 'sota-retrofit-600.txt': 29444,
 'sota-sswe-50.txt': 137052,
 'sota-wiki-600.txt': 29444,
 'sentic2vec.txt': 42007}

In [21]:
# Nested dict {embedding name: {dataset name: fraction of dataset words missing
# from that embedding}} - note the values are missing shares, not covered shares.
datasets_coverage_with_word_embedding = get_datasets_coverage_with_word_embedding(corpus_words, word_embedding_vocabulalaries)

In [22]:
# Turn the missing fractions into percentages: rows are datasets, columns are
# embeddings. Fractions are rounded to 4 decimals before scaling by 100.
lack_of_words_percentage = pd.DataFrame(datasets_coverage_with_word_embedding).round(4)*100

In [23]:
lack_of_words_percentage

Unnamed: 0,glove.6B.50d.txt,glove.6B.100d.txt,glove.6B.300d.txt,glove.twitter.27B.200d.txt,glove.42B.300d.txt,glove.840B.300d.txt,numberbatch-en.txt,crawl-300d-2M.vec,bow2.words,bow2.contexts,bow5.words,bow5.contexts,deps.words,deps.contexts,sota-google.txt,sota-retrofit-600.txt,sota-sswe-50.txt,sota-wiki-600.txt,sentic2vec.txt
Laptops_poria-test,4.27,4.27,4.27,8.17,1.77,2.24,9.83,2.39,6.71,6.71,6.71,6.71,6.66,100.0,10.51,12.54,6.14,12.54,17.17
Laptops_poria-train,9.18,9.18,9.18,11.7,3.4,4.2,14.06,4.32,11.02,11.02,11.02,11.02,11.23,100.0,18.92,20.86,11.99,20.86,18.55
Restaurants_poria-test,7.4,7.4,7.4,8.63,3.66,4.36,9.12,4.27,10.53,10.53,10.53,10.53,11.1,100.0,19.91,21.81,9.87,21.81,32.2
Restaurants_poria-train,10.78,10.78,10.78,11.99,5.0,6.11,13.7,5.98,14.38,14.38,14.38,14.38,15.26,100.0,26.65,28.38,14.09,28.38,34.38


# Uni- and multigram aspect statistics

In [3]:
# Per-dataset aspect statistics; the next cell builds a one-column table from
# it, keyed by dataset name. Presumably the share of multi-word aspects -
# TODO confirm against get_uni_and_multigram_aspects_stats.
aspects = get_uni_and_multigram_aspects_stats()

Corpus iterator: 49475it [00:00, 524886.09it/s]
Corpus iterator: 53781it [00:00, 661840.76it/s]
Corpus iterator: 12470it [00:00, 770907.21it/s]
Corpus iterator: 13257it [00:00, 708889.67it/s]


In [9]:
# Print the aspect statistics as a LaTeX table, one row per dataset.
# NOTE(review): the column label 'Multi aspect ' carries a trailing space,
# which ends up in the generated table header.
print(pd.DataFrame.from_dict(aspects, orient='index', columns=['Multi aspect ']).to_latex())

\begin{tabular}{lr}
\toprule
{} &  Multi aspect  \\
\midrule
Restaurants\_poria-train &          24.77 \\
Laptops\_poria-train     &          36.94 \\
Laptops\_poria-test      &          44.63 \\
Restaurants\_poria-test  &          27.75 \\
\bottomrule
\end{tabular}



# Words/aspects that appear in the test data but not in the training data

In [10]:
# NOTE(review): the recorded output is a NameError. `corpus_words` is assigned
# in an earlier cell (get_unique_words_from_corpus()) that was not run in the
# kernel session that produced this output; run the notebook top to bottom.
corpus_words

NameError: name 'corpus_words' is not defined