# Explore

In [34]:
from datasets import list_datasets, load_dataset, list_metrics, load_metric
import spacy
from elasticsearch import Elasticsearch
from datetime import datetime
from tqdm import tqdm


In [2]:
#!pip install datasets
#!pip install spacy
#!pip install elasticsearch[async]

In [3]:
# Display the dataset names returned by `datasets.list_datasets()` (long listing).
list_datasets()

['acronym_identification',
 'ade_corpus_v2',
 'aeslc',
 'afrikaans_ner_corpus',
 'ag_news',
 'ai2_arc',
 'air_dialogue',
 'ajgt_twitter_ar',
 'allegro_reviews',
 'allocine',
 'alt',
 'amazon_polarity',
 'amazon_reviews_multi',
 'amazon_us_reviews',
 'ambig_qa',
 'amttl',
 'anli',
 'app_reviews',
 'aqua_rat',
 'aquamuse',
 'ar_cov19',
 'ar_res_reviews',
 'arabic_billion_words',
 'arabic_pos_dialect',
 'arcd',
 'arsentd_lev',
 'art',
 'arxiv_dataset',
 'aslg_pc12',
 'asnq',
 'asset',
 'assin',
 'assin2',
 'atomic',
 'autshumato',
 'bc2gm_corpus',
 'best2009',
 'bianet',
 'bible_para',
 'big_patent',
 'billsum',
 'bing_coronavirus_query_set',
 'biomrc',
 'blended_skill_talk',
 'blimp',
 'blog_authorship_corpus',
 'bookcorpus',
 'bookcorpusopen',
 'boolq',
 'bprec',
 'break_data',
 'brwac',
 'bsd_ja_en',
 'bswac',
 'c3',
 'c4',
 'cail2018',
 'capes',
 'catalonia_independence',
 'cawac',
 'cc100',
 'cdsc',
 'cdt',
 'cfq',
 'chr_en',
 'cifar10',
 'circa',
 'civil_comments',
 'clickbait_news_

In [4]:
# WikiText configuration name passed to `load_dataset` in the next cell.
# Uncomment exactly one of the alternatives below to switch configuration.
#WIKITEXT_DATASET = "wikitext-2-raw-v1"
#WIKITEXT_DATASET = "wikitext-103-raw-v1"
#WIKITEXT_DATASET = "wikitext-2-v1"
WIKITEXT_DATASET = "wikitext-103-v1"

In [5]:
# Load the selected WikiText configuration and keep a handle on its "train"
# split, which is the data used by the rest of the notebook.
wikitext_dataset = load_dataset('wikitext', WIKITEXT_DATASET)
train_wikitext_dataset = wikitext_dataset["train"]

# List all the available metrics
print("Available metrics: {}".format(list_metrics()))

# Example of loading a single metric (kept disabled; not needed below).
#squad_metric = load_metric('squad')

Reusing dataset wikitext (/home/wotan/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91)


Available metrics: ['accuracy', 'bertscore', 'bleu', 'bleurt', 'coval', 'f1', 'gleu', 'glue', 'indic_glue', 'meteor', 'precision', 'recall', 'rouge', 'sacrebleu', 'seqeval', 'squad', 'squad_v2', 'xnli']


In [6]:
# Number of rows in the training split.
len(train_wikitext_dataset)

1801350

In [7]:
# Inspect one row; the recorded output shows a dict with a single 'text' field.
train_wikitext_dataset[1]

{'text': ' = Valkyria Chronicles III = \n'}

**Identify sentences and send them to Elasticsearch to be indexed. Use spaCy for sentence tokenization.**

In [8]:
#!python -m spacy download en_core_web_sm

In [9]:
# spaCy pipeline used below to split texts into sentences (via `doc.sents`).
nlp = spacy.load("en_core_web_sm")

In [10]:
class Normalizer(object):
    """Turn spaCy sentences into plain, normalized strings for indexing."""

    # Token texts that are dropped during normalization. Matching is done on
    # the exact token text, so symbols that are not listed here (for example
    # parentheses or "=") are kept in the normalized sentence.
    PUNCTUATION_CHARS = [".", ",", ":", ";", "?", "!", "¿", '"', "'"]

    @staticmethod
    def get_normalized_sentence(s: "spacy.tokens.span.Span") -> str:
        """
        Transform a spaCy Span into a normalized sentence string.

        The annotation is quoted (as in `send_to_elasticsearch`) so that it is
        not evaluated when the class is defined.

        Args:
            s: The sentence to normalize. Any iterable of objects exposing a
                `.text` attribute works; in this notebook it is a spaCy Span.

        Returns:
            The token texts, lower-cased and joined by single spaces, with
            tokens listed in `PUNCTUATION_CHARS` removed and leading/trailing
            whitespace stripped. An empty sentence yields an empty string.
        """
        kept_tokens = (
            token.text.lower()
            for token in s
            if token.text not in Normalizer.PUNCTUATION_CHARS
        )
        # strip() removes whitespace-only tokens at either end of the sentence
        # (e.g. a leading " " or a trailing "\n" token).
        return " ".join(kept_tokens).strip()

In [16]:
def send_to_elasticsearch(s: str, model: "spacy.language.Language", es: Elasticsearch, es_index: str):
    """
    Split a text into sentences and index each sentence in Elasticsearch.

    The text is parsed with `model`; every sentence found is sent as its own
    document (one `es.index` call per sentence).

    Args:
        s: Raw text to split into sentences.
        model: spaCy pipeline used to parse the text; its `sents` are indexed.
        es: Elasticsearch client used to index the documents.
        es_index: Name of the index the documents are written to.

    Each document has three fields:
        - 'original_text': the sentence exactly as it appears in `s`.
        - 'text': the sentence after `Normalizer.get_normalized_sentence`.
        - 'timestamp': `datetime.now()` at the moment the document is built.
    """
    parsed = model(s)

    # Single pass over the sentences; the payload gets its own name so the
    # parsed spaCy document is never shadowed.
    for sent in parsed.sents:
        payload = {
            'original_text': str(sent),
            'text': Normalizer.get_normalized_sentence(sent),
            'timestamp': datetime.now(),
        }
        es.index(index=es_index, body=payload)

    # TODO

Use the index "concept_embeddings"

In [63]:
# Elasticsearch index that holds one document per sentence.
ES_INDEX = "concept_embeddings"
# Value passed as `size` to `es.search` in the query example below.
ES_QUERY_SIZE = 1000

In [18]:
# Client for the Elasticsearch node at localhost:9200.
# The sniffing options are left disabled.
es = Elasticsearch(
    hosts=["localhost:9200"],
#    sniff_on_start=True,
#    sniff_on_connection_fail=True,
#    sniffer_timeout=60
)

In [19]:
#?es.index

See https://elasticsearch-py.readthedocs.io/en/7.10.0/api.html

In [22]:
#send_to_elasticsearch(train_wikitext_dataset[4]["text"], nlp, es, ES_INDEX)

In [70]:
# Count the documents in the notebook's index only; without `index=` the
# count would cover every index on the node.
es.count(index=ES_INDEX)

{'count': 3321,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}

In [78]:
# Fetch up to ES_QUERY_SIZE documents from the index (no query body given),
# using the shared constant instead of repeating the literal 1000.
es.search(size=ES_QUERY_SIZE, index=ES_INDEX)

{'took': 117,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'concept_embeddings',
    '_type': '_doc',
    '_id': 'aFqKs3YB9GIGtQkyQvsE',
    '_score': 1.0,
    '_source': {'original_text': 'She hosted television shows in Bahrain at the age of fourteen .',
     'text': 'she hosted television shows in bahrain at the age of fourteen',
     'timestamp': '2020-12-30T13:05:51.236269'}},
   {'_index': 'concept_embeddings',
    '_type': '_doc',
    '_id': 'aVqKs3YB9GIGtQkyQvsR',
    '_score': 1.0,
    '_source': {'original_text': 'After receiving her early education in Bahrain , she pursued a degree in mass communication from the University of Sydney in Australia .',
     'text': 'after receiving her early education in bahrain she pursued a degree in mass communication from the university of sydney in australia',
     'timestamp': '2020-12-30T13:05:51.

Query example

In [75]:
# Query body, as a JSON string: a `match` query for "game" on the normalized
# `text` field of the indexed sentences.
q = '''{
  "query": {
    "match": {
      "text": "game"
    }
  }
}'''


In [80]:
# Run the example query against the sentence index, capped at ES_QUERY_SIZE hits.
es.search(index=ES_INDEX, body=q, size=ES_QUERY_SIZE)

{'took': 610,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 7.0213437,
  'hits': [{'_index': 'concept_embeddings',
    '_type': '_doc',
    '_id': 'bmc8tHYB9GIGtQkyQWPl',
    '_score': 7.0213437,
    '_source': {'original_text': 'Game Critics also named the game " Best Action Game . "',
     'text': 'game critics also named the game best action game',
     'timestamp': '2020-12-30T16:20:16.613220'}},
   {'_index': 'concept_embeddings',
    '_type': '_doc',
    '_id': 'PGpntHYB9GIGtQkyRFy0',
    '_score': 7.0024157,
    '_source': {'original_text': 'The Game ( mind game ) = \n',
     'text': 'the game ( mind game ) =',
     'timestamp': '2020-12-30T17:07:15.379597'}},
   {'_index': 'concept_embeddings',
    '_type': '_doc',
    '_id': 'lYpvtnYB9GIGtQky2-_U',
    '_score': 6.900861,
    '_source': {'original_text': 'The game was named " Best Original Game " and " Best Handh

**Index all sentences**

In [77]:
# Guard for the full indexing run. Set MAKE_INDEX to True to send every row of
# the training split to Elasticsearch; the recorded run below took ~22 hours.
MAKE_INDEX = False

if MAKE_INDEX:
    # One `send_to_elasticsearch` call per dataset row, with a progress bar.
    for d in tqdm(train_wikitext_dataset):
        send_to_elasticsearch(d["text"], nlp, es, ES_INDEX)

100%|██████████| 1801350/1801350 [22:02:28<00:00, 22.70it/s]    


In [23]:
# Parse a single training row so its sentences and tokens can be inspected.
doc = nlp(train_wikitext_dataset[4]["text"])

In [28]:
# Print every token of the first sentence, one token per line.
first_sentence = list(doc.sents)[0]
for token in first_sentence:
    print(token.text)

 
The
game
began
development
in
2010
,
carrying
over
a
large
portion
of
the
work
done
on
Valkyria
Chronicles
II
.


In [29]:
# Check which type spaCy uses for a sentence yielded by `doc.sents`.
type(list(doc.sents)[0])

spacy.tokens.span.Span

In [14]:
# Preview the sentence segmentation on the first rows of the training split.
# The cell used to call `send_to_solr`, which is not defined anywhere in this
# notebook and fails on a fresh kernel; printing each sentence matches the
# recorded output and has no side effects on the index.
for counter, d in enumerate(train_wikitext_dataset):
    for sent in nlp(d["text"]).sents:
        print(sent.text)
    # Same cutoff as before: stop after the row with counter == 10 (11 rows).
    if counter >= 10:
        break

 =
Valkyria Chronicles III = 

 Senjō no Valkyria 3 :
<unk
>
Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit .
Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.
Vision for the PlayStation Portable .
Released in January 2011 in Japan , it is the third game in the Valkyria series .
Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " <unk
> Raven " . 

 The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II .
While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcome