# PNL

python 3.6.13
conda env py3.16.13

In [7]:
text = """Esta es una prueba. Con este lema miles de personas han recorrido las calles de Murcia este jueves para reclamar a las administraciones públicas la adopción de medidas que recuperen su laguna, aquella de aguas transparentes que conocieron antaño y han perdido. Nueve colectivos ciudadanos se han unido en torno a la consigna 'SOS Mar Menor. Por un mar Menor con futuro' para protestar por la situación de la laguna, dos meses después de la grave crisis que sufrió en agosto, cuando se sacaron 15 toneladas de peces. Manuel trabaja en el Banco Asiático de Desarrollo (ADB), en Manila."""

In [8]:
import spacy
from spacy.lang.es import Spanish
import es_dep_news_trf

nlp = Spanish()
nlp = es_dep_news_trf.load()
doc = nlp(text)
spacy.info()

{'spacy_version': '3.1.3',
 'location': '/Users/mbook/opt/anaconda3/envs/py3.16.13/lib/python3.6/site-packages/spacy',
 'platform': 'Darwin-22.1.0-x86_64-i386-64bit',
 'python_version': '3.6.13',
 'pipelines': {'es_core_news_sm': '3.1.0',
  'es_dep_news_trf': '3.1.0',
  'es_core_news_md': '3.1.0',
  'es_core_news_lg': '3.1.0'}}

[Trained pipelines for Spanish](https://spacy.io/models/es):

- es_core_news_sm: Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer. 13 MB.
- es_core_news_md: Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer. 40 MB.
- es_core_news_lg: Spanish pipeline optimized for CPU. Components: tok2vec, morphologizer, parser, senter, ner, attribute_ruler, lemmatizer. 542 MB.
- es_dep_news_trf: Spanish transformer pipeline (dccuchile/bert-base-spanish-wwm-cased). Components: transformer, morphologizer, parser, attribute_ruler, lemmatizer. 391 MB.

# Morfología

## Segmentación por palabras (tokenización)

In [9]:
for token in doc:
    print(token.text)

Esta
es
una
prueba
.
Con
este
lema
miles
de
personas
han
recorrido
las
calles
de
Murcia
este
jueves
para
reclamar
a
las
administraciones
públicas
la
adopción
de
medidas
que
recuperen
su
laguna
,
aquella
de
aguas
transparentes
que
conocieron
antaño
y
han
perdido
.
Nueve
colectivos
ciudadanos
se
han
unido
en
torno
a
la
consigna
'
SOS
Mar
Menor
.
Por
un
mar
Menor
con
futuro
'
para
protestar
por
la
situación
de
la
laguna
,
dos
meses
después
de
la
grave
crisis
que
sufrió
en
agosto
,
cuando
se
sacaron
15
toneladas
de
peces
.
Manuel
trabaja
en
el
Banco
Asiático
de
Desarrollo
(
ADB
)
,
en
Manila
.


## Análisis morfológico de las palabras

Lemmatizer, Tagger, Morphologizer, 

In [10]:
import pandas as pd

pd.DataFrame([[lexeme.text, lexeme.lemma_, lexeme.pos_, lexeme.tag_, spacy.explain(lexeme.tag_), lexeme.morph, lexeme.is_alpha, lexeme.is_digit, lexeme.shape_, lexeme.lang_, lexeme.is_stop, lexeme.prefix_, lexeme.suffix_, ] for lexeme in doc], columns=['Token', 'Lemma', 'POS', 'Tag', 'Explain', 'Morph', 'Is alphabetical', 'Is digit', 'Shape', 'Language', 'Is stop', 'Prefix', 'Suffix'])

Unnamed: 0,Token,Lemma,POS,Tag,Explain,Morph,Is alphabetical,Is digit,Shape,Language,Is stop,Prefix,Suffix
0,Esta,este,PRON,PRON,pronoun,"(Gender=Fem, Number=Sing, PronType=Dem)",True,False,Xxxx,es,True,E,sta
1,es,ser,AUX,AUX,auxiliary,"(Mood=Ind, Number=Sing, Person=3, Tense=Pres, ...",True,False,xx,es,True,e,es
2,una,uno,DET,DET,determiner,"(Definite=Ind, Gender=Fem, Number=Sing, PronTy...",True,False,xxx,es,True,u,una
3,prueba,prueba,NOUN,NOUN,noun,"(Gender=Fem, Number=Sing)",True,False,xxxx,es,False,p,eba
4,.,.,PUNCT,PUNCT,punctuation,(PunctType=Peri),False,False,.,es,False,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...
107,),),PUNCT,PUNCT,punctuation,"(PunctSide=Fin, PunctType=Brck)",False,False,),es,False,),)
108,",",",",PUNCT,PUNCT,punctuation,(PunctType=Comm),False,False,",",es,False,",",","
109,en,en,ADP,ADP,adposition,(AdpType=Prep),True,False,xx,es,True,e,en
110,Manila,Manila,PROPN,PROPN,proper noun,(),True,False,Xxxxx,es,False,M,ila


# Sintaxis

## Segmentación por oraciones

*SentenceRecognizer*.

In [11]:
assert doc.has_annotation("SENT_START")
for sent in doc.sents:
    print(sent.text + "\n")
    

Esta es una prueba.

Con este lema miles de personas han recorrido las calles de Murcia este jueves para reclamar a las administraciones públicas la adopción de medidas que recuperen su laguna, aquella de aguas transparentes que conocieron antaño y han perdido.

Nueve colectivos ciudadanos se han unido en torno a la consigna '

SOS

Mar

Menor

.

Por un mar Menor con futuro' para protestar por la situación de la laguna, dos meses después de la grave crisis que sufrió en agosto, cuando se sacaron 15 toneladas de peces.

Manuel trabaja en el Banco Asiático de Desarrollo (ADB), en Manila.



## POS Tagging

In [12]:
import pandas as pd

pd.DataFrame([[word.text, spacy.explain(word.dep_), word.dep_] for word in doc], columns=['Palabra', 'Función sintáctica', 'Abreviatura de la función'])

Unnamed: 0,Palabra,Función sintáctica,Abreviatura de la función
0,Esta,nominal subject,nsubj
1,es,copula,cop
2,una,determiner,det
3,prueba,,ROOT
4,.,punctuation,punct
...,...,...,...
107,),punctuation,punct
108,",",punctuation,punct
109,en,case marking,case
110,Manila,modifier of nominal,nmod


In [13]:
import explacy

explacy.print_parse_info(nlp, text)

Dep tree                         Token            Dep type Lemma          Part of Sp
──────────────────────────────── ──────────────── ──────── ────────────── ──────────
                           ┌───► Esta             nsubj    este           PRON      
                           │┌──► es               cop      ser            AUX       
                           ││┌─► una              det      uno            DET       
                           └┴┼── prueba           ROOT     prueba         NOUN      
                             └─► .                punct    .              PUNCT     
                            ┌──► Con              case     con            ADP       
                            │┌─► este             det      este           DET       
                      ┌────►└┴── lema             obl      lema           NOUN      
                      │┌─►┌───── miles            nsubj    mil            NUM       
                      ││  │  ┌─► de               case     de    

In [14]:
from spacy import displacy

sentence_spans = list(doc.sents)
options = {'compact': False, 'color': 'brown', 'font': 'Monaco', 'distance': 110}
displacy.render(sentence_spans, style="dep", jupyter=True, page=True, options=options)

## Noun chunks

In [16]:
import pandas as pd

pd.DataFrame([[chunk.text, chunk.root.text, chunk.root.head.text, spacy.explain(chunk.root.dep_), chunk.root.dep_] for chunk in doc.noun_chunks], columns=['Sintagma nominal (SN)', 'Núcleo del SN', 'Depende sintácticamente de', 'en función sintáctica de', 'Abreviatura de la función'])

Unnamed: 0,Sintagma nominal (SN),Núcleo del SN,Depende sintácticamente de,en función sintáctica de,Abreviatura de la función
0,Esta,Esta,prueba,nominal subject,nsubj
1,una prueba,prueba,prueba,,ROOT
2,este lema,lema,recorrido,oblique nominal,obl
3,personas,personas,miles,modifier of nominal,nmod
4,las calles,calles,recorrido,object,obj
5,Murcia,Murcia,calles,modifier of nominal,nmod
6,este jueves,jueves,recorrido,oblique nominal,obl
7,las administraciones,administraciones,reclamar,object,obj
8,la adopción,adopción,reclamar,object,obj
9,medidas,medidas,adopción,modifier of nominal,nmod


# Likelihood of a sentence
Anothe key concept in NLP is that of **Language Models**. It is a function that is able to tell us the likelihood of a sentence of appearing in the real word. One such model, albeit a very simple one, is the one that multiplies the probabilities of every token in the sentence

# Probabilidad de una sentencia
Otro concepto clave en la PNL es el de **Modelos de lenguaje**. Es una función que es capaz de decirnos la probabilidad de que una oración aparezca en la palabra real. Uno de esos modelos, aunque muy simple, es el que multiplica las probabilidades de cada ficha en la oración.

In [18]:
import pandas as pd
import math

pd.DataFrame([[word.text, math.exp(word.prob), word.prob] for word in doc], columns=['Token', 'Prob', 'Log Prob'])

Unnamed: 0,Token,Prob,Log Prob
0,Esta,2.061154e-09,-20.0
1,es,2.061154e-09,-20.0
2,una,2.061154e-09,-20.0
3,prueba,2.061154e-09,-20.0
4,.,2.061154e-09,-20.0
...,...,...,...
107,),2.061154e-09,-20.0
108,",",2.061154e-09,-20.0
109,en,2.061154e-09,-20.0
110,Manila,2.061154e-09,-20.0


# Dependency parsing (revisar)

A syntactic dependency parser, with a rich API for navigating the tree. The parser also powers the sentence boundary detection, and lets you iterate over base noun phrases, or "chunks". 

In [21]:
from spacy.symbols import nsubj, VERB

# Finding a verb with a subject from belEspañaow — good
verbs = set()
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        verbs.add(possible_subject.head)

In [22]:
# Finding a verb with a subject from above — less good
verbs = []
for possible_verb in doc:
    if possible_verb.pos == VERB:
        for possible_subject in possible_verb.children:
            if possible_subject.dep == nsubj:
                verbs.append(possible_verb)
                break

# Text readability

## Lecturabilidad de Fernández Huerta

Basada en The Flesch Reading Ease formula.

| Lecturabilidad (L)  |  nivel	            |   Nivel educativo     |
|---------------------|---------------------|-----------------------|
| 0-30                |  muy difícil         |   universitario (especialización)        |   
| 30-50	              |  difícil	            | cursos selectivos     |
| 50-60	              |  algo difícil	    |     preuniversitario      |
| 60-70	              |  normal(para adulto)	|    7º u 8º grado      |
| 70-80	              |  algo fácil	        |     6º grado      |
| 80-90	              |  fácil	            |     5º grado      |
| 90-100	          |  muy fácil	        |     4º grado      |

In [23]:
import textstat

textstat.fernandez_huerta(text)

96.7

## Índice de perspicuidad de Szigriszt-Pazos


| Perspicuidad	        | estilo	       |  tipo de publicación	        |       estudios                                |
|-----------|------------------|--------------------------------|-----------------------------------------------|
| 0 a 15	| muy difícil	   |  científica, filosófica	    |    titulados universitarios   |
| 16 a 35  	| árido	           |  pedagógica, técnica	        |    selectividad y estudios universitarios |
| 36 a 50  	| bastante difícil | literatura y divulgación	    |    cursos secundarios |
| 51 a 65  	| normal	       |  medios de comunicación	    |     popular   |
| 66 a 75  	| bastante fácil   |   novela, revista femenina 	|    12 años    |
| 76 a 85  	| fácil	           |  para quioscos	                |    11 años    |
| 86 a 100	| muy fácil	       |  cómics, tebeos y viñetas	    |     6 a 10 años   |


In [24]:
import textstat

textstat.szigriszt_pazos(text)

96.42

## Fórmula de Crawford

Años de escolaridad necesarios para entender el texto.

In [25]:
import textstat

textstat.crawford(text)

2.7

## Fórmula de comprensibilidad de Gutiérrez de Polini

Sólo para niños de 11-12 años. Cuanto más bajo el número, más difícil de leer.

In [26]:
import textstat

textstat.gutierrez_polini(text)

42.02

## Número de palabras

In [27]:
import textstat

textstat.lexicon_count(text, removepunct=True)

99

## Número de oraciones

In [28]:
import textstat

textstat.sentence_count(text)

5

## Tiempo de lectura

En segundos. 14,69 ms. por letra.

In [29]:
import textstat

textstat.reading_time(text, ms_per_char=14.69)

7.12

In [31]:
# streamlit_app.py
import spacy_streamlit

models = ["es_dep_news_trf", "es_core_news_lg"]
default_text = "La comida del restaurante de Miguel es muy sabrosa."
spacy_streamlit.visualize(models, default_text)

AttributeError: 'NoneType' object has no attribute 'replace'

# Entity Recogniser and Linker a Wikipedia

In [34]:
import spacy
# load your spacy pipeline
nlp = spacy.blank('es')

# add the pipeline stage with the configuration options:
nlp.add_pipe('dbpedia_spotlight', config={'confidence': 0.4})

# use it
doc = nlp(text)

# you can change the confidence if you have already instantiated the pipeline stage
nlp.get_pipe('dbpedia_spotlight').language_code='es'

# now recompute the document
doc = nlp(text)
# this now won't have any results

# print([(ent.text, ent.kb_id_, ent._.dbpedia_raw_result['@similarityScore']) for ent in doc.ents])
# print([(ent, ent.kb_id_) for ent in doc.ents])

mi_bucle=[(ent, ent.kb_id_) for ent in doc.ents]
print(*mi_bucle, sep='\n- ')

(Murcia, 'http://es.dbpedia.org/resource/Murcia')
- (colectivos, 'http://es.dbpedia.org/resource/Colectivos_de_Buenos_Aires')
- (SOS, 'http://es.dbpedia.org/resource/SOS')
- (Mar Menor, 'http://es.dbpedia.org/resource/Mar_Menor')
- (mar Menor, 'http://es.dbpedia.org/resource/Mar_Menor')
- (peces, 'http://es.dbpedia.org/resource/Pez')
- (Banco Asiático de Desarrollo, 'http://es.dbpedia.org/resource/Banco_Asiático_de_Desarrollo')
- (ADB, 'http://es.dbpedia.org/resource/Apple_Desktop_Bus')
- (Manila, 'http://es.dbpedia.org/resource/Manila')


# Language detection

# Automated Term Extraction

In [35]:
import spacy
from pyate.term_extraction_pipeline import TermExtractionPipeline

nlp = spacy.load("es_core_news_lg")
nlp.add_pipe("combo_basic")
doc = nlp(text)
print(doc._.combo_basic.sort_values(ascending=False).head(5))

situación de la laguna    1.386294
torno a la consigna       1.386294
toneladas de peces        1.098612
adopción de medidas       1.098612
grave crisis              0.693147
dtype: float64


# Extracción de entidades (NER)

*Entity recognizer*.

Statistical entity recognition system that assigns labels to contiguous spans of tokens. 

The default model identifies a variety of named and numeric entities, including companies, locations, organizations and products. You can add arbitrary classes to the entity recognition system, and update the model with new examples.

In [36]:
from spacy import displacy
import es_core_news_lg

nlp = es_core_news_lg.load()
doc2 = nlp(text)
options = {"colors": {"PER": "pink", "ORG": "cyan", "MISC": "yellow"}}
displacy.render(doc2, style="ent", jupyter=True, page=True, options=options)

In [37]:
import pandas as pd

pd.DataFrame([[ent.text, ent.label_, spacy.explain(ent.label_)] for ent in doc2.ents], columns=['Text', 'Label', 'Description'])

Unnamed: 0,Text,Label,Description
0,Con este lema miles de personas,MISC,"Miscellaneous entities, e.g. events, nationali..."
1,de Murcia,LOC,"Non-GPE locations, mountain ranges, bodies of ..."
2,Mar Menor,LOC,"Non-GPE locations, mountain ranges, bodies of ..."
3,Por un mar Menor,MISC,"Miscellaneous entities, e.g. events, nationali..."
4,Manuel,PER,Named person or family.
5,Banco Asiático de Desarrollo,LOC,"Non-GPE locations, mountain ranges, bodies of ..."
6,ADB,MISC,"Miscellaneous entities, e.g. events, nationali..."
7,Manila,LOC,"Non-GPE locations, mountain ranges, bodies of ..."


In [38]:
doc = nlp('Las naranjas y las manzanas se parecen')
naranja = doc[1]
manzana = doc[4]
naranja.similarity(manzana)

0.7107749

One amazing property of word vectors is that they represent analogies really well, for instance:
 
 * Argentina - Macri = Alemania - Merkel
 * Reina - Mujer = Rey - Hombre
 
We can also average out the vectors in a whole sentence to get a similarity between them. NLP methods that disregard the word order such as this one are commonly referred as **Bag of Words**

In [39]:
nlp = spacy.load('es_dep_news_trf')
doc = nlp(u"El presidente Aznar ha estado en Madrid")
doc2 = nlp(u"La premier alemana Angela Merkel visitó Buenos Aires esta semana")
doc.similarity(doc2)

  after removing the cwd from sys.path.
  after removing the cwd from sys.path.


0.0

## Training and updating

# Tokenization

During processing, spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on. The tokenizer is the **first component of the processing pipeline** and the only one that can't be replaced by writing to nlp.pipeline. It takes a text and returns a `Doc`, whereas all other components expect to already receive a tokenized Doc.

![Order of processes in pipeline](https://spacy.io/assets/img/pipeline.svg)


This is done by applying rules specific to each language. 

In [40]:
nlp = spacy.load('es_dep_news_trf')
for token in doc:
    print(token.text)

El
presidente
Aznar
ha
estado
en
Madrid


## Adding special case tokenization rules

Tokenization rules that are specific to one language, but can be generalised across that language should ideally live in the language data in `spacy/lang`. [Here](https://spacy.io/usage/linguistic-features#named-entities) is how to add them.

In [41]:
nlp = spacy.load('es_dep_news_trf')
for token in doc:
    print(token.text)

El
presidente
Aznar
ha
estado
en
Madrid


## Bringing your own annotations

Look [here](https://spacy.io/usage/linguistic-features#own-annotations) at what to do if your data is partially annotated, e.g. with pre-existing tokenization, part-of-speech tags, etc.

# Rule-based matching

# Language Processing Pipelines

Each pipeline component returns the processed Doc, which is then passed on to the next component.

# Adding languages

The **shared language data** in the directory root includes rules that can be generalised across languages – for example, rules for basic punctuation, emoji, emoticons, single-letter abbreviations and norms for equivalent tokens with different spellings, like `"` and `”`. This helps the models make more accurate predictions. The **individual language data** in a submodule contains rules that are only relevant to a particular language. 

In [42]:
from spacy.lang.en import English
from spacy.lang.es import Spanish

nlp_en = English() # includes English data
nlp_es = Spanish() # includes Spanish data

## Stop words

A "stop list" is a classic trick from the early days of information retrieval when search was largely about keyword presence and absence. It is still sometimes useful today to filter out common words from a bag-of-words model. To improve readability, [STOP_WORDS](https://spacy.io/usage/adding-languages#stop-words) are separated by spaces and newlines, and added as a multiline string.

It is in `/Users/macbook/anaconda3/lib/python3.6/site-packages/spacy/lang/es/stop_words.py`

## Lemmatizer

spaCy supports simple lookup-based [lemmatization](https://spacy.io/usage/adding-languages#lemmatizer). This is usually the quickest and easiest way to get started. The data is stored in a dictionary mapping a string to its lemma. To determine a token's lemma, spaCy simply looks it up in the table. It is in `/Users/macbook/anaconda3/lib/python3.6/site-packages/spacy/lang/es/lemmatizer.py`.

In [43]:
from spacy import lemmatizer
lemma_lookup = dict(LOOKUP)

ImportError: cannot import name 'lemmatizer'

# Tag map

Most treebanks define a custom part-of-speech tag scheme. You need to define how those symbols map down to the [Universal Dependencies tag set](http://universaldependencies.org/u/pos/all.html). This is done by providing a [tag map](https://spacy.io/usage/adding-languages#tag-map).


# Training SpaCy

Leer [Training spaCy's Statistical Models](https://spacy.io/usage/training).

## Training a language model

spaCy expects that common words will be cached in a Vocab instance. The vocabulary caches lexical features, and makes it easy to use information from unlabelled text samples in your models. Specifically, you'll usually want to collect word frequencies, and train word vectors. Note that your corpus should not be preprocessed (i.e. you need punctuation for example). 

To [generate the word frequencies](https://spacy.io/usage/adding-languages#training) from a large, raw corpus, you can use the [word_freqs.py](https://github.com/explosion/spacy-dev-resources/blob/master/training/word_freqs.py) script from the spaCy developer resources.

## Training the word vectors for similarity models

[Word2vec](https://en.wikipedia.org/wiki/Word2vec) and related algorithms let you train useful word similarity models from unlabelled text. This is a key part of using [deep learning](https://spacy.io/usage/deep-learning) for NLP with limited labelled data. The vectors are also useful by themselves – they power the `.similarity()` methods in spaCy. 

For best results, you should pre-process the text with spaCy before training the Word2vec model. This ensures your tokenization will match. You can use our [word vectors training script](https://github.com/explosion/spacy-dev-resources/blob/master/training/word_vectors.py), which pre-processes the text with your language-specific tokenizer and trains the model using [Gensim](https://radimrehurek.com/gensim/).

See [Training the word vectors](https://spacy.io/usage/adding-languages#word-vectors).

### Training the tagger and parser

You can now [train the tagger and parser model](https://spacy.io/usage/adding-languages#train-tagger-parser) using a corpus for your language annotated with [Universal Dependencies](http://universaldependencies.org/). If your corpus uses the [CoNLL-U](http://universaldependencies.org/docs/format.html) format, i.e. files with the extension `.conllu`, you can use the [`convert` ](https://spacy.io/api/cli#convert) command to convert it to spaCy's [JSON format](https://spacy.io/api/annotation#json-input) for training. Once you have your UD corpus transformed into JSON, you can train your model use the using spaCy's [`train` ](https://spacy.io/api/cli#train) command.

# Semantic Similarity and Word Vectors

https://spacy.io/usage/vectors-similarity

In [44]:
from __future__ import unicode_literals, print_function

import plac
import spacy


@plac.annotations(
    model=("Model to load", "positional", None, str))
def main(model='es_dep_news_trf'):
    nlp = spacy.load(model)
    print("Loaded model '%s'" % model)

    doc = nlp("displaCy uses CSS and JavaScript to show you how computers "
               "understand language")

    # The easiest way is to find the head of the subtree you want, and then use
    # the `.subtree`, `.children`, `.lefts` and `.rights` iterators. `.subtree`
    # is the one that does what you're asking for most directly:
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            print(''.join(w.text_with_ws for w in word.subtree))

    # It'd probably be better for `word.subtree` to return a `Span` object
    # instead of a generator over the tokens. If you want the `Span` you can
    # get it via the `.right_edge` and `.left_edge` properties. The `Span`
    # object is nice because you can easily get a vector, merge it, etc.
    for word in doc:
        if word.dep_ in ('xcomp', 'ccomp'):
            subtree_span = doc[word.left_edge.i : word.right_edge.i + 1]
            print(subtree_span.text, '|', subtree_span.root.text)

    # You might also want to select a head, and then select a start and end
    # position by walking along its children. You could then take the
    # `.left_edge` and `.right_edge` of those tokens, and use it to calculate
    # a span.

if __name__ == '__main__':
    plac.call(main)

    # Expected output:
    # to show you how computers understand language
    # how computers understand language
    # to show you how computers understand language | show
    # how computers understand language | understand


usage: ipykernel_launcher.py [-h] [model]
ipykernel_launcher.py: error: unrecognized arguments: -f


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [46]:
from typing import List
from thinc.types import Floats2d
from thinc.api import Model, PyTorchWrapper, chain, with_array
import spacy
from spacy.tokens.doc import Doc
from spacy.ml import CharacterEmbed
from torch import nn

@spacy.registry.architectures("CustomTorchModel.v1")
def create_torch_model(
    nO: int,
    width: int,
    hidden_width: int,
    embed_size: int,
    nM: int,
    nC: int,
    dropout: float,
) -> Model[List[Doc], List[Floats2d]]:
    char_embed = CharacterEmbed(width, embed_size, nM, nC)
    torch_model = nn.Sequential(
        nn.Linear(width, hidden_width),
        nn.ReLU(),
        nn.Dropout2d(dropout),
        nn.Linear(hidden_width, nO),
        nn.ReLU(),
        nn.Dropout2d(dropout),
        nn.Softmax(dim=1)
    )
    wrapped_pt_model = PyTorchWrapper(torch_model)
    model = chain(char_embed, with_array(wrapped_pt_model))
    return model