# Word Lemmatizer

La lematización lleva cada palabra a su lema, es decir, a su forma canónica o de diccionario (por ejemplo, "singing" → "sing").

<a href="https://es.wikipedia.org/wiki/Lematizaci%C3%B3n">Wiki</a>

```python
# Load NLTK's English stop-word list and preview its size and first entries.
from nltk.corpus import stopwords
sw_english = stopwords.words("english")
len(sw_english), sw_english[:5]
```

In [1]:
# Load NLTK's English stop-word list; it is reused later when the
# vectorizer's stop words are built.
from nltk.corpus import stopwords
sw_english = stopwords.words("english")
# Displayed by the notebook: (number of stop words, first five entries).
len(sw_english), sw_english[:5]

(179, ['i', 'me', 'my', 'myself', 'we'])

In [2]:
from nltk.stem import WordNetLemmatizer

In [3]:
import nltk
# Download the NLTK data packages this notebook relies on (stop words,
# WordNet data, the POS tagger model and the tokenizer model).
# NOTE(review): newer NLTK releases may expect differently named resources
# (e.g. "punkt_tab", "averaged_perceptron_tagger_eng") - confirm against the
# installed NLTK version.
nltk.download('omw-1.4')
nltk.download("averaged_perceptron_tagger")
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jahof\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jahof\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jahof\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jahof\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jahof\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Sample word used throughout the lemmatization examples.
palabra = "singing"
lemmatizer = WordNetLemmatizer()
# No part-of-speech is passed here, and the word comes back unchanged
# ('singing'); the following cells pass the POS explicitly to get 'sing'.
lemmatizer.lemmatize(palabra)

'singing'

In [5]:
from nltk import pos_tag, word_tokenize

In [6]:
# Tokenize the sample word and tag it; the result is a list of
# (token, tag) pairs.
tag = pos_tag(word_tokenize(palabra))
tag

[('singing', 'VBG')]

In [7]:
# Take the first letter of the tag produced by pos_tag for the first token.
tag_letter = tag[0][1][0].lower()
# Translate it to the POS code expected by the lemmatizer. Penn Treebank
# adjective tags start with "J" (JJ, JJR, JJS), so "j" must map to "a";
# checking for "a" directly never matches an adjective tag.
tag_letter = {"j": "a", "a": "a", "r": "r", "n": "n", "v": "v"}.get(tag_letter)

# Without a usable POS code the word is left untouched.
lemma = palabra if not tag_letter else lemmatizer.lemmatize(palabra, tag_letter)
lemma

'sing'

In [7]:
def get_lemma(lmtz, word):
    """Return the lemma of ``word`` using the POS tag of its first token.

    Args:
        lmtz: Lemmatizer exposing ``lemmatize(word, pos)`` (for example an
            ``nltk.stem.WordNetLemmatizer`` instance).
        word: The word to lemmatize.

    Returns:
        The lemma computed by ``lmtz`` when the tag maps to one of the POS
        codes "a", "r", "n" or "v"; otherwise ``word`` unchanged.
    """
    tokens = word_tokenize(word)
    if not tokens:
        # Nothing to tag (e.g. empty or whitespace-only input): indexing the
        # tagger output would raise IndexError, so return the input as-is.
        return word

    # First letter of the tag assigned to the first token.
    tag_letter = pos_tag(tokens)[0][1][0].lower()
    # Penn Treebank adjective tags start with "J" (JJ, JJR, JJS), so "j" has
    # to be translated to the lemmatizer's "a"; the other letters coincide.
    pos_code = {"j": "a", "a": "a", "r": "r", "n": "n", "v": "v"}.get(tag_letter)
    return lmtz.lemmatize(word, pos_code) if pos_code else word

# Check the helper on the sample word.
get_lemma(lemmatizer, palabra)

'sing'

In [8]:
# Un twit de prueba
# A sample tweet
tweet = "Happy Mama's day to all mothers"

# Whitespace-separated words of the tweet
tweet_words = tweet.split()

# Lemmatize each word independently
tweet_lemmas = [
    get_lemma(lemmatizer, w)
    for w in tweet_words
]

# Rebuild the sentence from the lemmas
tweet_lemmatized = " ".join(tweet_lemmas)
tweet_lemmatized

"Happy Mama's day to all mother"

In [9]:
# Check whether "day" is already part of the English stop-word list
# (it is appended manually when the vectorizer's stop words are built).
"day" in sw_english

False

# TF-IDF

In [10]:
import re
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import pandas as pd

In [11]:
def text_preprocessor(text, lmtz):
    """Clean ``text`` and lemmatize every remaining word.

    The text is lowercased, punctuation is replaced by spaces, tokens that
    contain digits are dropped, typographic quotes/ellipsis/guillemets are
    removed and newlines become spaces. Each whitespace-separated word is
    then lemmatized on its own.

    Args:
        text: Raw input string.
        lmtz: Lemmatizer exposing ``lemmatize(word, pos)`` (for example an
            ``nltk.stem.WordNetLemmatizer`` instance).

    Returns:
        The cleaned, lemmatized words joined by single spaces (an empty
        string when no word survives the cleaning).
    """
    text = text.lower()
    # Regex literals are raw strings so that "\[", "\w", "\d"... are not
    # treated as (invalid) string escapes; the patterns themselves are unchanged.
    # NOTE(review): this only matches bracketed spans ending in "¿]%"; it may
    # have been meant to strip any "[...]" span - confirm before changing.
    text = re.sub(r'\[.*?¿\]\%', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # Drop every token that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…«»]', '', text)
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    text = re.sub('\n', ' ', text)

    # First letter of the tagger's tag -> POS code understood by the
    # lemmatizer. Penn Treebank adjective tags start with "J" (JJ, JJR, JJS),
    # so "j" must be translated to "a"; the other letters coincide.
    pos_codes = {"j": "a", "a": "a", "r": "r", "n": "n", "v": "v"}

    def get_lemma(lmtz, word):
        """Lemmatize ``word`` according to the POS tag of its first token."""
        tokens = word_tokenize(word)
        if not tokens:
            # Nothing to tag: avoid indexing into an empty tagger result.
            return word
        tag_letter = pos_tag(tokens)[0][1][0].lower()
        pos_code = pos_codes.get(tag_letter)
        return lmtz.lemmatize(word, pos_code) if pos_code else word

    return ' '.join(get_lemma(lmtz, word) for word in text.split())

# English stop words plus "day", which the stock list does not contain;
# this list is handed to the vectorizer as its stop_words.
modified_stop_words = sw_english + ['day']

In [12]:
# Two-step pipeline: TF-IDF features over lemmatized text, then a
# logistic-regression classifier.
pipe = Pipeline([
    (
        "vectorizer",
        TfidfVectorizer(
            analyzer="word", # Build features from words, not characters
            # Custom cleaning + lemmatization applied to each raw document.
            preprocessor=lambda x: text_preprocessor(x, lemmatizer),
            sublinear_tf=True, # Sublinear tf scaling: 1 + log(tf) instead of the raw count
            min_df=0.15, # A float is a minimum *proportion* of documents; consider an absolute count (e.g. 5) with more data to limit overfitting
            norm='l2', # L2-normalize each document vector (original note: meant to reduce overfitting, may lower the training score)
            encoding='latin-1', # NOTE(review): only relevant when the input is bytes/files; the documents fitted below are str - confirm it is needed
            ngram_range=(1, 2), # Use single words and pairs of consecutive words
            stop_words= modified_stop_words
        )
    ),
    (
        "classifier",
        LogisticRegression()
    )
])

In [13]:
# Fit the whole pipeline on a two-document toy corpus (the sample tweet plus
# a made-up sentence) labelled 0 and 1.
pipe.fit([tweet, "singing mother this singing day is mothers singing another sample Mothers"], [0, 1])



In [14]:
# TF-IDF row for the tweet, using only the fitted vectorizer step (pipe[0]),
# converted to a dense array for display.
frecuencias = pipe[0].transform([tweet]).toarray()
frecuencias

array([[0.        , 0.        , 0.47107781, 0.47107781, 0.47107781,
        0.47107781, 0.33517574, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        ]])

In [15]:
# Terms (single words and word pairs) kept by the fitted vectorizer.
palabras = pipe[0].get_feature_names_out()
palabras

array(['another', 'another sample', 'happy', 'happy mama', 'mama',
       'mama mother', 'mother', 'mother sing', 'sample', 'sample mother',
       'sing', 'sing another', 'sing mother'], dtype=object)

In [16]:
# Show the tweet's TF-IDF weights with each column labelled by its term.
pd.DataFrame(
    pipe[0].transform([tweet]).toarray(),
    columns=palabras
)

Unnamed: 0,another,another sample,happy,happy mama,mama,mama mother,mother,mother sing,sample,sample mother,sing,sing another,sing mother
0,0.0,0.0,0.471078,0.471078,0.471078,0.471078,0.335176,0.0,0.0,0.0,0.0,0.0,0.0
