# Pre-procesamiento de texto

## Conversion de mayúsculas o minúsculas

In [1]:
oracion = 'ESTE es UN ejemplo DE mayusculas Y minusculas'
oracion

'ESTE es UN ejemplo DE mayusculas Y minusculas'

In [2]:
oracion.lower()

'este es un ejemplo de mayusculas y minusculas'

In [3]:
oracion.upper()

'ESTE ES UN EJEMPLO DE MAYUSCULAS Y MINUSCULAS'

In [4]:
oracion.title()

'Este Es Un Ejemplo De Mayusculas Y Minusculas'

## Tokens

In [5]:
import nltk
print(nltk.word_tokenize(oracion))

['ESTE', 'es', 'UN', 'ejemplo', 'DE', 'mayusculas', 'Y', 'minusculas']


## Acentos

In [6]:
import unicodedata

def removerAcentos(texto):
    texto = unicodedata.normalize('NFKD', texto).encode('ascii','ignore').decode('utf-8','ignore')
    return texto

In [7]:
oracion = 'Esté es Un ÉJÉmpLó de data acentÚADá'
oracion

'Esté es Un ÉJÉmpLó de data acentÚADá'

In [8]:
removerAcentos(oracion.lower())

'este es un ejemplo de data acentuada'

## Caracteres especiales, números y símbolos

In [9]:
import re
def removerCaracteresEspecialesNumerosSimbolos(texto, removerDigitos =False):
    patron = r'[^a-zA-Z0-9\s]' if not removerDigitos else r'[^a-zA-Z\s]'
    texto = re.sub(patron,'', texto)
    return texto
oracion = 'Hoy es el partido de fútbol a las 7:45 @$%&'
removerCaracteresEspecialesNumerosSimbolos(removerAcentos(oracion))


'Hoy es el partido de futbol a las 745 '

In [10]:
removerCaracteresEspecialesNumerosSimbolos(removerAcentos(oracion), True)

'Hoy es el partido de futbol a las  '

## Contracciones en el texto

In [11]:
!pip install contractions
!pip install textsearch

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.0.0-cp311-cp311-macosx_10_9_universal2.whl (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.6/63.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.0.0 textsearch-0.0.24

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m

In [12]:
texto = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
texto

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [13]:
import contractions
list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [14]:
contractions.fix(texto)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

## Stop Words

In [15]:
oracion = 'ESTE es UN ejemplo DE mayusculas Y minusculas'
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/jyass/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
tokens = nltk.word_tokenize(oracion.lower())
tokens

['este', 'es', 'un', 'ejemplo', 'de', 'mayusculas', 'y', 'minusculas']

In [17]:
stop_words_spanish = nltk.corpus.stopwords.words('spanish')
tokens_filtrados = [token for token in tokens if token not in stop_words_spanish]
tokens_filtrados

['ejemplo', 'mayusculas', 'minusculas']

In [18]:
stop_words_english = nltk.corpus.stopwords.words('english')

In [19]:
tokens_english = nltk.word_tokenize(contractions.fix(texto))
tokens_filtrados_english = [token for token in tokens_english if token not in stop_words_english]
texto


"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [20]:
tokens_filtrados_english

['You',
 'expand',
 'contractions',
 'I',
 'would',
 'think',
 '!',
 'You',
 'would',
 'able',
 '.',
 'How',
 '?']

## Stemming

In [22]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()
#ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')
ps.stem('argue'), ps.stem('argued'), ps.stem('arguing'), ps.stem('argus')


('argu', 'argu', 'argu', 'argu')

## Lemmatization

In [23]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [25]:
#Lemmatize sustantivos
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))
print(wnl.lemmatize('studying', 'n'))

car
box
studying


In [26]:
#Lemmatize verbos
print(wnl.lemmatize('jumped', 'v'))
print(wnl.lemmatize('jumping', 'v'))
print(wnl.lemmatize('studying', 'v'))

jump
jump
study


In [27]:
#Lemmatize adjetivos
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy
