In [1]:
import re
import time
from pathlib import Path
import shutil
import glob
import string
import unicodedata
from datetime import datetime, timedelta

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select
from sqlalchemy import create_engine
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from unicodedata import normalize, category

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

# Spanish stopwords from the NLTK corpus, gathered into a set and printed.
stopset = {palabra for palabra in stopwords.words("spanish")}
print(stopset)

{'al', 'hubiesen', 'sentidos', 'habrán', 'tendremos', 'estando', 'tendrías', 'tuviese', 'habremos', 'tuvieses', 'otra', 'lo', 'habréis', 'hayan', 'nuestros', 'tendrá', 'estás', 'unos', 'habría', 'tendré', 'hubiera', 'teniendo', 'nuestro', 'has', 'una', 'de', 'tuvieron', 'estuve', 'fueses', 'fuéramos', 'esta', 'tenidos', 'esto', 'le', 'un', 'con', 'como', 'hubiéramos', 'hayamos', 'sería', 'está', 'hubiésemos', 'cuando', 'fuiste', 'estuvisteis', 'estáis', 'fuerais', 'estamos', 'estuviéramos', 'poco', 'estén', 'estaríais', 'se', 'tus', 'tuyas', 'estaréis', 'muchos', 'que', 'estuvo', 'tengamos', 'tendrán', 'estad', 'siente', 'habidos', 'estada', 'mí', 'donde', 'fuimos', 'estoy', 'a', 'sea', 'qué', 'otras', 'estar', 'ellos', 'tenías', 'ese', 'estuvieseis', 'somos', 'hayas', 'te', 'estuvieran', 'están', 'hubieses', 'esa', 'seríamos', 'estaré', 'estuviera', 'tendréis', 'sentidas', 'tiene', 'estuvieras', 'el', 'contra', 'estuviste', 'fueron', 'vosotras', 'tenidas', 'yo', 'mis', 'estos', 'serem

In [3]:
# Today's local date rendered as DDMMYYYY (day, month, four-digit year).
hoy = datetime.today()
hoy_format = f"{hoy:%d%m%Y}"
hoy_format

'24102021'

In [4]:
# Read the first .txt file found under news/<hoy_format>/ whose path has no
# component named "descartado", and keep its content in `new`.
#
# `new` is reset first so that a value left over from a previous run of this
# cell can never be mistaken for today's text.
new = None

for file in glob.iglob('news/{}/**/*.txt'.format(hoy_format), recursive=True):
    # Normalize "/" to "\" before splitting so both separator styles are handled.
    listado = file.replace('/', '\\').split('\\')

    if 'descartado' in listado:
        continue

    print(listado)
    with open(file, encoding='UTF-8') as f:
        new = f.read()

    # Only the first usable file is needed.
    break

# Fail here, with an explicit message, instead of letting a later cell die
# with a NameError on `new`.
if new is None:
    raise FileNotFoundError(
        'No usable .txt file found under news/{}/'.format(hoy_format)
    )

In [5]:
from unicodedata import normalize, category

def clean_text_1(_text):
    """
    Reduce a string to lowercase letters and space separators.

    The input is decomposed with NFD, so accented letters split into a base
    letter plus combining marks. Each resulting character is then filtered by
    its Unicode general category and the survivors are lowercased.

    Kept categories:
        Ll : lowercase letters
        Lu : uppercase letters
        Zs : space separators

    Everything else is dropped, for example:
        Nd : decimal digits
        Mn : non-spacing marks (the accents / diaereses split off by NFD)
        Ps : opening brackets and parentheses
        Pe : closing brackets and parentheses
        Pi : opening quotation marks
        Po : other punctuation (period, comma, ...)

    NOTE(review): before normalizing, every ' ' in the input is replaced by
    the literal 'ZZZ', which then passes the filter and comes out as 'zzz'.
    Confirm that this is intended.

    :param _text: string to clean.
    :return: the cleaned, lowercased string.
    """
    kept_categories = ('Ll', 'Zs', 'Lu')
    decomposed = normalize('NFD', _text.replace(' ', 'ZZZ'))

    cleaned_chars = []
    for char in decomposed:
        if category(char) in kept_categories:
            cleaned_chars.append(char.lower())

    return ''.join(cleaned_chars)

### SIN QUITAR STOPWORDS

In [None]:
# Split the loaded text into sentences, each sentence into word tokens, and
# normalize every token with clean_text_1. Tokens that are empty after
# cleaning (e.g. pure punctuation or digits) are discarded.
#
# Each token is cleaned exactly once: the cleaned value is both tested and kept.
tokenizacion = [
    [
        cleaned
        for cleaned in map(clean_text_1, nltk.word_tokenize(sentence))
        if cleaned
    ]
    for sentence in nltk.sent_tokenize(new)
]

for oracion in tokenizacion:
    print(oracion)
    print("-" * 100)

### QUITANDO STOPWORDS

In [None]:
# Run every stopword through the same cleaning as the tokens, so that the
# comparison between both happens on equally normalized strings.
stopset_cleaned = list(map(clean_text_1, stopset))
print(stopset_cleaned)

In [None]:
# Tokenize the loaded text sentence by sentence, normalize each token with
# clean_text_1 and keep only tokens that are non-empty and not stopwords.
#
# A set gives constant-time membership tests instead of scanning the
# stopset_cleaned list for every token.
stopset_lookup = set(stopset_cleaned)

# Each token is cleaned exactly once; the cleaned value is tested and kept.
tokenizacion = [
    [
        cleaned
        for cleaned in map(clean_text_1, nltk.word_tokenize(sentence))
        if cleaned and cleaned not in stopset_lookup
    ]
    for sentence in nltk.sent_tokenize(new)
]

for oracion in tokenizacion:
    # Sentences left without any token are not printed.
    if not oracion:
        continue

    print(oracion)
    print("-" * 100)

### LEMATIZACION

In [None]:
# Uncomment the next line the first time this notebook is run.
#!pip install spacy

In [None]:
# Uncomment the next line the first time this notebook is run.
#!pip install stanza

In [None]:
# Uncomment the next line the first time this notebook is run.
#!pip install "spacy-stanza<0.3.0" --user

In [None]:
import stanza

In [None]:
# Uncomment the next line the first time this notebook is run.
#stanza.download('es')

In [None]:
from spacy_stanza import StanzaLanguage

In [None]:
# Build a Stanza pipeline for Spanish ("es") and wrap it in StanzaLanguage.
# The wrapped `nlp` object is what later cells call on text; the tokens it
# yields are read through their `lemma_` attribute.
snlp = stanza.Pipeline(lang="es")
nlp = StanzaLanguage(snlp)

In [None]:
# Tokenize the loaded text sentence by sentence, normalize each token with
# clean_text_1, drop empty tokens and stopwords, and replace every remaining
# token by its lemma.
#
# A set gives constant-time membership tests instead of scanning the
# stopset_cleaned list for every token.
stopset_lookup = set(stopset_cleaned)

# Each token is cleaned exactly once. `nlp` is run on a single cleaned word;
# if it yields several tokens for that word, their lemmas are concatenated
# back into one string.
tokenizacion = [
    [
        ''.join(token.lemma_ for token in nlp(cleaned))
        for cleaned in map(clean_text_1, nltk.word_tokenize(sentence))
        if cleaned and cleaned not in stopset_lookup
    ]
    for sentence in nltk.sent_tokenize(new)
]

for oracion in tokenizacion:
    # Sentences left without any token are not printed.
    if not oracion:
        continue

    print(oracion)
    print("-" * 100)

### REGRESANDO A MODO TEXTO

In [None]:
# Rebuild one space-separated string per sentence from its token list.
textos = list(map(' '.join, tokenizacion))
print(textos)

### BOLSA DE PALABRAS

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer with its default settings, fit it on the
# per-sentence strings in `textos`, and keep the result of fit_transform
# in `cv_fit` for the inspection cells that follow.
cv = CountVectorizer()
cv_fit = cv.fit_transform(textos)

In [None]:
# Show the notebook representation of the fit_transform result.
cv_fit

In [None]:
# List the attribute names of cv_fit, leaving out any name containing "__".
print(list(filter(lambda attr: '__' not in attr, dir(cv_fit))))

In [None]:
# Print the vocabulary terms in column order.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so `get_feature_names_out` is used when the installed version provides it.
# `.tolist()` turns the returned array into a plain list of str, keeping the
# printed form the same as the old method's.
if hasattr(cv, "get_feature_names_out"):
    print(cv.get_feature_names_out().tolist())
else:
    print(cv.get_feature_names())

In [None]:
# Print the fitted vocabulary mapping held by the vectorizer
# (scikit-learn documents `vocabulary_` as term -> column index).
print(cv.vocabulary_)

In [None]:
# Number of entries in the fitted vocabulary.
tamanio_vocabulario = len(cv.vocabulary_)
tamanio_vocabulario

In [None]:
# Number of strings in `textos`, i.e. how many texts were given to the vectorizer.
cantidad_textos = len(textos)
cantidad_textos

In [None]:
# Sum of every count stored in the matrix, taken directly on the sparse
# matrix so it never has to be expanded into a dense array.
# NOTE(review): despite its name, this is the total of all counts, not the
# number of zero cells (nor the number of non-zero cells when a term occurs
# more than once in the same text).
espacios_con_ceros = cv_fit.sum()
espacios_con_ceros

### Porcentaje de ceros

In [None]:
# Fraction of cells of the matrix that are zero.
# Computed from the fitted matrix instead of numbers copied from one past
# run, so it stays correct whenever the input text changes.
# `nnz` (stored non-zero cells) is used rather than the sum of counts: a term
# repeated inside one text still occupies a single cell.
sparsity = 1 - cv_fit.nnz / (cv_fit.shape[0] * cv_fit.shape[1])
sparsity