In [None]:
!python -m spacy download en_core_web_sm

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
import PyPDF2
import unidecode
import contractions
import spacy


Descargar recursos de NLTK

In [None]:
# NLTK data needed by the tokenizer, stop-word list, POS tagger and WordNet
# lemmatizer used in this notebook. NLTK 3.9+ looks up the tokenizer and the
# tagger under new resource names ('punkt_tab', 'averaged_perceptron_tagger_eng'),
# while older releases use the original names, so both are requested.
# NOTE(review): on releases that do not know one of these ids, nltk.download()
# is expected to print an error and return False rather than raise; confirm.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)

for resource in NLTK_RESOURCES:
    nltk.download(resource)

Inicializar spaCy (modelo `en_core_web_sm`). Nota: la lematización y el POS tagging de `preprocess_text` se realizan con NLTK, no con spaCy.

In [None]:
# Name of the spaCy pipeline to load; it is the same model fetched by the
# download cell at the top of the notebook.
SPACY_MODEL = 'en_core_web_sm'
nlp = spacy.load(SPACY_MODEL)

Función para extraer texto de un archivo PDF

In [None]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text extracted from each page, in page order, joined with
        newlines. A page that yields no text contributes an empty string.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # Pages are extracted inside the `with` block so the reader's
        # underlying file is still open. `or ''` guards against a page whose
        # extraction returns nothing (None), which would otherwise break the
        # string join.
        page_texts = [page.extract_text() or '' for page in reader.pages]
    # A newline between pages keeps the last word of one page from fusing
    # with the first word of the next, which would corrupt tokenization.
    return '\n'.join(page_texts)

Función para preprocesar el texto

In [None]:

def preprocess_text(text):
    """Normalize raw text and derive tokens, lemmas, stems and POS tags.

    The text has its contractions expanded and diacritics removed, is
    tokenized and POS-tagged, and is then reduced to lower-cased alphabetic
    tokens that are not English stop words.

    Args:
        text: Raw document text.

    Returns:
        A 4-tuple ``(tokens, lemmas, stems, pos_tags)`` of equal-length lists:
        the kept lower-cased tokens, their WordNet lemmas, their Porter stems,
        and ``(token, tag)`` pairs for those same tokens.
    """
    # Expand contractions.
    text = contractions.fix(text)

    # Remove diacritics.
    text = unidecode.unidecode(text)

    # Tokenize the normalized text.
    raw_tokens = word_tokenize(text)

    # Tag the complete token sequence *before* filtering: the tagger sees the
    # original casing, stop words and punctuation as context. Tagging only the
    # lower-cased, stop-word-stripped tokens would throw that context away.
    tagged_tokens = nltk.pos_tag(raw_tokens)

    # Keep alphabetic, non-stop-word tokens (lower-cased), carrying each
    # token's tag along so that `tokens` and `pos_tags` stay aligned.
    stop_words = set(stopwords.words('english'))
    pos_tags = [
        (token.lower(), tag)
        for token, tag in tagged_tokens
        if token.isalpha() and token.lower() not in stop_words
    ]
    tokens = [token for token, _ in pos_tags]

    # Lemmatization and stemming of the kept tokens. lemmatize() is called
    # without a POS argument, so the lemmatizer's default applies.
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    stems = [stemmer.stem(token) for token in tokens]

    return tokens, lemmas, stems, pos_tags

Leer y preprocesar el texto de todos los PDFs del directorio

In [None]:
# Directory that is scanned for PDF files.
pdf_directory = 'pdf/'

# One entry per PDF: the (tokens, lemmas, stems, pos_tags) tuple returned by
# preprocess_text().
documents = []

# os.listdir() returns names in arbitrary order, so the names are sorted to
# make the document order reproducible between runs. The extension check is
# case-insensitive so that files such as 'REPORT.PDF' are not silently skipped.
for filename in sorted(os.listdir(pdf_directory)):
    if filename.lower().endswith('.pdf'):
        pdf_path = os.path.join(pdf_directory, filename)
        text = extract_text_from_pdf(pdf_path)
        documents.append(preprocess_text(text))

Ejemplo de cómo manejar los resultados

In [None]:
# Preview the first two entries of each preprocessing output, document by document.
for document in documents:
    tokens, lemmas, stems, pos_tags = document
    labelled_outputs = (
        ("Tokens:", tokens),
        ("Lemmas:", lemmas),
        ("Stems:", stems),
        ("POS tags:", pos_tags),
    )
    for label, values in labelled_outputs:
        print(label, values[:2])

Convertir documentos a un DataFrame

In [None]:
# One row per document, one column per element of the tuple stored in
# `documents`. Given how `documents` is filled, 'text' holds the token list,
# 'A' the lemmas, 'B' the stems and 'C' the (token, tag) pairs.
# NOTE(review): the column names do not reflect their contents; renaming them
# would require updating the cell that reads df['text'].
document_columns = ['text', 'A', 'B', 'C']
df = pd.DataFrame(data=documents, columns=document_columns)


Tokens y eliminación de stopwords (la tokenización ya se realizó en `preprocess_text`)

In [None]:
# The 'text' column is copied under the name 'tokens' for the steps below.
df['tokens'] = df['text']

stop_words = set(stopwords.words('english'))


def _drop_stop_words(words):
    """Return the words whose lower-cased form is not in ``stop_words``."""
    return [word for word in words if word.lower() not in stop_words]


df['filtered_tokens'] = df['tokens'].apply(_drop_stop_words)



Concatenar todas las palabras de todos los documentos para análisis de frecuencia

In [None]:
# Flatten every document's filtered tokens into one lower-cased word list,
# then count how often each word occurs across the whole corpus.
all_words = []
for document_tokens in df['filtered_tokens']:
    all_words.extend(token.lower() for token in document_tokens)

word_freq = Counter(all_words)


## Nube de Palabras

In [None]:
# Build a word cloud in which each word is sized by its corpus-wide frequency.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate_from_frequencies(word_freq)

# Show the cloud as an image with no axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.set_title('Nube de Palabras')
ax.axis('off')
plt.show()



## Distribución de Términos

In [None]:
# Bar chart of the ten most frequent words and their counts.
top_entries = word_freq.most_common(10)
common_words = [word for word, _ in top_entries]
top_counts = [count for _, count in top_entries]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(common_words, top_counts, color='lightgreen')
ax.set_title('Frecuencia de Palabras Comunes')
ax.set_xlabel('Palabras')
ax.set_ylabel('Frecuencia')
plt.xticks(rotation=45)
plt.show()