In [2]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter
import PyPDF2
import unidecode
import contractions
import spacy


In [3]:
# Descargar recursos de NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joseaguilar/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joseaguilar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/joseaguilar/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/joseaguilar/nltk_data...


True

In [4]:
# Inicializar spaCy para lematización y POS tagging
nlp = spacy.load('en_core_web_sm')

In [5]:
# Función para extraer texto de un archivo PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        text = ''
        for page in reader.pages:
            text += page.extract_text()
    return text

In [6]:
# Función para preprocesar el texto
def preprocess_text(text):
    # Expandir contracciones
    text = contractions.fix(text)
    
    # Eliminar diacríticos
    text = unidecode.unidecode(text)
    
    # Tokenización y eliminación de puntuación
    tokens = word_tokenize(text)
    
    # Convertir a minúsculas y eliminar stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token.lower() for token in tokens if token.isalpha() and token.lower() not in stop_words]
    
    # Lematización y stemming
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    stems = [stemmer.stem(token) for token in tokens]
    
    # Part of Speech Tagging
    pos_tags = nltk.pos_tag(tokens)
    
    return tokens, lemmas, stems, pos_tags

In [7]:
# Directorio donde se encuentran los PDFs
pdf_directory = 'pdf/'

# Leer texto de todos los PDFs en el directorio
documents = []
for filename in os.listdir(pdf_directory):
    if filename.endswith('.pdf'):
        pdf_path = os.path.join(pdf_directory, filename)
        text = extract_text_from_pdf(pdf_path)
        documents.append(preprocess_text(text))

In [8]:
# Ejemplo de cómo manejar los resultados
for tokens, lemmas, stems, pos_tags in documents:
    print("Tokens:", tokens[:10])
    print("Lemmas:", lemmas[:10])
    print("Stems:", stems[:10])
    print("POS tags:", pos_tags[:10])

Tokens: ['provided', 'proper', 'attribution', 'provided', 'google', 'hereby', 'grants', 'permission', 'reproduce', 'tables']
Lemmas: ['provided', 'proper', 'attribution', 'provided', 'google', 'hereby', 'grant', 'permission', 'reproduce', 'table']
Stems: ['provid', 'proper', 'attribut', 'provid', 'googl', 'herebi', 'grant', 'permiss', 'reproduc', 'tabl']
POS tags: [('provided', 'VBN'), ('proper', 'JJ'), ('attribution', 'NN'), ('provided', 'VBD'), ('google', 'JJ'), ('hereby', 'NN'), ('grants', 'NNS'), ('permission', 'NN'), ('reproduce', 'VBP'), ('tables', 'NNS')]
Tokens: ['journal', 'machine', 'learning', 'research', 'submitted', 'published', 'aneural', 'probabilistic', 'language', 'model']
Lemmas: ['journal', 'machine', 'learning', 'research', 'submitted', 'published', 'aneural', 'probabilistic', 'language', 'model']
Stems: ['journal', 'machin', 'learn', 'research', 'submit', 'publish', 'aneur', 'probabilist', 'languag', 'model']
POS tags: [('journal', 'JJ'), ('machine', 'NN'), ('learn

[(['provided',
   'proper',
   'attribution',
   'provided',
   'google',
   'hereby',
   'grants',
   'permission',
   'reproduce',
   'tables',
   'figures',
   'paper',
   'solely',
   'use',
   'journalistic',
   'scholarly',
   'works',
   'attention',
   'need',
   'ashish',
   'vaswani',
   'google',
   'brain',
   'avaswani',
   'shazeer',
   'google',
   'brain',
   'noam',
   'parmar',
   'google',
   'research',
   'nikip',
   'uszkoreit',
   'google',
   'research',
   'usz',
   'llion',
   'jones',
   'google',
   'research',
   'llion',
   'gomez',
   'university',
   'toronto',
   'aidan',
   'kaiser',
   'google',
   'brain',
   'lukaszkaiser',
   'illia',
   'polosukhin',
   'abstract',
   'dominant',
   'sequence',
   'transduction',
   'models',
   'based',
   'complex',
   'recurrent',
   'convolutional',
   'neural',
   'networks',
   'include',
   'encoder',
   'decoder',
   'best',
   'performing',
   'models',
   'also',
   'connect',
   'encoder',
   'decoder',

In [16]:
# Convertir documentos a un DataFrame
df = pd.DataFrame(documents, columns=['text','A','B','C'])


Unnamed: 0,text,A,B,C
0,"[provided, proper, attribution, provided, goog...","[provided, proper, attribution, provided, goog...","[provid, proper, attribut, provid, googl, here...","[(provided, VBN), (proper, JJ), (attribution, ..."


In [14]:


# Tokenización de palabras
df['tokens'] = df['text'].apply(word_tokenize)

# Eliminación de stopwords
stop_words = set(stopwords.words('english'))
df['filtered_tokens'] = df['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Concatenar todas las palabras de todos los documentos para análisis de frecuencia
all_words = [word.lower() for tokens in df['filtered_tokens'] for word in tokens]
word_freq = Counter(all_words)

# Nube de Palabras
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title('Nube de Palabras')
plt.axis('off')
plt.show()

# Distribución de Términos
common_words = [word for word, freq in word_freq.most_common(10)]

plt.figure(figsize=(10, 6))
plt.bar(common_words, [word_freq[word] for word in common_words], color='lightgreen')
plt.title('Frecuencia de Palabras Comunes')
plt.xlabel('Palabras')
plt.ylabel('Frecuencia')
plt.xticks(rotation=45)
plt.show()

TypeError: expected string or bytes-like object, got 'list'