In [2]:
# pip install spacy python-docx PyPDF2 pandas
# python -m spacy download en_core_web_sm


Using NER and Python, write code that loads a file (DOCX, CSV, PDF, or JSON), runs an NLP analysis (tokenization, stop-word removal, lemmatization, stemming, etc.), and finally returns a dict with the names, dates, and places found in it.

In [None]:
import spacy
import docx
import PyPDF2
import pandas as pd
import json
from datetime import datetime

# Load the pre-trained spaCy English pipeline once at module level.
# extract_entities() below calls this shared `nlp` object for NER.
# The "en_core_web_sm" model must be installed first (see the
# `python -m spacy download en_core_web_sm` command at the top of the notebook).
nlp = spacy.load("en_core_web_sm")

# Function to extract text from DOCX files
def extract_text_docx(file_path):
    """Return the text of a DOCX file, one paragraph per line.

    Args:
        file_path: Location of the .docx document, handed to ``docx.Document``.

    Returns:
        str: The ``text`` of every entry in the document's ``paragraphs``,
        joined with newline characters.
    """
    document = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in document.paragraphs)

# Function to extract text from PDF files
def extract_text_pdf(file_path):
    """Return the extracted text of a PDF file, one page per line group.

    Args:
        file_path: Location of the PDF; opened here in binary mode.

    Returns:
        str: The result of ``extract_text()`` for each page, in page order,
        joined with newline characters.
    """
    with open(file_path, 'rb') as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        # Extraction must happen while the underlying stream is still open.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf.pages]
    return "\n".join(page_texts)

# Function to extract text from CSV files
def extract_text_csv(file_path):
    """Return every non-empty cell of a CSV file as one space-separated string.

    Cells are read verbatim as strings (``dtype=str``) so values such as
    ``00123`` keep their leading zeros, and empty cells are skipped instead of
    being rendered as the literal word ``nan``, which would otherwise end up in
    the text handed to NER.

    Args:
        file_path: Path (or any other input accepted by ``pandas.read_csv``).

    Returns:
        str: Data-cell values in row-major order, joined by single spaces.
        The header row is not part of the output.
    """
    # keep_default_na=False stops pandas from converting "", "NA", "null", ...
    # into NaN; together with dtype=str every cell stays exactly as written.
    df = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    cells = df.to_numpy().ravel()
    # The isinstance guard also drops any stray non-string filler value
    # (e.g. NaN) should the parser still produce one.
    return " ".join(cell for cell in cells if isinstance(cell, str) and cell)

# Function to extract text from JSON files
def extract_text_json(file_path):
    """Return the contents of a JSON file re-serialized as a single string.

    Args:
        file_path: Location of the JSON document.

    Returns:
        str: ``json.dumps`` of the parsed document, with non-ASCII characters
        kept as-is so that names and places stay readable for NER.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # mis-decodes non-ASCII text on some systems. "utf-8-sig" reads plain UTF-8
    # and also strips a leading BOM, which json.load would otherwise reject.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        data = json.load(file)
    # ensure_ascii=False keeps characters such as "é" literal instead of
    # turning them into backslash-u escape sequences inside the output text.
    return json.dumps(data, ensure_ascii=False)

# Unambiguous date layouts tried, in order, when normalizing DATE entities.
# Month names are matched by strptime according to the current locale.
_DATE_FORMATS = (
    '%B %d, %Y',  # March 5, 2021
    '%b %d, %Y',  # Mar 5, 2021
    '%d %B %Y',   # 5 March 2021
    '%d %b %Y',   # 5 Mar 2021
    '%Y-%m-%d',   # 2021-03-05
)


def _normalize_date(raw_date):
    """Return raw_date as YYYY-MM-DD if it matches a known layout, else unchanged."""
    # Collapse runs of whitespace (entities can span a line break in the text).
    candidate = " ".join(raw_date.split())
    for date_format in _DATE_FORMATS:
        try:
            return datetime.strptime(candidate, date_format).strftime('%Y-%m-%d')
        except ValueError:
            continue
    # Relative or partial dates ("last week", "2021") are kept as found.
    return raw_date


# Function to perform NER and extract relevant entities
def extract_entities(text):
    """Run spaCy NER over ``text`` and collect people, dates and places.

    Args:
        text: The raw text to analyse with the module-level ``nlp`` pipeline.

    Returns:
        dict: ``{"names": [...], "dates": [...], "places": [...]}`` where
        names are PERSON entities, places are GPE (geopolitical) entities, and
        dates are DATE entities converted to ``YYYY-MM-DD`` when they match one
        of the layouts in ``_DATE_FORMATS`` and left as the original entity
        text otherwise. Entities appear in document order; repeats are kept.
    """
    doc = nlp(text)
    names = []
    dates = []
    places = []

    # Iterate over recognized entities
    for ent in doc.ents:
        if ent.label_ == "PERSON":
            names.append(ent.text)
        elif ent.label_ == "DATE":
            dates.append(_normalize_date(ent.text))
        elif ent.label_ == "GPE":  # GPE: Geopolitical Entity (places)
            places.append(ent.text)

    return {
        "names": names,
        "dates": dates,
        "places": places
    }

# Function to process the file and return entities
def process_file(file_path):
    """Extract named entities from a DOCX, PDF, CSV or JSON file.

    The file type is taken from the lower-cased text after the last ``.`` in
    ``file_path``.

    Args:
        file_path: Path string of the file to analyse.

    Returns:
        dict: The result of ``extract_entities`` for the file's text, or
        ``{"error": "Unsupported file format"}`` when the extension is not one
        of docx, pdf, csv or json.
    """
    suffix = file_path.split('.')[-1].lower()

    # Reject unknown formats up front, before touching the file.
    if suffix not in ('docx', 'pdf', 'csv', 'json'):
        return {"error": "Unsupported file format"}

    if suffix == 'docx':
        raw_text = extract_text_docx(file_path)
    elif suffix == 'pdf':
        raw_text = extract_text_pdf(file_path)
    elif suffix == 'csv':
        raw_text = extract_text_csv(file_path)
    else:  # only 'json' remains
        raw_text = extract_text_json(file_path)

    return extract_entities(raw_text)

# Example usage: run the pipeline on one document and print the entity dict.
# NOTE: 'path_to_your_file.pdf' is a placeholder; point it at a real file
# before running this cell, otherwise opening the file will fail.
file_path = 'path_to_your_file.pdf'  # Replace with your file path
result = process_file(file_path)
print(result)


Version with NLP analysis (NLTK preprocessing plus spaCy NER)

In [None]:
import nltk

# NLTK resources used by the preprocessing cell below:
#   punkt / punkt_tab -> word_tokenize (newer NLTK releases look up punkt_tab)
#   stopwords         -> stopwords.words("english")
#   wordnet / omw-1.4 -> WordNetLemmatizer; without the WordNet corpus,
#                        lemmatize() raises LookupError
# The tagger, chunker and words resources are kept from the original list.
nltk.download([
    'punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4',
    'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words',
])


In [None]:
import docx
import pandas as pd
import PyPDF2
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ne_chunk, pos_tag
from nltk.tree import Tree
import spacy

# Initialize spaCy for NER (Named Entity Recognition).
# Loaded once at module level; extract_entities() below calls this pipeline.
nlp = spacy.load("en_core_web_sm")

# Initialize NLTK tools shared by preprocess_text():
# - stop_words: English stopwords as a set, checked against lower-cased tokens
# - stemmer / lemmatizer: one reusable instance each
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Helper function for reading files
def read_file(file_path):
    """Read a DOCX, CSV, PDF or JSON file and return its text content.

    The file type is taken from the lower-cased text after the last ``.`` in
    ``file_path``.

    Args:
        file_path: Path string of the file to read.

    Returns:
        str: For DOCX, the paragraphs joined by newlines; for CSV, the
        non-missing values of the first column joined by spaces; for PDF, the
        extracted text of each page joined by newlines; for JSON, the value of
        the top-level ``"text"`` key (empty string when the key is absent, the
        value is null, or the document root is not an object).

    Raises:
        ValueError: If the extension is not one of docx, csv, pdf, json.
    """
    ext = file_path.split('.')[-1].lower()

    # Read DOCX files
    if ext == 'docx':
        doc = docx.Document(file_path)
        text = '\n'.join(para.text for para in doc.paragraphs)
    # Read CSV files (assuming the text is in the first column)
    elif ext == 'csv':
        df = pd.read_csv(file_path)
        text = ' '.join(df.iloc[:, 0].dropna().astype(str))
    # Read PDF files
    elif ext == 'pdf':
        with open(file_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Join pages with a newline so the last word of one page does not
            # fuse with the first word of the next; `or ''` tolerates a page
            # for which no text is returned.
            text = '\n'.join((page.extract_text() or '') for page in reader.pages)
    # Read JSON files (assuming JSON has a key 'text')
    elif ext == 'json':
        # "utf-8-sig" reads plain UTF-8 and also strips a leading BOM, instead
        # of depending on the platform's default encoding.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            data = json.load(file)
        # A list/scalar root has no .get(); treat it like a missing 'text' key.
        value = data.get('text', '') if isinstance(data, dict) else ''
        if value is None:
            text = ''
        elif isinstance(value, str):
            text = value
        else:
            # Always hand a string to the tokenizer / NER steps downstream.
            text = str(value)
    else:
        raise ValueError("Unsupported file type!")
    return text

# Tokenization, stopword removal, lemmatization, stemming
def preprocess_text(text):
    """Tokenize ``text``, drop English stopwords, then lemmatize and stem.

    Args:
        text: Raw text to preprocess.

    Returns:
        tuple: ``(lemmatized_tokens, stemmed_tokens)``, two lists of equal
        length. Both are derived from the same stopword-filtered token list;
        stemming is applied to the filtered tokens, not to the lemmas.
    """
    # Stopword membership is tested on the lower-cased token; the token itself
    # keeps its original casing.
    content_words = [tok for tok in word_tokenize(text) if tok.lower() not in stop_words]

    lemmas = list(map(lemmatizer.lemmatize, content_words))
    stems = list(map(stemmer.stem, content_words))

    return lemmas, stems

# Named Entity Recognition (NER)
def extract_entities(text):
    """Collect person names, places and dates from ``text`` using spaCy NER.

    Args:
        text: Raw text passed to the module-level ``nlp`` pipeline.

    Returns:
        dict: ``{'names': [...], 'places': [...], 'dates': [...]}`` holding the
        entity text of PERSON, GPE and DATE entities respectively, in document
        order. Entities with any other label are ignored.
    """
    # spaCy entity label -> key in the result dict
    label_to_key = {'PERSON': 'names', 'GPE': 'places', 'DATE': 'dates'}
    entities = {'names': [], 'places': [], 'dates': []}

    for span in nlp(text).ents:
        bucket = label_to_key.get(span.label_)
        if bucket is not None:
            entities[bucket].append(span.text)

    return entities

# Main function to upload file, process text and extract entities
def process_file(file_path):
    """Read a file, preprocess its text and extract named entities.

    Args:
        file_path: Path string of the file to analyse; passed to ``read_file``.

    Returns:
        dict: With keys ``'file_name'`` (the final path component),
        ``'entities'`` (result of ``extract_entities`` on the raw text),
        ``'lemmatized_tokens'`` and ``'stemmed_tokens'`` (results of
        ``preprocess_text``).

    Raises:
        ValueError: Propagated from ``read_file`` for unsupported file types.
    """
    import os  # local import: only needed for the base-name computation below

    # Read the file content
    text = read_file(file_path)

    # Preprocess the text
    lemmatized_tokens, stemmed_tokens = preprocess_text(text)

    # Extract named entities (on the raw text, not on the filtered tokens)
    entities = extract_entities(text)

    # os.path.basename honours the platform's path separators, so Windows
    # paths written with backslashes are reduced to the file name as well,
    # which splitting on '/' alone did not do.
    return {
        'file_name': os.path.basename(file_path),
        'entities': entities,
        'lemmatized_tokens': lemmatized_tokens,
        'stemmed_tokens': stemmed_tokens
    }

# Example usage: read one document, preprocess it and print the result dict
# (file name, entities, lemmatized tokens, stemmed tokens).
# NOTE: 'path_to_your_file.pdf' is a placeholder; point it at a real file
# before running this cell, otherwise opening the file will fail.
file_path = 'path_to_your_file.pdf'  # Replace with your file path
result = process_file(file_path)
print(result)
