<a href="https://colab.research.google.com/github/kumar045/Assignment-For-Filed/blob/main/NERANDTEXTCLEANING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [6]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

def process(text, prompt, threshold=0.5):
    input_ = f"{prompt}\n{text}"
    results = nlp(input_)
    processed_results = []
    prompt_length = len(prompt)
    for result in results:
        if result['score'] < threshold:
            continue
        start = result['start'] - prompt_length
        if start < 0:
            continue
        end = result['end'] - prompt_length
        span = text[start:end]
        processed_result = {
            'span': span,
            'start': start,
            'end': end,
            'score': result['score']
        }
        processed_results.append(processed_result)
    return processed_results

tokenizer = AutoTokenizer.from_pretrained("knowledgator/UTC-DeBERTa-small")
model = AutoModelForTokenClassification.from_pretrained("knowledgator/UTC-DeBERTa-small")
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy='first')

# NER
prompt = """Identify the following entity classes in the text:
computer

Text:
"""
text = """Apple was founded as Apple Computer Company on April 1, 1976, by Steve Wozniak, Steve Jobs (1955–2011) and Ronald Wayne to develop and sell Wozniak's Apple I personal computer.
It was incorporated by Jobs and Wozniak as Apple Computer, Inc. in 1977. The company's second computer, the Apple II, became a best seller and one of the first mass-produced microcomputers.
Apple went public in 1980 to instant financial success."""

results_ner = process(text, prompt)
print("NER",results_ner)

# QA
prompt_qa = "Who founded Apple?"
text_qa = "Apple was founded by Steve Jobs and Steve Wozniak."
results_qa = process(text_qa, prompt_qa)
print("QA",results_qa)

# Relation Extraction
rex_prompt = """
Identify target entity given the following relation: "{}" and the following source entity: "{}"

Text:
"""
text_rex = "Steve Jobs founded Apple."
entity_rex = "Steve Jobs"
relation_rex = "founded"
prompt_rex = rex_prompt.format(relation_rex, entity_rex)
results_rex = process(text_rex, prompt_rex)
print("Relation Extraction",results_rex)

# Text Cleaning
prompt_clean = """Clean the following text extracted from the web matching not relevant parts:"""
text_clean = "Here is some text with irrelevant information. Delete this part."
results_clean = process(text_clean, prompt_clean)
print("Text Cleaning",results_clean)

# Summarization
prompt_summ = "Summarize the following text, highlighting the most important sentences:"
text_summ = "Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne."
results_summ = process(text_summ, prompt_summ)
print("Summarise",results_summ)

# Coreference Resolution
# The provided article does not specify a coreference resolution example using this model. Typically, specialized models or tools are used for coreference resolution.

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


NER [{'span': 'Apple I ', 'start': 150, 'end': 158, 'score': 0.54925555}, {'span': 'Apple II, ', 'start': 285, 'end': 295, 'score': 0.70257914}]
QA [{'span': 'Steve Jobs ', 'start': 21, 'end': 32, 'score': 0.9088955}, {'span': 'Steve Wozniak.', 'start': 36, 'end': 51, 'score': 0.6151945}]
Relation Extraction [{'span': 'Apple.', 'start': 19, 'end': 26, 'score': 0.946839}]
Text Cleaning [{'span': 'Delete ', 'start': 47, 'end': 54, 'score': 0.5665469}]
Summarise [{'span': 'Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne.', 'start': 6, 'end': 71, 'score': 0.58014005}]
