In [1]:
#The notebook explores the Unstructured Library 
#with the intention of using various ecosystem drivers

In [None]:
!pip install unstructured["local-inference"]==0.5.2 layoutparser

In [None]:
# Install Requirements
!apt-get -qq install poppler-utils tesseract-ocr
# Upgrade Pillow to latest version
!pip install -q --user --upgrade pillow

In [None]:
!apt-get -qq install libreoffice

In [None]:
!pip install -q "detectron2@git+https://github.com/facebookresearch/detectron2.git@v0.6#egg=detectron2"

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

In [None]:
!git clone https://github.com/Unstructured-IO/unstructured.git

In [6]:
!cp -r /content/unstructured/example-docs/ /content/

## Writing the examples to the content folder for easy access

In [1]:
import os
Example_dir = "/content/example-docs"

In [2]:
#basic partitioning

file_lpf = os.path.join(Example_dir,"layout-parser-paper-fast.pdf")

In [None]:
from unstructured.partition.auto import partition

#partition class
with open(file_lpf, "rb") as lpf:
  element_lpf = partition(file= lpf, 
                          include_page_breaks=True)

In [6]:
len(element_lpf)

24

In [4]:
from unstructured.partition.html import partition_html

url = "https://www.cnn.com/2023/01/30/sport/empire-state-building-green-philadelphia-eagles-spt-intl/index.html"


In [None]:
elements = partition_html(url=url)

print("\n\n".join([str(el) for el in elements]))

In [9]:
from unstructured.partition.ppt import partition_ppt

elements = partition_ppt(filename=Example_dir + "/fake-power-point.ppt")

In [None]:
elements

In [13]:
str(elements[4])

'Here is a lot of text!'

In [16]:
from unstructured.partition.image import partition_image

# Returns a List[Element] present in the pages of the parsed image document
elements = partition_image("example-docs/layout-parser-paper-fast.jpg")

In [None]:
!apt-get install pandocs

In [None]:
from unstructured.partition.epub import partition_epub

elements = partition_epub(filename="example-docs/winter-sports.epub")

In [21]:
from unstructured.partition.email import partition_email

with open("example-docs/fake-email.eml", "r") as f:
    text = f.read()
elements = partition_email(text=text, include_headers=True)

In [24]:
str(elements[5])

'Matthew Robinson: mrobinson@unstructured.io'

In [63]:
from unstructured.partition.text import partition_text

elements = partition_text(filename="example-docs/fake-text.txt")

with open("example-docs/fake-text.txt", "r") as f:
  elements = partition_text(file=f)

with open("example-docs/fake-text.txt", "r") as f:
  text = f.read()
elements = partition_text(text=text)

In [27]:
str(elements[0])

'This is a test document to use for unit tests.'

In [None]:
from unstructured.partition.text import partition_text
from unstructured.cleaners.core import group_broken_paragraphs


text = """The big brown fox
was walking down the lane.

At the end of the lane, the
fox met a bear."""

partition_text(text=text, 
               paragraph_grouper=group_broken_paragraphs)

In [29]:
from unstructured.cleaners.core import replace_unicode_quotes

replace_unicode_quotes("Philadelphia Eaglesâ\x80\x99 victory")

"Philadelphia Eagles' victory"

In [None]:
import re

remove_citations = lambda text: re.sub("\[\d{1,3}\]", "", text)

element = Text("[1] Geolocated combat footage has confirmed Russian gains in the Dvorichne area northwest of Svatove.")
element.apply(remove_citations)
print(element)

Applies clean_bullets if bullets=True.

Applies clean_extra_whitespace if extra_whitespace=True.

Applies clean_dashes if dashes=True.

Applies clean_trailing_punctuation if trailing_punctuation=True.

Lowercases the output if lowercase=True.

In [31]:
from unstructured.cleaners.core import clean

# Returns "an excellent point!"
clean("● An excellent point!", bullets=True, lowercase=True)

# Returns "ITEM 1A: RISK FACTORS"
clean("ITEM 1A:     RISK-FACTORS", extra_whitespace=True, dashes=True)

'ITEM 1A: RISK FACTORS'

In [32]:
from unstructured.cleaners.core import clean_bullets

# Returns "I love Morse Code! ●●●"
clean_bullets("I love Morse Code! ●●●")

'I love Morse Code! ●●●'

In [36]:
from unstructured.cleaners.core import clean_ordered_bullets

# Returns "This is a very important point"
clean_bullets("a.b This is a very important point")

'a.b This is a very important point'

In [38]:
from unstructured.cleaners.core import replace_unicode_quotes

# Returns "“A lovely quote!”"
replace_unicode_quotes("\x93A lovely quote!\x94")

'“A lovely quote!”'

In [39]:
from unstructured.cleaners.extract import extract_text_before

text = "Here I am! STOP Look at me! STOP I'm flying! STOP"

# Returns "Here I am!"
extract_text_before(text, r"STOP")

'Here I am!'

In [40]:
from unstructured.cleaners.extract import extract_text_after

text = "SPEAKER 1: Look at me, I'm flying!"

# Returns "Look at me, I'm flying!"
extract_text_after(text, r"SPEAKER \d{1}:")

"Look at me, I'm flying!"

In [41]:
from unstructured.cleaners.extract import extract_email_address

text = """Me me@email.com and You <You@email.com>
    ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""

# Returns "['me@email.com', 'you@email.com']"
extract_email_address(text)

['me@email.com', 'you@email.com']

In [42]:
from unstructured.cleaners.extract import extract_ip_address

text = """Me me@email.com and You <You@email.com>
  ([ba23::58b5:2236:45g2:88h2]) (10.0.2.01)"""

# Returns "['ba23::58b5:2236:45g2:88h2', '10.0.2.01']"
extract_ip_address(text)


['ba23::58b5:2236:45g2:88h2', '10.0.2.01']

In [43]:
from unstructured.cleaners.extract import extract_datetimetz

text = """from ABC.DEF.local ([ba23::58b5:2236:45g2:88h2]) by
  \n ABC.DEF.local2 ([ba23::58b5:2236:45g2:88h2%25]) with mapi id\
  n 32.88.5467.123; Fri, 26 Mar 2021 11:04:09 +1200"""

# Returns datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))
extract_datetimetz(text)

datetime.datetime(2021, 3, 26, 11, 4, 9, tzinfo=datetime.timezone(datetime.timedelta(seconds=43200)))

In [None]:
!pip install langdetect sentencepiece

In [None]:
from unstructured.cleaners.translate import translate_text

# Output is "I'm a Berliner!"
translate_text("Ich bin ein Berliner!")

In [51]:
##Staging

import json
from unstructured.staging.label_studio import stage_for_label_studio

output = stage_for_label_studio(elements)

In [52]:
print(json.dumps(output[:2], indent=4))

[
    {
        "data": {
            "text": "This is a test document to use for unit tests.",
            "ref_id": "1df8eeb8be847c3a1a7411e3be3e0396"
        }
    },
    {
        "data": {
            "text": "Doylestown, PA 18901",
            "ref_id": "a9d4657034aa3fdb5177f1325e912362"
        }
    }
]


In [56]:
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_dict
isd = convert_to_dict(elements)

In [57]:
isd

[{'element_id': '1df8eeb8be847c3a1a7411e3be3e0396',
  'coordinates': None,
  'text': 'This is a test document to use for unit tests.',
  'type': 'NarrativeText',
  'metadata': {}},
 {'element_id': 'a9d4657034aa3fdb5177f1325e912362',
  'coordinates': None,
  'text': 'Doylestown, PA 18901',
  'type': 'Address',
  'metadata': {}},
 {'element_id': '9c218520320f238595f1fde74bdd137d',
  'coordinates': None,
  'text': 'Important points:',
  'type': 'Title',
  'metadata': {}},
 {'element_id': '39a3ae572581d0f1fe7511fd7b3aa414',
  'coordinates': None,
  'text': 'Hamburgers are delicious',
  'type': 'ListItem',
  'metadata': {}},
 {'element_id': 'fc1adcb8eaceac694e500a103f9f698f',
  'coordinates': None,
  'text': 'Dogs are the best',
  'type': 'ListItem',
  'metadata': {}},
 {'element_id': '0b61e826b1c4ab05750184da72b89f83',
  'coordinates': None,
  'text': 'I love fuzzy blankets',
  'type': 'ListItem',
  'metadata': {}}]

In [58]:
from unstructured.staging.base import dict_to_elements

re_elements = dict_to_elements(isd)

In [60]:
str(re_elements[0])

'This is a test document to use for unit tests.'

In [64]:
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_csv

isd_csv = convert_to_csv(elements)

In [66]:
print(isd_csv)

type,text,element_id,coordinates,filename,page_number,url
NarrativeText,This is a test document to use for unit tests.,1df8eeb8be847c3a1a7411e3be3e0396,,,,
Address,"Doylestown, PA 18901",a9d4657034aa3fdb5177f1325e912362,,,,
Title,Important points:,9c218520320f238595f1fde74bdd137d,,,,
ListItem,Hamburgers are delicious,39a3ae572581d0f1fe7511fd7b3aa414,,,,
ListItem,Dogs are the best,fc1adcb8eaceac694e500a103f9f698f,,,,
ListItem,I love fuzzy blankets,0b61e826b1c4ab05750184da72b89f83,,,,



In [None]:
from unstructured.documents.elements import Title, NarrativeText
from unstructured.staging.base import convert_to_dataframe
df = convert_to_dataframe(elements)
df

In [None]:
!pip install transformers sentence-transformers huggingface

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

from unstructured.documents.elements import NarrativeText
from unstructured.staging.huggingface import stage_for_transformers

model_name = "hf-internal-testing/tiny-bert-for-token-classification"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

nlp = pipeline("ner", model=model, tokenizer=tokenizer)

In [71]:
text = """From frost advisories this morning to a strong cold front expected later this week, the chance of fall showing up is real.

There's a refreshing crispness to the air, and it looks to get only more pronounced as the week goes on.

Frost advisories were in place this morning across portions of the Appalachians and coastal Maine as temperatures dropped into the 30s.

Temperatures this morning were in the 40s as far south as the Florida Panhandle.

And Maine even had a few reports of their first snow of the season Sunday. More cities could see their first snow later this week.

Yes, hello fall!

As temperatures moderate during the next few days, much of the east will stay right around seasonal norms, but the next blast of cold air will be strong and come with the potential for hazardous conditions.

"A more active fall weather pattern is expected to evolve by the end of this week and continuing into the weekend as a couple of cold fronts move across the central and eastern states," the Weather Prediction Center said.

The potent cold front will come in from Canada with a punch of chilly air, heavy rain and strong wind.

The Weather Prediction Center has a slight risk of excessive rainfall for much of the Northeast and New England on Thursday, including places like New York City, Buffalo and Burlington, so we will have to look out for flash flooding in these areas.

"More impactful weather continues to look likely with confidence growing that our region will experience the first real fall-like system with gusty to strong winds and a period of moderate to heavy rain along and ahead of a cold front passage," the National Weather Service office in Burlington wrote.

The potential for very heavy rain could accompany the front, bringing up to two inches of rain for much of the area, and isolated locations could see even more.

"Ensembles [forecast models] show median rainfall totals by Wednesday night around a half inch, with a potential for some spots to see around one inch, our first substantial rainfall in at least a couple of weeks," the weather service office in Grand Rapids noted, adding, "It may also get cold enough for some snow to mix in Thursday night to Friday morning, especially in the higher terrain north of Grand Rapids toward Cadillac."

There is also a chance for very strong winds to accompany the system.

The weather service is forecasting winds of 30-40 mph ahead of the cold front, which could cause some tree limbs to fall and sporadic power outages.

Behind the front, temperatures will fall.

"East Coast, with highs about 5-15 degrees below average to close out the workweek and going into next weekend, with highs only in the 40s and 50s from the Great Lakes to the Northeast on most days," the Weather Prediction Center explained.

By the weekend, a second cold front will drop down from Canada and bring a reinforcing shot of chilly air across the eastern half of the country."""

In [72]:
chunks = stage_for_transformers([NarrativeText(text=text)], tokenizer)

In [73]:
results = [nlp(chunk) for chunk in chunks]

In [None]:
results

In [78]:
#Staging to multiple platforms

label_studio_data = stage_for_label_studio(elements)

In [79]:
label_studio_data

[{'data': {'text': 'This is a test document to use for unit tests.',
   'ref_id': '1df8eeb8be847c3a1a7411e3be3e0396'}},
 {'data': {'text': 'Doylestown, PA 18901',
   'ref_id': 'a9d4657034aa3fdb5177f1325e912362'}},
 {'data': {'text': 'Important points:',
   'ref_id': '9c218520320f238595f1fde74bdd137d'}},
 {'data': {'text': 'Hamburgers are delicious',
   'ref_id': '39a3ae572581d0f1fe7511fd7b3aa414'}},
 {'data': {'text': 'Dogs are the best',
   'ref_id': 'fc1adcb8eaceac694e500a103f9f698f'}},
 {'data': {'text': 'I love fuzzy blankets',
   'ref_id': '0b61e826b1c4ab05750184da72b89f83'}}]

In [84]:
from unstructured.staging.datasaur import stage_for_datasaur

datasaur_data = stage_for_datasaur(elements)

In [81]:
datasaur_data

[{'text': 'This is a test document to use for unit tests.', 'entities': []},
 {'text': 'Doylestown, PA 18901', 'entities': []},
 {'text': 'Important points:', 'entities': []},
 {'text': 'Hamburgers are delicious', 'entities': []},
 {'text': 'Dogs are the best', 'entities': []},
 {'text': 'I love fuzzy blankets', 'entities': []}]

In [86]:
from unstructured.staging.argilla import stage_for_argilla

argilla_dataset = stage_for_argilla(elements, "text_classification")

In [None]:
argilla_dataset