In [None]:
!pip -q install txtai[all] sacremoses > /dev/null

In [1]:
!pip -q install txtai[pipeline,workflow] sacremoses > /dev/null

In [None]:
# Get test data
!wget -N https://github.com/neuml/txtai/releases/download/v2.0.0/tests.tar.gz
!tar -xvzf tests.tar.gz

In [3]:
%%capture
from txtai.pipeline import Summary, Textractor, Transcription, Translation

# Summary instance
summary = Summary()

# Text extraction
textractor = Textractor()

# Transcription instance
transcribe = Transcription("facebook/wav2vec2-large-960h")

# Create a translation instance
translate = Translation()

In [4]:
from txtai.workflow import Workflow, Task

# Sample sentences to run through the workflow
data = ["The sky is blue", "Forest through the trees"]

# Workflow with a single task that calls translate(..., "fr"), i.e. translates to French
workflow = Workflow([
    Task(lambda items: translate(items, "fr"))
])

In [5]:
# A workflow call returns a generator (for efficiency); unpack it into a list so the results display
[*workflow(data)]

Downloading (…)d-models/lid.176.ftz:   0%|          | 0.00/938k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

['Le ciel est bleu', 'Forêt à travers les arbres']

In [6]:
from txtai.workflow import FileTask

# Audio files to process
data = [
    "txtai/US_tops_5_million.wav",
    "txtai/Canadas_last_fully.wav",
    "txtai/Beijing_mobilises.wav",
    "txtai/The_National_Park.wav",
    "txtai/Maine_man_wins_1_mil.wav",
    "txtai/Make_huge_profits.wav",
]

# Two chained tasks:
#   1. transcribe inputs selected by the \.wav$ pattern
#   2. pass the result to translate(..., "fr"), i.e. translate to French
tasks = [
    FileTask(transcribe, r"\.wav$"),
    Task(lambda items: translate(items, "fr")),
]

# Workflow that transcribes audio and translates the text to French
workflow = Workflow(tasks)

# Run the workflow and read the generator into a list for display
list(workflow(data))

["Les cas de virus U sont en tête d'un million",
 "La dernière plate-forme de glace entièrement intacte du Canada s'est soudainement effondrée en formant un berge de glace de taille manhatten",
 "Bagage mobilise l'invasion kraft le long des côtes à mesure que les tensions tiwanaises s'épelent",
 "Le service des parcs nationaux met en garde contre le sacrifice d'amis plus lents dans une attaque nue",
 "L'homme banni gagne du billet de loterie",
 "Faire d'énormes profits sans travailler faire jusqu'à cent mille dollars par jour"]

In [None]:
from txtai.embeddings import Embeddings, Documents
from txtai.workflow import FileTask, WorkflowTask

# Embeddings index using a multilingual sentence-transformers model, configured with "content": True
embeddings = Embeddings({"path": "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", "content": True})

# Collector that the final workflow task adds to; later passed to embeddings.index()
documents = Documents()

# List of files to process (one PDF article plus the audio clips)
files = [
  "txtai/article.pdf",
  "txtai/US_tops_5_million.wav",
  "txtai/Canadas_last_fully.wav",
  "txtai/Beijing_mobilises.wav",
  "txtai/The_National_Park.wav",
  "txtai/Maine_man_wins_1_mil.wav",
  "txtai/Make_huge_profits.wav"
]

# Wrap each file as an (index, path, None) tuple, using its list position as the first element
data = [(x, element, None) for x, element in enumerate(files)]

# Workflow that extracts text and builds a summary
articles = Workflow([
    FileTask(textractor),
    Task(summary)
])

# Define workflow tasks. Workflows can also be tasks!
# NOTE(review): the PDF pattern has a leading "." so it needs at least one character
# before ".pdf", unlike the \.wav$ pattern; harmless for the paths listed above.
tasks = [
    WorkflowTask(articles, r".\.pdf$"),
    FileTask(transcribe, r"\.wav$"),
    Task(lambda x: translate(x, "fr")),
    Task(documents.add, unpack=False)
]

# Workflow that summarizes PDFs, transcribes audio, translates the text to French
# and collects the results into `documents`
workflow = Workflow(tasks)

try:
  # Run workflow and show results to be indexed
  for x in workflow(data):
    print(x)

  # Build the embeddings index
  embeddings.index(documents)
finally:
  # Cleanup temporary storage, even when the workflow run or indexing raises
  documents.close()

In [8]:

from txtai.pipeline import Sequences
from txtai.workflow import TemplateTask, Workflow

# Sequences pipeline using the google/flan-t5-large model
sequences = Sequences("google/flan-t5-large")

# Workflow inputs: each dict supplies the {statement} and {language} fields of the first template
inputs = [
    {"statement": "Hello, how are you", "language": "French"},
    {"statement": "Hallo, wie geht's dir", "language": "French"},
]

# Chain two prompt templates, both run by the same sequences pipeline.
# The second template presumably receives the first task's output as {text};
# confirm against the TemplateTask documentation.
workflow = Workflow([
    TemplateTask(
        template="Translate '{statement}' to {language} if it's English",
        action=sequences,
    ),
    TemplateTask(
        template="What language is the following text? {text}",
        action=sequences,
    ),
])

# Run the workflow and print the collected results
print(list(workflow(inputs)))

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

['French', 'German']
