In [19]:
import re
import json
import requests
import numpy as np
import pandas as pd
from tika import parser
import matplotlib.pyplot as plt

%matplotlib inline

In [20]:
# Shell escape: list the contents of the samples/ directory used by the cells below.
!ls samples

'20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx'
 20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf
 alice_in_wonderland.txt
 bloomberg.csv
 Canada-fiscalRefTable-trf-10-eng.xls
 espanol.txt
 icechat.eml
'Re Aixeon-HDS testing.msg'
'Thinking In C++ 2nd Edition.rtf'
 workers-and-capital.epub


### MIME Types

https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types

# TEXT

## Get Metadata

In [21]:
# PUT the plain-text sample to the /meta endpoint and print the response body.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'text/plain', 'Alice': file}
# Use a context manager so the file handle is closed as soon as the upload
# finishes instead of being left open for the life of the kernel.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

X-TIKA:Parsed-By,org.apache.tika.parser.DefaultParser,org.apache.tika.parser.csv.TextAndCSVParser
Content-Encoding,ISO-8859-1
language,en
Content-Type-Override,text/plain
Content-Type,text/plain; charset=ISO-8859-1



## Return txt as txt

### Note the headers content-type and accept

In [22]:
# PUT the plain-text sample to /tika, asking for a text/plain response,
# and print characters 200-499 of the returned text.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'text/plain', 'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[200:500])


       CHAPTER I

                      Down the Rabbit-Hole


  Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and what is th


### Return as JSON. Note the Tika URL

In [23]:
# PUT the plain-text sample to /tika/text with Accept: application/json,
# then print characters 200-499 of the 'X-TIKA:content' field of the JSON reply.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/tika/text"
headers = {'Content-type': 'text/plain', 'Alice': file, 'Accept': 'application/json'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(json.loads(r.text)['X-TIKA:content'][200:500])

                CHAPTER I

                      Down the Rabbit-Hole


  Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and w


## Identify MIME/media type

In [24]:
# PUT the plain-text sample to /detect/stream and print the response body
# (no Content-type is sent, so the server has to work it out from the bytes).
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

text/plain


In [25]:
# PUT the PDF sample to /detect/stream and print the response body.
file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

application/pdf


In [26]:
# PUT the EPUB sample to /detect/stream and print the response body.
file = "samples/workers-and-capital.epub"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

application/epub+zip


## Detect Language

In [27]:
# PUT the plain-text sample to /language/stream and print the response body.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/language/stream"
headers = {'Content-type': 'text/plain', 'Alice': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

en


## Language Translation

**Note: this doesn't seem to be working and is not clearly documented.**

In [28]:
# PUT the English sample to the /translate/all endpoint, naming the
# Lingo24Translator class and the en -> es language pair in the URL.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/translate/all/org.apache.tika.language.translate.Lingo24Translator/en/es"
headers = {'Content-type': 'application/octet-stream', 'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request,
# including when the server answers with an error status.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
# Printing the Response object shows only its status code, not the body.
print(r)

<Response [500]>


In [29]:
# PUT the English sample to the /translate/all endpoint, naming the
# GoogleTranslator class and the en -> es language pair in the URL.
file = "samples/alice_in_wonderland.txt"
tika_url = "http://localhost:9998/translate/all/org.apache.tika.language.translate.GoogleTranslator/en/es"
headers = {'Content-type': 'application/octet-stream', 'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request,
# including when the server answers with an error status.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
# Printing the Response object shows only its status code, not the body.
print(r)

<Response [500]>


In [39]:
# PUT the Spanish sample to the /translate/all endpoint, naming the
# RTGTranslator class with "x" and "eng" as the two language path segments.
file = "samples/espanol.txt"
tika_url = "http://localhost:9998/translate/all/org.apache.tika.language.translate.RTGTranslator/x/eng"
headers = {'Content-type': 'text/plain', 'espanol': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request,
# including when the server answers with an error status.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
# Printing the Response object shows only its status code, not the body.
print(r)

<Response [500]>


In [31]:
#!cat samples/espanol.txt

# PDF

## Get Metadata

In [None]:
# # from tika import parser # pip install tika
# file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
# raw = parser.from_file(file)
# content = raw['content'].strip()
# print(content[500:1000])

In [None]:
# PUT the PDF sample to the /meta endpoint and print the response body.
file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/pdf', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

## Return pdf as txt

### Note the headers content-type and accept

In [None]:
# PUT the PDF sample to /tika, asking for a text/plain response,
# and print the first 1000 characters of the returned text.
file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/pdf', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[:1000])

In [None]:
# PUT the PDF sample to /detect/stream and print the response body.
file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# Parse the PDF sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print characters 500-999 of its text.
file = "samples/20160219-APAC-AUSTRALIA-VICTRIA-MELBOURNE-ICT-HyperConverged-ModerationIT-Interview.pdf"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[500:1000])

# DOCX

## Metadata

In [None]:
# PUT the DOCX sample to the /meta endpoint and print the response body.
file = "samples/20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

## Get document type

In [None]:
# PUT the DOCX sample to /detect/stream and print the response body.
file = "samples/20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

## send doc format, get text

In [None]:
# PUT the DOCX sample to /tika, asking for a text/plain response,
# and print the first 1000 characters of the returned text.
file = "samples/20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[:1000])

## send doc format, get html

### /rmeta endpoint

In [None]:
# PUT the DOCX sample to /rmeta, decode the JSON reply, and display the first
# 1000 characters of 'X-TIKA:content' from the first element of the list.
file = "samples/20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx"
tika_url = "http://localhost:9998/rmeta"
headers = {'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
# Bare last expression so the notebook renders the value as the cell output.
json.loads(r.text)[0]['X-TIKA:content'][:1000]

## get text from docs using the tika python module

In [None]:
# Parse the DOCX sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print characters 500-999 of its text.
# NOTE(review): this cell previously pointed at the PDF sample, which looked
# like a copy-paste slip inside the DOCX section — confirm the DOCX file is
# the intended input.
file = "samples/20140910-APAC-AUSTRALIA-ACT-CANBERRA-RESEARCH-Storage Beyond Panama-CSIRO-Transcript.docx"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[500:1000])

# CSV

In [None]:
# Shell escape: list the contents of the samples/ directory.
!ls samples

In [None]:
# PUT the CSV sample to /detect/stream and print the response body.
file = "samples/bloomberg.csv"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the CSV sample to the /meta endpoint and print the response body.
file = "samples/bloomberg.csv"
tika_url = "http://localhost:9998/meta"
# NOTE(review): 'text/plain' is declared for a .csv file; 'text/csv' may be
# the intended type — confirm before changing.
headers = {'Content-type': 'text/plain', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the CSV sample to /tika, asking for a text/plain response, and print
# the whole returned text.
file = "samples/bloomberg.csv"
tika_url = "http://localhost:9998/tika"
# NOTE(review): 'text/plain' is declared for a .csv file; 'text/csv' may be
# the intended type — confirm before changing.
headers = {'Content-type': 'text/plain', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# Parse the CSV sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print all of its extracted text.
file = "samples/bloomberg.csv"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content)

# XLS

In [None]:
# PUT the XLS sample to /detect/stream and print the response body.
file = "samples/Canada-fiscalRefTable-trf-10-eng.xls"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the XLS sample to the /meta endpoint and print the response body.
file = "samples/Canada-fiscalRefTable-trf-10-eng.xls"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/vnd.ms-excel', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the XLS sample to /tika, asking for a text/plain response, and print
# characters 22-999 of the returned text.
file = "samples/Canada-fiscalRefTable-trf-10-eng.xls"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/vnd.ms-excel', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[22:1000])

In [None]:
# Parse the XLS sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print characters 22-999 of its text.
file = "samples/Canada-fiscalRefTable-trf-10-eng.xls"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[22:1000])

# Email

In [None]:
# Shell escape: list the contents of the samples/ directory.
!ls samples

In [None]:
# PUT the .eml sample to /detect/stream and print the response body.
file = "samples/icechat.eml"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the .msg sample to /detect/stream and print the response body.
file = "samples/Re Aixeon-HDS testing.msg"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the .eml sample (declared as message/rfc822) to /meta and print the
# response body.
file = "samples/icechat.eml"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'message/rfc822', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the .msg sample (declared as application/vnd.ms-outlook) to /meta and
# print the response body.
file = "samples/Re Aixeon-HDS testing.msg"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/vnd.ms-outlook', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the .eml sample to /tika, asking for a text/plain response, and print
# the whole returned text.
file = "samples/icechat.eml"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'message/rfc822', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the .msg sample to /tika, asking for a text/plain response, and print
# the first 250 characters of the returned text.
file = "samples/Re Aixeon-HDS testing.msg"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/vnd.ms-outlook', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[:250])

In [None]:
# Parse the .eml sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print all of its extracted text.
file = "samples/icechat.eml"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content)

In [None]:
# Parse the .msg sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print the first 200 characters.
file = "samples/Re Aixeon-HDS testing.msg"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[:200])

# RTF

In [None]:
# Shell escape: list the contents of the samples/ directory.
!ls samples

In [None]:
# PUT the RTF sample to /detect/stream and print the response body.
file = "samples/Thinking In C++ 2nd Edition.rtf"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the RTF sample to the /meta endpoint and print the response body.
file = "samples/Thinking In C++ 2nd Edition.rtf"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/rtf', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the RTF sample to /tika, asking for a text/plain response, and print
# the first 2000 characters of the returned text.
file = "samples/Thinking In C++ 2nd Edition.rtf"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/rtf', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[:2000])

In [None]:
# Parse the RTF sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print the first 2000 characters.
file = "samples/Thinking In C++ 2nd Edition.rtf"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[:2000])

# EPUB

In [None]:
# Shell escape: list the contents of the samples/ directory.
!ls samples

In [None]:
# PUT the EPUB sample to /detect/stream and print the response body.
file = "samples/workers-and-capital.epub"
tika_url = "http://localhost:9998/detect/stream"
headers = {'Alice': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the EPUB sample to the /meta endpoint and print the response body.
file = "samples/workers-and-capital.epub"
tika_url = "http://localhost:9998/meta"
headers = {'Content-type': 'application/epub+zip', 'interview': file}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text)

In [None]:
# PUT the EPUB sample to /tika, asking for a text/plain response, and print
# characters 3000-5999 of the returned text.
file = "samples/workers-and-capital.epub"
tika_url = "http://localhost:9998/tika"
headers = {'Content-type': 'application/epub+zip', 'interview': file, 'Accept': 'text/plain'}
# Context manager guarantees the file handle is closed after the request.
with open(file, 'rb') as fh:
    r = requests.put(tika_url, data=fh, headers=headers)
print(r.text[3000:6000])

In [None]:
# Parse the EPUB sample with the tika Python module (`parser` is imported in
# the first cell; `pip install tika`) and print characters 3000-5999.
file = "samples/workers-and-capital.epub"
raw = parser.from_file(file)
# 'content' can be None when no text was extracted; fall back to an empty
# string so .strip() does not raise AttributeError.
content = (raw['content'] or '').strip()
print(content[3000:6000])