# Reading Text and Basic String Operations

In [None]:
import nltk

## Read Text Documents

### Electronic Books

In [None]:
from urllib.request import urlopen

# Plain-text file hosted on gutenberg.org.
url = "https://www.gutenberg.org/files/2554/2554-0.txt"
# The context manager closes the HTTP response as soon as the body is read,
# instead of leaving the connection open until garbage collection.
with urlopen(url) as response:
    raw = response.read()
# read() returns bytes; decode them to obtain a str.
decoded_raw = raw.decode("utf-8")
type(decoded_raw)

In [None]:
len(decoded_raw)

In [None]:
decoded_raw[:78]

### Dealing with HTML

In [None]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Close the HTTP response deterministically once the body has been read.
with urlopen(url) as response:
    html = response.read()
# `html` is not decoded here, so it is still bytes and the slice displays
# as a bytes literal.
html[:60]

### Processing RSS Feeds

In [None]:
import re
nltk.download('punkt')

def clean_html(html):
    """
    Remove HTML markup from the given string.

    Adapted from the NLTK package.

    :param html: the HTML string to be cleaned
    :type html: str
    :return: the text with ``<script>``/``<style>`` blocks, comments and
        tags removed, ``&nbsp;`` entities turned into spaces, every run of
        spaces collapsed to a single space, and surrounding whitespace
        stripped
    :rtype: str
    """

    # First we remove inline JavaScript/CSS (the whole element, content included):
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags. Each tag becomes a space so that
    # words separated only by markup do not get glued together.
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    # Collapse each run of two or more spaces in a single pass. Replacing
    # pairs of spaces twice only shrinks a run to a quarter of its length,
    # so runs of five or more spaces used to survive as double spaces.
    cleaned = re.sub(r" {2,}", " ", cleaned)
    return cleaned.strip()

In [None]:
import feedparser

# Download and parse the Atom feed.
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
print(llog['feed']['title'])
print(len(llog.entries))

# Inspect the third entry: its title and the start of its raw HTML body.
post = llog.entries[2]
print(post.title)
content = post.content[0].value
print(content[:70])

# Strip the markup from that same entry body, then tokenize the text.
nltk.word_tokenize(clean_html(content))

### Reading Local Files

In [None]:
# Read the whole file into one string; the `with` block closes the handle
# as soon as the read finishes.
with open('document.txt') as f:
    raw = f.read()

In [None]:
raw

In [None]:
# Iterate over the file line by line; `with` closes it when the loop ends.
with open('document.txt','r') as f:
    for line in f:
        # strip() removes surrounding whitespace, including the trailing
        # newline, so print() does not emit blank lines between rows.
        print(line.strip())

In [None]:
nltk.download('gutenberg')
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# Close the file handle as soon as the text has been read.
with open(path,'r') as moby_file:
    raw = moby_file.read()
# Printing the entire text would flood the notebook output, so show only
# the beginning; `raw` itself still holds the complete file.
print(raw[:500])

### Extracting Text from MSWord

In [None]:
import docx
document = docx.Document('document.docx')
docText = '\n'.join(
    paragraph.text for paragraph in document.paragraphs
)
print(docText)

## Capturing User Input

In [None]:
s = input("Enter some text:")

In [None]:
# print() already puts a space between its arguments, so passing " words."
# as a separate argument produced a double space; build one string instead.
print(f"You typed {len(nltk.word_tokenize(s))} words.")

## The NLP Pipeline

### Document

In [None]:
# Load the document as a single string, closing the file right after the read.
with open('document.txt') as f:
    raw = f.read()
print(type(raw))
print(raw)

### Words

In [None]:
tokens = nltk.word_tokenize(raw)
print(type(tokens))
print(tokens[:10])

In [None]:
words = [w.lower() for w in tokens]
print(words[:10])

### Vocab

In [None]:
vocab=sorted(set(words))
print(vocab[:10])

## Text Processing at the Lowest Level

### Basic Operation with Strings

In [None]:
monty = 'Monty Python'
print(monty)

In [None]:
monty_circus = "Monty Python's Flying Circus"
print(monty_circus)

In [None]:
monty_circus = 'Monty Python\'s Flying Circus'
print(monty_circus)

In [None]:
monty_circus = 'Monty Python's Flying Circus'
print(monty_circus)

In [None]:
couplet = "Shall I compare thee to a Summer's day?"\
          "Thou are more lovely and more temperate:"

print(couplet)

In [None]:
couplet = ("Shall I compare thee to a Summer's day?"
          "Thou are more lovely and more temperate:")
print(couplet)

### Multiline String

In [None]:
couplet = """
    Shall I compare thee to a Summer's day?
    Thou are more lovely and more temperate:
"""
print(couplet)

In [None]:
couplet = '''
    'Shall I compare thee to a Summer's day?'
    Thou are more lovely and more temperate:
'''
print(couplet)

### String Operations

In [None]:
very = 'very'+'very'+'very'
print(very)

In [None]:
very = 'very '*3
print(very)

In [None]:
very = ['very']*3
print(very)

In [None]:
a = [1, 2, 3, 4, 5, 6, 7, 6, 5, 4, 3, 2, 1]
b = [' ' * 2 * (7 - i) + 'very' * i for i in a]

In [None]:
# Print one row per iteration. Printing `b` here would dump the whole list
# on every pass instead of showing each padded row on its own line.
for line in b:
    print(line)

In [None]:
very = 'very'-'y'
print(very)

In [None]:
very = 'very'/2
print(very)

In [None]:
very = 'very'+2
print(very)

In [None]:
very = 'very'+str(2)
print(very)

### Printing Strings

In [None]:
print(monty)

In [None]:
grail = 'Holy Grail'
print(monty+grail)

In [None]:
print(monty,grail)

In [None]:
print(monty,"and the",grail)

### Accessing Individual Characters

In [None]:
for i,m in enumerate(monty):
    print("huruf",m,"muncul di index",i)

In [None]:
print(monty[0])

In [None]:
print(len(monty))

In [None]:
print(monty[12])

In [None]:
print(monty[-1])

In [None]:
print(monty[-12])

### Accessing Substrings

In [None]:
print(monty[0:1])

In [None]:
print(monty[0:5])

In [None]:
print(monty[0:20])

In [None]:
print(monty[0:])

In [None]:
print(monty[:12])

In [None]:
print(monty[:])

In [None]:
print(monty[:5])

In [None]:
print(monty[6:])

In [None]:
print(monty[-12:-6])

In [None]:
monty.find('thon')

In [None]:
monty.find('on')

In [None]:
monty.find('ton')

In [None]:
if 'on' in 'Monty Python':
    print('found "on"')

### More Operations on Strings

### find

In [None]:
monty.find('Monty')

In [None]:
monty.rfind('on')

In [None]:
monty.rfind('tom')

In [None]:
monty.index('Monty')

In [None]:
monty.index('tom')

In [None]:
monty.rindex('Monty')

In [None]:
monty.rindex('tom')

### Join

In [None]:
alphabet=[i for i in monty]
print(alphabet)

In [None]:
''.join(alphabet)

In [None]:
' '.join(alphabet)

### Split

In [None]:
monty.split() # split by space

In [None]:
monty.split('on') # split by string on

### Splitlines

In [None]:
print(couplet)

In [None]:
couplet.splitlines() # split by newline

### Convert Case

In [None]:
monty.lower() # converts every letter to lowercase

In [None]:
monty.upper() # converts every letter to uppercase

In [None]:
monty.title() # first letter of each word becomes uppercase, the rest lowercase

### Strip

In [None]:
lala = '  tinky wingky, dipsy, lala,  poo '
lala.strip()

### replace

In [None]:
lala.strip().replace('lala','jono')

### Check Whether a String Is Alphabetic

In [None]:
'a'.isalpha()

In [None]:
'1a'.isalpha()

In [None]:
'1a'.isalnum()

In [None]:
'1a-'.isalnum()

In [None]:
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())

In [None]:
fdist.keys()

In [None]:
fdist.values()

### The Difference Between Lists and Strings

In [None]:
band = 'The Beatles'
beatles = ['John', 'Paul', 'George', 'Ringo']

In [None]:
band[0]

In [None]:
beatles[0]

In [None]:
band[0:3]

In [None]:
beatles[0:3]

In [None]:
print(beatles[0]+' Lennon')

In [None]:
print(beatles + ' Lennon')

In [None]:
beatles = beatles+['Lennon']
print(beatles)

In [None]:
beatles[0] = 'John Lennon'
print(beatles)

In [None]:
del beatles[-1]

In [None]:
print(beatles)

In [None]:
band[0]='t'

### Exercise

Petunjuk:

Dari document.docx
- ambil kalimat hingga koma <,> kedua
- hitung frekuensi munculnya masing-masing huruf dari kalimat tersebut
- huruf apakah yang paling sering muncul?
- huruf l muncul berapa kali?
- hilangkan spasi dan koma <,> dari kalimat tersebut
- berapa panjang huruf untuk kalimat tersebut?
- apakah susunan huruf pada index ke 5 hingga ke 9?
- pada index keberapakah pertama kali muncul subset berikut <"lo"> ?

### Text Processing with Unicode

In [None]:
nltk.download('unicode_samples')

In [None]:
path = nltk.data.find('corpora/unicode_samples/polish-lat2.txt')

In [None]:
import codecs
f = codecs.open(path, encoding='latin2')

In [None]:
for line in f:
    line = line.strip()
    print(line)
    print(line.encode('unicode_escape'))

In [None]:
ord('ń')

In [None]:
nacute = u'\u0144'
nacute_utf = nacute.encode('utf8')
print(repr(nacute_utf))

In [None]:
import unicodedata

In [None]:
# Read every line into a list; the `with` block closes the file afterwards
# instead of leaving the handle open.
with codecs.open(path, encoding='latin2') as polish_file:
    lines = polish_file.readlines()

In [None]:
line = lines[0]

In [None]:
# For each character outside the ASCII range, show its UTF-8 bytes,
# its code point in hexadecimal and its official Unicode name.
for c in line:
    if ord(c) <= 127:
        continue
    print(f"{c.encode('utf8')!r} U+{ord(c):04x} {unicodedata.name(c)}")

In [None]:
line.find(u'Pa\u0144stwowa')

In [None]:
line = line.lower()

In [None]:
print(line)

In [None]:
# Use a raw string so that `\w` reaches the regex engine untouched; in a
# normal string literal it is an invalid escape sequence. The `re` module
# itself understands `\u0144` in str patterns, so the match is unchanged.
m = re.search(r'\u0144\w*', line)
print(m.group())

In [None]:
 nltk.word_tokenize(line)