<img src="http://bigdataexperience.org/BX/wp-content/uploads/2016/05/BX-FULL.png" width="200" height="200" alt="Big Data Experience Center, King Mongkut's University of Technology Thonburi">

# <center>Text Mining</center>
# <center>Module 2 - Text Preprocessing</center>
---

# 1. Loading data

In [None]:
import nltk

# Fetch the Punkt tokenizer data used by nltk.word_tokenize.
nltk.download('punkt')

# Read the whole corpus into memory; the context manager closes the
# file handle as soon as the read finishes.
with open('harry_7books.txt', encoding='utf8') as f:
    raw = f.read()

# Split the raw text into word and punctuation tokens.
tokens = nltk.word_tokenize(raw)

In [None]:
# Peek at 100 consecutive tokens (indices 1000-1099) to inspect the tokenizer output.
print(tokens[1000:1100])

# 2. Regular expression

In [None]:
# Fold every token to lowercase so the patterns below only need lowercase letters.
tokens = list(map(str.lower, tokens))

In [None]:
import re

# Tokens that end in "ed".
[tok for tok in tokens if re.search(r'ed$', tok)]

In [None]:
# Eight-character tokens with "j" in third position and "t" in sixth position;
# each "." stands for any single character.
[tok for tok in tokens if re.search(r'^..j..t..$', tok)]

In [None]:
# "spo" + any one character + "t", with an optional trailing "s".
[tok for tok in tokens if re.search(r'^spo.ts?$', tok)]

In [None]:
# Tokens whose first character is a letter from "b" through "f".
[tok for tok in tokens if re.search(r'^[b-f]', tok)]

In [None]:
# Three-character tokens: first from {b, d, f}, second from {a, u, e},
# third from {r, t, s}.
[tok for tok in tokens if re.search(r'^[bdf][aue][rts]$', tok)]

In [None]:
# Distinct tokens ending in "n" followed by at least two "o"s ("+" = one or more).
{tok for tok in tokens if re.search(r'noo+$', tok)}

In [None]:
# Distinct tokens ending in "n" followed by at least one "o" ("*" = zero or more).
{tok for tok in tokens if re.search(r'noo*$', tok)}

In [None]:
# Tokens ending in one or more literal periods.
# A raw string keeps "\." as a regex escape instead of an invalid Python
# string escape, which triggers a warning on recent Python versions.
[w for w in tokens if re.search(r'\.+$', w)]

In [None]:
# Tokens made only of letters followed by a single period (abbreviation-like).
# The tokens were lowercased earlier in the notebook, so the class must be
# [a-z]; an uppercase-only class [A-Z] could never match them.
# A raw string keeps "\." as a regex escape rather than a string escape.
[w for w in tokens if re.search(r'^[a-z]+\.$', w)]

In [None]:
# Tokens consisting of exactly four digits.
[tok for tok in tokens if re.search(r'^[0-9]{4}$', tok)]

In [None]:
# Download the Treebank sample and build its sorted set of distinct words.
nltk.download('treebank')
wsj = sorted(set(nltk.corpus.treebank.words()))

# Digits, a hyphen, then three to five lowercase letters.
[word for word in wsj if re.search(r'^[0-9]+-[a-z]{3,5}$', word)]

In [None]:
# Three hyphen-separated lowercase parts: 5+ letters, 2-3 letters, up to 6 letters.
[word for word in wsj if re.search(r'^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', word)]

In [None]:
# Distinct tokens ending in either "ed" or "ing".
{tok for tok in tokens if re.search(r'(ed|ing)$', tok)}

# 2. Regular expression tokenizer

## 2.1 Basic regex tokenizer

In [4]:
import nltk

# Sample strings for the tokenizer examples.
# s: multi-line text containing a price, double spaces and a blank line.
s = (
    "Good muffins cost $3.88\n"
    "in New York.  Please buy me\n"
    "two of them.\n"
    "\n"
    "Thanks."
)
# s2: a sentence pair with commas, a period and a question mark.
s2 = (
    "Alas, it has not rained today. "
    "When, do you think, will it rain again?"
)
# s3: text wrapped in simple HTML tags.
s3 = (
    "<p>Although this is <b>not</b> the case here, "
    "we must not relax our vigilance!</p>"
)

In [5]:
# Display the sample sentence before tokenizing it.
s2

'Alas, it has not rained today. When, do you think, will it rain again?'

In [6]:
# With gaps=True the pattern describes the separators: a punctuation mark plus
# any following whitespace. The returned tokens are the text between them.
nltk.regexp_tokenize(text=s2, pattern=r'[,\.\?!"]\s*', gaps=True)

['Alas',
 'it has not rained today',
 'When',
 'do you think',
 'will it rain again']

In [7]:
# Display the HTML sample before tokenizing it.
s3

'<p>Although this is <b>not</b> the case here, we must not relax our vigilance!</p>'

In [8]:
# With gaps=False the pattern describes the tokens themselves: opening or
# closing tags whose name is a single character.
nltk.regexp_tokenize(text=s3, pattern=r'</?.>', gaps=False)

['<p>', '<b>', '</b>', '</p>']

In [None]:
# Same pattern with gaps=True: the tags become separators and the text
# between them is returned.
nltk.regexp_tokenize(text=s3, pattern=r'</?.>', gaps=True)

## 2.2 Tokenization and removing all punctuation

In [None]:
# Keep only runs of word characters (\w+); everything else, including
# punctuation, is dropped from the token stream.
tokenizer = nltk.RegexpTokenizer(pattern=r"\w+")
tokens_0 = tokenizer.tokenize(raw)

# Inspect 100 consecutive tokens from the punctuation-free result.
print(tokens_0[2000:2100])

# 3. Text normalization

## 3.1 Transform to lowercase

In [None]:
# Lowercase every token, then display the resulting list.
tokens = [tok.lower() for tok in tokens]
tokens

## 3.2 Removing soft hyphens (\xad)

In [None]:
# Preview the clean-up on tokens starting with "gry": strip the soft hyphen
# (U+00AD) and show the distinct results.
# The tokens are lowercase at this point, so the prefix must be lowercase too;
# a capitalised prefix would never match and the set would always be empty.
{w.replace('\xad', '') for w in tokens if w.startswith('gry')}

In [None]:
# Strip the soft hyphen (U+00AD) from every token.
tokens_1 = [tok.replace('\xad', '') for tok in tokens]

## 3.3 Stemming

In [None]:
# Reduce each cleaned token to its Porter stem.
porter = nltk.PorterStemmer()
tokens_2 = list(map(porter.stem, tokens_1))

In [None]:
# Tokens before stemming (indices 1000-1099), for comparison with the stemmed slice.
tokens_1[1000:1100]

In [None]:
# The same index range after Porter stemming.
tokens_2[1000:1100]

## 3.4 Lemmatization

In [3]:
import nltk

# WordNet data (plus the Open Multilingual WordNet package) for the lemmatizer.
nltk.download('omw-1.4')
nltk.download('wordnet')
wnl = nltk.WordNetLemmatizer()

# Nouns are lemmatized with the noun part of speech; "running" is passed
# pos='v' so it is treated as a verb.
print("rocks :", wnl.lemmatize("rocks", pos='n'))
print("corpora :", wnl.lemmatize("corpora", pos='n'))
print("running :", wnl.lemmatize("running", pos='v'))

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\2543b\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2543b\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


rocks : rock
corpora : corpus
running : run


In [None]:
# Lemmatize the cleaned tokens with the lemmatizer's default part of speech
# (noun), so that tokens_3 is defined before it is inspected.
tokens_3 = [wnl.lemmatize(t) for t in tokens_1]

# Same index range as the stemming comparison.
tokens_3[1000:1100]

## 3.5 Remove stopwords

### 3.5.1 Stopword list

In [None]:
import nltk
from nltk.corpus import stopwords

# The stopword lists are a separate NLTK resource; fetch it before first use,
# as is done for the other NLTK resources in this notebook.
nltk.download('stopwords')

# A set gives fast membership tests during filtering.
stop_words = set(stopwords.words("english"))
print(stop_words)

### 3.5.2 Stopword removal

In [None]:
# Tokens before stopword removal (indices 2000-2099).
print(tokens[2000:2100])

In [None]:
# Keep only the tokens that are not in the stopword set.
tokens_4 = [tok for tok in tokens if tok not in stop_words]

# Indices 2000-2099 of the filtered list.
print(tokens_4[2000:2100])

## 3.6 POS Tagging

In [None]:
# Fetch the tagger resource, then tag the token list.
nltk.download('averaged_perceptron_tagger')
# The tagging result is stored in tokens_5.
tokens_5 = nltk.pos_tag(tokens)

In [None]:
# Inspect the tagging result for indices 2000-2099.
tokens_5[2000:2100]

# Activity

- Use the harry-potter dataset
- Explore the pattern
- Identify the cleaning points
- Conduct text cleaning
- Explore the word and character distribution
- Do you see different patterns from what you observed in the morning?

---
# Project

In this part, perform the following tasks:
1. Identify the data preparation issues
2. Perform data preprocessing

In [None]:
# work here