# Settings

In [98]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import TreebankWordTokenizer
import warnings
warnings.filterwarnings('ignore')
nltk.download('punkt')        # once is enough
nltk.download('stopwords')    # once is enough
# stopwords.words('english')[0:10]

text = 'Statistics skills, and programming skills are equally important for analytics. Statistics skills, and domain knowledge are important for analytics. I like reading books and travelling.'
text

[nltk_data] Downloading package punkt to /Users/hwan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/hwan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


'Statistics skills, and programming skills are equally important for analytics. Statistics skills, and domain knowledge are important for analytics. I like reading books and travelling.'

# Processing

In [99]:
# Function to remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Function to remove punctuations
def remove_punctuations(text):
    words = nltk.word_tokenize(text)
    punt_removed= [w for w in words if w.lower() not in string.punctuation]
    return " ".join(punt_removed)

# Function to remove stop words
def remove_stopwords(text, lang='english'):
    words = nltk.word_tokenize(text)
    lang_stopwords= stopwords.words(lang)
    stopwords_removed= [w for w in words if w.lower() not in lang_stopwords]
    return " ".join(stopwords_removed)

# Function to remove whitespace
def remove_whitespace(text):
    return " ".join(text.split())

In [100]:
def process_text(text, lang='english'):
    text = re.sub(r'\d+', '', text)
    words = nltk.word_tokenize(text)
    punt_removed= [w for w in words if w.lower() not in string.punctuation]
    text = " ".join(punt_removed)
    words = nltk.word_tokenize(text)
    lang_stopwords= stopwords.words(lang)
    stopwords_removed= [w for w in words if w.lower() not in lang_stopwords]
    text = " ".join(stopwords_removed)
    return " ".join(text.split())    

In [101]:
process_text(text_processed)

'statistics skills programming skills equally important analytics statistics skills domain knowledge important analytics like reading books travelling'

# Tokenize

In [105]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(text_processed)[0:5]

['statistics', 'skills', 'programming', 'skills', 'equally']

# (REF.) Small Functions

#### Tokenize

In [88]:
import nltk
from nltk.tokenize import sent_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/hwan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Create text and tokenize
text = 'Statistics skills, and programming skills are equally important for analytics. Statistics skills, and domain knowledge are important for analytics. I like reading books and travelling.'
sent_tokenize_list = sent_tokenize(text)
sent_tokenize_list

['Statistics skills, and programming skills are equally important for analytics.',
 'Statistics skills, and domain knowledge are important for analytics.',
 'I like reading books and travelling.']

In [10]:
# Load libraries
import nltk
from nltk.tokenize import TreebankWordTokenizer

# Create text and tokenize
text='Statistics skills, and programming skills are equally important for analytics. Statistics skills, and domain knowledge are important for analytics. I like reading books and travelling.'
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))

['Statistics', 'skills', ',', 'and', 'programming', 'skills', 'are', 'equally', 'important', 'for', 'analytics.', 'Statistics', 'skills', ',', 'and', 'domain', 'knowledge', 'are', 'important', 'for', 'analytics.', 'I', 'like', 'reading', 'books', 'and', 'travelling', '.']


#### processing

In [51]:
from nltk.corpus import stopwords
stopwords.words('english')[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [31]:
nltk.download('punkt')        # once is enough

[nltk_data] Downloading package punkt to /Users/hwan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [32]:
nltk.download('stopwords')    # once is enough

[nltk_data] Downloading package stopwords to /Users/hwan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
# Load libraries
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stopwords.words('english')

# nltk.download('punkt')        # once is enough
# nltk.download('stopwords')    # once is enough

# Create text
text = "This is a sample English sentence with 3 whitespaces , carriage return \n number 1234, tab \t, stop words and punctuations!"
text 

'This is a sample English sentence with 3 whitespaces , carriage return \n number 1234, tab \t, stop words and punctuations!'

In [47]:
# Function to remove numbers
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# Remove numbers
remove_numbers(text)

'This is a sample English sentence with  whitespaces , carriage return \n number , tab \t, stop words and punctuations!'

In [48]:
# Function to remove punctuations
def remove_punctuations(text):
    words = nltk.word_tokenize(text)
    punt_removed= [w for w in words if w.lower() not in string.punctuation]
    return " ".join(punt_removed)

# Remove punctuations
remove_punctuations(text)

'This is a sample English sentence with 3 whitespaces carriage return number 1234 tab stop words and punctuations'

In [49]:
# Function to remove stop words
def remove_stopwords(text, lang='english'):
    words = nltk.word_tokenize(text)
    lang_stopwords= stopwords.words(lang)
    stopwords_removed= [w for w in words if w.lower() not in lang_stopwords]
    return " ".join(stopwords_removed)

# Remove stop words
remove_stopwords(text)

'sample English sentence 3 whitespaces , carriage return number 1234 , tab , stop words punctuations !'

In [50]:
# Function to remove whitespace
def remove_whitespace(text):
    return " ".join(text.split())

# Remove white space including space, tab and carriage return
remove_whitespace(text)

'This is a sample English sentence with 3 whitespaces , carriage return number 1234, tab , stop words and punctuations!'

In [56]:
text_processed = remove_whitespace(text)
text_processed

'This is a sample English sentence with 3 whitespaces , carriage return number 1234, tab , stop words and punctuations!'