<a href="https://colab.research.google.com/github/linghduoduo/NLP/blob/master/Chap2_Traditional_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Removing Special Characters and Punctuation

In [7]:
import re
from num2words import num2words
import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from autocorrect import Speller


[nltk_data] Downloading package punkt to /Users/linghuang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/linghuang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/linghuang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:

# Preprocessing functions:
def decode(text, subject_pattern=r"(.*?)", body_pattern=r"(.*?)"):
  """
  Extract a subject line and a body text from the input and join them
  into a single string: the subject followed by ". ", then the body
  followed by ".".

  Newlines, carriage returns, tabs and hyphens are replaced by spaces
  first, so both patterns are searched in that normalized text.

  NOTE(review): the default patterns are kept exactly as they were. They
  contain no delimiters, so each one captures the empty string and the
  default result is ". ." for every input. The markers that presumably
  surrounded "(.*?)" look to have been lost - confirm the intended
  defaults; until then pass explicit patterns.

  Input:
    text: str
    subject_pattern: str, regex whose first capturing group is the subject line
    body_pattern: str, regex whose first capturing group is the body text
  Output: str
  """
  text = re.sub(r"[\n\r\t-]", " ", text)

  pieces = []
  for pattern, terminator in ((subject_pattern, ". "), (body_pattern, ".")):
    found = re.search(pattern, text, flags=re.S)
    if found:
      # An optional group that took no part in the match yields None.
      pieces.append((found.group(1) or "") + terminator)
  return "".join(pieces)

In [9]:
def digits_to_words(match):
  """
  Spell out a matched number in English words.

  A trailing two-letter ordinal suffix ("st", "nd", "rd" or "th") selects
  the ordinal form and is stripped before the conversion; anything else
  is converted as a cardinal.
  E.g. a match on "2" becomes "two", while a match on "2nd" becomes "second".

  Input: regex match object (only match[0], the matched text, is read)
  Output: str
  """
  # Lower-case here instead of relying on an earlier pipeline step:
  matched_text = match[0].lower()
  if matched_text.endswith(('st', 'nd', 'rd', 'th')):
    return num2words(matched_text[:-2], to='ordinal')
  return num2words(matched_text, to='cardinal')


In [10]:
def spelling_correction(text):
    """
    Pass every whitespace-separated word through an autocorrect Speller
    and join the results back together with single spaces.

    Input: str
    Output: str
    """
    fix_word = Speller()
    return " ".join(map(fix_word, text.split()))


In [11]:
def remove_stop_words(text):
    """
    Drop every whitespace-separated word found in NLTK's English stopword
    list and join the remaining words with single spaces.

    Matching is exact; no case folding is done here.

    Input: str
    Output: str
    """
    english_stopwords = set(stopwords.words('english'))
    kept_words = []
    for candidate in text.split():
        if candidate not in english_stopwords:
            kept_words.append(candidate)
    return " ".join(kept_words)

In [12]:
def stemming(text):
    """
    Apply the Porter stemmer to each whitespace-separated word and join
    the stems with single spaces.

    Input: str
    Output: str
    """
    porter = PorterStemmer()
    return " ".join(map(porter.stem, text.split()))


In [13]:
def lemmatizing(text):
    """
    Lemmatize each whitespace-separated word with the WordNet lemmatizer
    and join the lemmas with single spaces.

    Input: str
    Output: str
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in text.split():
        lemmas.append(wordnet_lemmatizer.lemmatize(word))
    return " ".join(lemmas)


In [14]:
# Preprocessing pipeline:
def preprocessing(input_text, printing=False):
  """
  This function represents a complete pipeline for text preprocessing.

  Steps, in order: decode, lower casing, digits to words, removal of
  punctuation/special characters, spelling correction, stop-word removal,
  stemming and lemmatizing.

  Code design note: The fact that we update variable "output" instead of
  creating new variables with new names as we go, allows us to change the
  order of the actions or add/remove actions easily.

  Input:
    input_text: str
    printing: bool, when True the intermediate result of every step is
      printed; when False (the default) nothing is printed.
  Output: str
  """
  def report(title, value):
    # Intermediate results are shown only when the caller asks for them.
    if printing:
      print(title, value)

  output = input_text
  # Decode/remove encoding:
  output = decode(output)
  report("\nDecode/remove encoding:\n        ", output)

  # Lower casing:
  output = output.lower()
  report("\nLower casing:\n        ", output)

  # Convert digits to words:
  # The regex matches consecutive digits optionally followed by a single
  # ordinal suffix, so digits_to_words never receives stacked suffixes
  # such as "1stnd":
  output = re.sub(r'\d+(?:st|nd|rd|th)?', digits_to_words, output, flags=re.IGNORECASE)
  report("\nDigits to words\n        ", output)

  # Remove punctuations and other special characters:
  output = re.sub('[^ A-Za-z0-9]+', '', output)
  report("\nRemove punctuations and other special characters\n        ", output)

  # Spelling corrections:
  output = spelling_correction(output)
  report("\nSpelling corrections:\n        ", output)

  # Remove stop words:
  output = remove_stop_words(output)
  report("\nRemove stop words:\n        ", output)

  # Stemming:
  output = stemming(output)
  report("\nStemming:\n        ", output)

  # Lemmatizing:
  output = lemmatizing(output)
  report("\nLemmatizing:\n        ", output)

  return output

### Corpora, Tokens and Types

In [4]:
##!python -m spacy download en_core_web_sm

In [4]:
import spacy
# Load the "en_core_web_sm" spaCy pipeline; later cells call it as `nlp`.
nlp = spacy.load("en_core_web_sm")

In [5]:
# Lower-case the sentence, run it through the spaCy pipeline and print
# every token as a plain string.
text = "Mary, don’t slap the green witch"
print([str(token) for token in nlp(text.lower())])

['mary', ',', 'do', 'n’t', 'slap', 'the', 'green', 'witch']


In [6]:
from nltk.tokenize import TreebankWordTokenizer
# Same sentence, this time tokenized with NLTK's Treebank tokenizer
# (the text is passed with its original casing).
text = "Mary, don’t slap the green witch"
tokenizer = TreebankWordTokenizer()
print(tokenizer.tokenize(text))

['Mary', ',', 'don’t', 'slap', 'the', 'green', 'witch']


In [7]:
from nltk.tokenize import TweetTokenizer
# Tokenize a lower-cased tweet that contains a hashtag, a mention and an
# emoticon with NLTK's TweetTokenizer.
tweet=u"Snow White and the Seven Degrees #MakeAMovieCold@midnight:-)"
tokenizer = TweetTokenizer()
print(tokenizer.tokenize(tweet.lower()))

['snow', 'white', 'and', 'the', 'seven', 'degrees', '#makeamoviecold', '@midnight', ':-)']


### Unigrams, Bigrams, Trigrams, ..., N-grams

In [8]:
def n_grams(text, n):
    '''
    Return every contiguous window of length n over text, in order.

    text may be any sliceable sequence with a length: a list of tokens
    gives a list of token lists, a string gives a list of substrings.
    A sequence shorter than n gives an empty list.

    Raises ValueError if n is smaller than 1, because the slicing below
    would otherwise produce empty or truncated windows instead of n-grams.
    '''
    if n < 1:
        raise ValueError("n must be a positive integer, got {!r}".format(n))
    return [text[i:i+n] for i in range(len(text)-n+1)]

# Hand-written token list used to demonstrate trigram (n=3) extraction.
cleaned = ['mary', ',', "n't", 'slap', 'green', 'witch', '.']
print(n_grams(cleaned, 3))

[['mary', ',', "n't"], [',', "n't", 'slap'], ["n't", 'slap', 'green'], ['slap', 'green', 'witch'], ['green', 'witch', '.']]


### Lemmas and Stems

In [9]:
# Print each token of the sentence next to the lemma spaCy assigns to it.
doc = nlp(u"he was running late")
for token in doc:
    print('{} --> {}'.format(token, token.lemma_))

he --> he
was --> be
running --> run
late --> late


### Categorizing Words: POS Tagging

In [10]:
# Print each token of the sentence next to its coarse part-of-speech tag.
doc = nlp(u"Mary slapped the green witch.")
for token in doc:
    print('{} - {}'.format(token, token.pos_))

Mary - PROPN
slapped - VERB
the - DET
green - ADJ
witch - NOUN
. - PUNCT


In [13]:
import spacy
from spacy import displacy

def extract_pos(input):
    """
    Parse the text with the "en_core_web_sm" spaCy pipeline, render its
    dependency tree in the notebook, and collect the tokens tagged as
    NOUN, VERB, ADJ or PROPN.

    The pipeline is loaded on every call.

    Input: str
    Output: list of [token text, POS tag] pairs, in document order
    """
    wanted_tags = ("NOUN", "VERB", "ADJ", "PROPN")
    pipeline = spacy.load("en_core_web_sm")
    parsed = pipeline(input)
    displacy.render(parsed, style='dep', jupyter=True, options={'compact': True, 'distance': 100})

    selected = []
    for tok in parsed:
        if tok.pos_ in wanted_tags:
            selected.append([tok.text, tok.pos_])
    return selected

# Placeholder text; the bare call on the last line lets the notebook
# display the returned pairs.
# NOTE(review): `doc` is a plain string here, while other cells bind the
# same name to a parsed spaCy document.
doc = "Your input text here"
extract_pos(doc)

[['input', 'NOUN'], ['text', 'NOUN']]

### Chunking and Named Entity Recognition

In [14]:
# Print every noun chunk spaCy finds in the sentence together with its label.
doc  = nlp(u"Mary slapped the green witch.")
for chunk in doc.noun_chunks:
    print ('{} - {}'.format(chunk, chunk.label_))

Mary - NP
the green witch - NP


In [15]:
def extract_companies(input):
  """
  Run the "en_core_web_sm" spaCy pipeline over the text, render the
  recognised entities in the notebook, and return the text of every
  entity labelled "ORG".

  The pipeline is loaded on every call.

  Input: str
  Output: list of str, in document order
  """
  pipeline = spacy.load("en_core_web_sm")
  parsed = pipeline(input)
  displacy.render(parsed, style="ent", jupyter=True)

  organisations = []
  for entity in parsed.ents:
    if entity.label_ == "ORG":
      organisations.append(entity.text)
  return organisations

In [16]:
# Extract the ORG entities from a sample sentence, then print spaCy's
# description of the "ORG" label followed by the extracted names.
text = "The companies that would be releasing their quarterly reports tomorrow are Microsoft, 4pm, Google, 4pm, and AT&T, 6pm."
companies = extract_companies(text)
print("\nThe definition of the label 'ORG': " + spacy.explain("ORG"))
print("Companies:", companies)


The definition of the label 'ORG': Companies, agencies, institutions, etc.
Companies: ['Microsoft', 'Google', 'AT&T']
