# Data Pre-processing

Text data cleaning

In [24]:
import re
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer,SnowballStemmer

In [9]:
# Tokenizer data used by word_tokenize in the tokenization cell.
# Newer NLTK releases load the 'punkt_tab' tables instead of the pickled
# 'punkt' models, so fetch both to keep the notebook runnable across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /Users/vijaya/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
# English stop-word lists, read through nltk.corpus.stopwords in the stop-word removal cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vijaya/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
# WordNet data, needed by WordNetLemmatizer in the lemmatization cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..


True

In [2]:
# Deliberately noisy sample input: it contains markup-like characters, a digit
# and punctuation for the cleaning cells below to strip out.
text = (
    "<vijaya & We can combine all the [ preprocessing 9 methods above "
    "and create a / preprocess function that takes in a .txt file "
    "and handles all the preprocessing. "
    "We print out the tokens, filtered words (after stopword filtering), "
    "stemmed words, d on to the model or foand POS, "
    "one of which is usually passer further processing."
)

In [27]:
# Remove HTML tags: drop every shortest "<...>" span from the text.
html_tag_remover = re.compile('<.*?>')
text = html_tag_remover.sub('', text)
print(text)

 vijaya   we can combine all the   preprocessing   methods above and create a   preprocess function that takes in a  txt file and handles all the preprocessing  we print out the tokens  filtered words  after stopword filtering   stemmed words  d on to the model or foand pos  one of which is usually passer further processing 


In [4]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols) with a space, then lower-case the result.
letters_only = re.sub('[^a-zA-Z]', ' ', text)
text = letters_only.lower()
print(text)

 vijaya   we can combine all the   preprocessing   methods above and create a   preprocess function that takes in a  txt file and handles all the preprocessing  we print out the tokens  filtered words  after stopword filtering   stemmed words  d on to the model or foand pos  one of which is usually passer further processing 


In [5]:
# Tokenization: split the cleaned text into word tokens.
# Requires the NLTK 'punkt' tokenizer data to be downloaded.
words = word_tokenize(text, language='english')
print(words)

['vijaya', 'we', 'can', 'combine', 'all', 'the', 'preprocessing', 'methods', 'above', 'and', 'create', 'a', 'preprocess', 'function', 'that', 'takes', 'in', 'a', 'txt', 'file', 'and', 'handles', 'all', 'the', 'preprocessing', 'we', 'print', 'out', 'the', 'tokens', 'filtered', 'words', 'after', 'stopword', 'filtering', 'stemmed', 'words', 'd', 'on', 'to', 'the', 'model', 'or', 'foand', 'pos', 'one', 'of', 'which', 'is', 'usually', 'passer', 'further', 'processing']


In [6]:
#Removing stop words
stop_words = nltk.corpus.stopwords.words('english')
# Test membership against a set (constant-time lookup) instead of scanning the
# list once per token; `stop_words` itself is left as the original list.
stop_word_set = set(stop_words)
words = [word for word in words if word not in stop_word_set]
print(words)

['vijaya', 'combine', 'preprocessing', 'methods', 'create', 'preprocess', 'function', 'takes', 'txt', 'file', 'handles', 'preprocessing', 'print', 'tokens', 'filtered', 'words', 'stopword', 'filtering', 'stemmed', 'words', 'model', 'foand', 'pos', 'one', 'usually', 'passer', 'processing']


In [14]:
# Lemmatization: map each token to its WordNet lemma (default part of speech).
# Requires the NLTK 'wordnet' data to be downloaded.
lemmatizer = WordNetLemmatizer()
words = list(map(lemmatizer.lemmatize, words))
print(words)

['vijaya', 'combine', 'preprocessing', 'method', 'create', 'preprocess', 'function', 'take', 'txt', 'file', 'handle', 'preprocessing', 'print', 'token', 'filtered', 'word', 'stopword', 'filtering', 'stemmed', 'word', 'model', 'foand', 'po', 'one', 'usually', 'passer', 'processing']


In [7]:
# Stemming: reduce each token to its Porter stem.
porter_stemmer = PorterStemmer()
words = list(map(porter_stemmer.stem, words))
print(words)

['vijaya', 'combin', 'preprocess', 'method', 'creat', 'preprocess', 'function', 'take', 'txt', 'file', 'handl', 'preprocess', 'print', 'token', 'filter', 'word', 'stopword', 'filter', 'stem', 'word', 'model', 'foand', 'po', 'one', 'usual', 'passer', 'process']


In [9]:
#Snowball stemmer
# SnowballStemmer is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.
# Note: `words` already holds the Porter-stemmed tokens from the previous cell,
# so this cell stems them a second time.
snow_stemmer = SnowballStemmer(language='english')
words = [snow_stemmer.stem(word) for word in words]
print(words)

['vijaya', 'combin', 'preprocess', 'method', 'creat', 'preprocess', 'function', 'take', 'txt', 'file', 'handl', 'preprocess', 'print', 'token', 'filter', 'word', 'stopword', 'filter', 'stem', 'word', 'model', 'foand', 'pos', 'one', 'usual', 'passer', 'process']


# Feature Extraction / Vectorization

Text-to-numeric conversion: there are several different approaches

1. Bag of words (BoW)

2. TF-IDF

3. Word2Vec

In [18]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

In [28]:
# BOW method
# Each element of `words` is passed to the vectorizer as its own document,
# so the resulting matrix has one row per token.
bow_vectorizer = CountVectorizer()
X = bow_vectorizer.fit_transform(words)

# Use .toarray() to densify: the `.A` shorthand is no longer available on
# sparse matrices in newer SciPy releases.
bow_df = pd.DataFrame(X.toarray(), columns=bow_vectorizer.get_feature_names_out())
bow_df.head(5)

Unnamed: 0,combin,creat,file,filter,foand,function,handl,method,model,one,...,print,process,stem,stopword,take,token,txt,usual,vijaya,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# TF-IDF method
# Each element of `words` is passed to the vectorizer as its own document,
# so the resulting matrix has one row per token.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(words)

# Use .toarray() to densify (the `.A` shorthand is no longer available on
# sparse matrices in newer SciPy releases); round to 3 decimals for display.
tfidf_df = pd.DataFrame(np.round(X.toarray(), 3), columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.head(2)

Unnamed: 0,combin,creat,file,filter,foand,function,handl,method,model,one,...,print,process,stem,stopword,take,token,txt,usual,vijaya,word
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
