# Data Pre-processing

Text data cleaning

In [1]:
import re
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer,SnowballStemmer

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/vijaya/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vijaya/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/vijaya/sw_install/anaconda3/envs/nlp/nltk_data.
[nltk_data]     ..


True

In [2]:
text = "<vijaya & We can combine all the [ preprocessing 9 methods above and create a / preprocess function that takes in a .txt file and handles all the preprocessing. We print out the tokens, filtered words (after stopword filtering), stemmed words, d on to the model or foand POS, one of which is usually passer further processing."

In [3]:
#Removing HTML tags
html_tag_remover = re.compile('<.*?>')
text = re.sub(html_tag_remover, '', text)
print(text)

<vijaya & We can combine all the [ preprocessing 9 methods above and create a / preprocess function that takes in a .txt file and handles all the preprocessing. We print out the tokens, filtered words (after stopword filtering), stemmed words, d on to the model or foand POS, one of which is usually passer further processing.


In [4]:
#Removing non-alphabets(numbers) and convert to lower case
text = re.sub('[^a-zA-Z]', ' ', text).lower()
print(text)

 vijaya   we can combine all the   preprocessing   methods above and create a   preprocess function that takes in a  txt file and handles all the preprocessing  we print out the tokens  filtered words  after stopword filtering   stemmed words  d on to the model or foand pos  one of which is usually passer further processing 


In [5]:
#Performing tokenization
words = word_tokenize(text)  # download 'punkt'
print(words)

['vijaya', 'we', 'can', 'combine', 'all', 'the', 'preprocessing', 'methods', 'above', 'and', 'create', 'a', 'preprocess', 'function', 'that', 'takes', 'in', 'a', 'txt', 'file', 'and', 'handles', 'all', 'the', 'preprocessing', 'we', 'print', 'out', 'the', 'tokens', 'filtered', 'words', 'after', 'stopword', 'filtering', 'stemmed', 'words', 'd', 'on', 'to', 'the', 'model', 'or', 'foand', 'pos', 'one', 'of', 'which', 'is', 'usually', 'passer', 'further', 'processing']


In [6]:
#Removing stop words
stop_words = nltk.corpus.stopwords.words('english')
words = [word for word in words if word not in stop_words ]
print(words)

['vijaya', 'combine', 'preprocessing', 'methods', 'create', 'preprocess', 'function', 'takes', 'txt', 'file', 'handles', 'preprocessing', 'print', 'tokens', 'filtered', 'words', 'stopword', 'filtering', 'stemmed', 'words', 'model', 'foand', 'pos', 'one', 'usually', 'passer', 'processing']


In [7]:
#Perform Lemmatization
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]  #download 'wordnet'
print(words)

['vijaya', 'combine', 'preprocessing', 'method', 'create', 'preprocess', 'function', 'take', 'txt', 'file', 'handle', 'preprocessing', 'print', 'token', 'filtered', 'word', 'stopword', 'filtering', 'stemmed', 'word', 'model', 'foand', 'po', 'one', 'usually', 'passer', 'processing']


In [7]:
#Perform Stemming
porter_stemmer = PorterStemmer()

words = [porter_stemmer.stem(word) for word in words]
print(words)

['vijaya', 'combin', 'preprocess', 'method', 'creat', 'preprocess', 'function', 'take', 'txt', 'file', 'handl', 'preprocess', 'print', 'token', 'filter', 'word', 'stopword', 'filter', 'stem', 'word', 'model', 'foand', 'po', 'one', 'usual', 'passer', 'process']


In [9]:
#Snowball stemmer
from nltk.stem import SnowballStemmer
snow_stemmer = SnowballStemmer(language='english')
words = [snow_stemmer.stem(word) for word in words]
print(words)

['vijaya', 'combin', 'preprocess', 'method', 'creat', 'preprocess', 'function', 'take', 'txt', 'file', 'handl', 'preprocess', 'print', 'token', 'filter', 'word', 'stopword', 'filter', 'stem', 'word', 'model', 'foand', 'pos', 'one', 'usual', 'passer', 'process']


# Feature Extraction / Vectorization

Text-to-Numeric : we have diff approaches

1. Bag of words (BoW)

2. TF-IDF

3. Word2Vec

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import numpy as np

In [18]:
# BOW method
bow_vectorizer = CountVectorizer()
X1 = bow_vectorizer.fit_transform(words)

bow_df = pd.DataFrame(X1.A , columns= bow_vectorizer.get_feature_names_out())
bow_df.head(5)

Unnamed: 0,combine,create,file,filtered,filtering,foand,function,handle,method,model,...,print,processing,stemmed,stopword,take,token,txt,usually,vijaya,word
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
print(bow_vectorizer.vocabulary_)

{'vijaya': 23, 'combine': 0, 'preprocessing': 14, 'method': 8, 'create': 1, 'preprocess': 13, 'function': 6, 'take': 19, 'txt': 21, 'file': 2, 'handle': 7, 'print': 15, 'token': 20, 'filtered': 3, 'word': 24, 'stopword': 18, 'filtering': 4, 'stemmed': 17, 'model': 9, 'foand': 5, 'po': 12, 'one': 10, 'usually': 22, 'passer': 11, 'processing': 16}


In [19]:
print(X1) # BoW weights for each terms belong to a particular documen

  (0, 23)	1
  (1, 0)	1
  (2, 14)	1
  (3, 8)	1
  (4, 1)	1
  (5, 13)	1
  (6, 6)	1
  (7, 19)	1
  (8, 21)	1
  (9, 2)	1
  (10, 7)	1
  (11, 14)	1
  (12, 15)	1
  (13, 20)	1
  (14, 3)	1
  (15, 24)	1
  (16, 18)	1
  (17, 4)	1
  (18, 17)	1
  (19, 24)	1
  (20, 9)	1
  (21, 5)	1
  (22, 12)	1
  (23, 10)	1
  (24, 22)	1
  (25, 11)	1
  (26, 16)	1


In [9]:
# TF-IDF method
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(words)

tfidf_df = pd.DataFrame(np.round(X.A,3) , columns= tfidf_vectorizer.get_feature_names_out())
tfidf_df.head(2)

Unnamed: 0,combine,create,file,filtered,filtering,foand,function,handle,method,model,...,print,processing,stemmed,stopword,take,token,txt,usually,vijaya,word
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
print(tfidf_vectorizer.vocabulary_) # vocabulary terms and their index

{'vijaya': 23, 'combine': 0, 'preprocessing': 14, 'method': 8, 'create': 1, 'preprocess': 13, 'function': 6, 'take': 19, 'txt': 21, 'file': 2, 'handle': 7, 'print': 15, 'token': 20, 'filtered': 3, 'word': 24, 'stopword': 18, 'filtering': 4, 'stemmed': 17, 'model': 9, 'foand': 5, 'po': 12, 'one': 10, 'usually': 22, 'passer': 11, 'processing': 16}


In [14]:
print(X) # tf-idf weights for each terms belong to a particular documen

  (0, 23)	1.0
  (1, 0)	1.0
  (2, 14)	1.0
  (3, 8)	1.0
  (4, 1)	1.0
  (5, 13)	1.0
  (6, 6)	1.0
  (7, 19)	1.0
  (8, 21)	1.0
  (9, 2)	1.0
  (10, 7)	1.0
  (11, 14)	1.0
  (12, 15)	1.0
  (13, 20)	1.0
  (14, 3)	1.0
  (15, 24)	1.0
  (16, 18)	1.0
  (17, 4)	1.0
  (18, 17)	1.0
  (19, 24)	1.0
  (20, 9)	1.0
  (21, 5)	1.0
  (22, 12)	1.0
  (23, 10)	1.0
  (24, 22)	1.0
  (25, 11)	1.0
  (26, 16)	1.0


In [20]:
#Word2Vec 

#Mostly, NLTK corpus/data works very well with 'gensim' models. (i.e) 
import gensim
from gensim.models import word2vec

import multiprocessing

In [21]:
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
print(cores)

12


In [23]:
#training our data/words with gensim model
gensim_model = word2vec.Word2Vec(words, sg=1, workers=cores-1, window=4, min_count=0)

#saving the model with name 'gensim_Word2Vec_Model1'
gensim_model.save('generated/word2vecmodel.embedding')

In [29]:
gensim_model.wv.key_to_index

{'e': 0,
 'o': 1,
 'r': 2,
 's': 3,
 't': 4,
 'n': 5,
 'p': 6,
 'i': 7,
 'd': 8,
 'a': 9,
 'c': 10,
 'l': 11,
 'm': 12,
 'f': 13,
 'g': 14,
 'w': 15,
 'u': 16,
 'h': 17,
 'y': 18,
 'k': 19,
 'b': 20,
 'j': 21,
 'x': 22,
 'v': 23}

In [30]:
from gensim.models import KeyedVectors

#loading the saved word2vec model
gensim_load_model = KeyedVectors.load('generated/word2vecmodel.embedding')

In [31]:
gensim_load_model.wv.

array([[-5.3926115e-04,  2.5880086e-04,  5.1066871e-03, ...,
        -7.0486129e-03,  9.0537930e-04,  6.3952692e-03],
       [-8.6522466e-03,  3.6946868e-03,  5.1975371e-03, ...,
        -2.3710402e-03, -9.5002307e-03,  4.5305295e-03],
       [ 1.7434624e-05,  3.1616748e-03, -6.8913666e-03, ...,
         5.5341172e-04,  8.2652131e-03, -6.9025490e-03],
       ...,
       [ 1.2983537e-03,  6.6034454e-03,  9.9805482e-03, ...,
         9.0810293e-03, -5.7746135e-03,  3.7277236e-03],
       [-2.3433071e-04,  4.2182151e-03,  2.1172029e-03, ...,
         7.2509079e-05,  2.6453231e-06,  8.7128431e-03],
       [-2.5250332e-03, -5.8976240e-03,  7.4725486e-03, ...,
         2.1580623e-03,  6.6877826e-04,  9.5317243e-03]], dtype=float32)