### IMPORTING THE LIBRARIES

In [1]:
import os, json
import pandas as pd
import re
import nltk

### COLLECTING .JSON FILES

In [2]:
path_to_json = 'dataset'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

In [3]:
#LIST OF .JSON FILES
json_files

['web-1.json',
 'web-10034.json',
 'web-10047.json',
 'web-1005.json',
 'web-10089.json',
 'web-10133.json',
 'web-10165.json',
 'web-2012.json',
 'web-3.json',
 'web-4.json']

EXTRACTING TEXT FROM EACH .JSON FILE

In [4]:
text = []
for index, js in enumerate(json_files):
    with open(os.path.join(path_to_json, js)) as json_file:
        json_text = json.load(json_file)
    text.append(json_text['text'])

In [5]:
len(text)

10

In [6]:
#STORING ORIGINAL TEXT
raw_text = []
for i in range(10):
    string = ' '.join(str(item) for item in text[i])
    raw_text.append(string.strip().split('\n'))   

In [7]:
raw_text[0]

['2021 Coachmen Sportscoach 402TS Two Full Bath, Bunk Beds,Theater Seating, King, W/D going back Back California Privacy Rights California Consumer Privacy Act Notice for California Consumers 800-335-6054 817-790-7771 Sitemap This page took too long to load. Please try refreshing or  going back  to the previous page. All material copyright © Motor Home Specialist ( MHSRV.com ). All rights are reserved. No part of any material on this web site may be reproduced, distributed, or transmitted in any form or by any means without the prior written permission of Motor Home Specialist. *Information deemed reliable, but not guaranteed. Features & options subject to change without notice. Weights & measurements are estimates only. Verify before purchase. ',
 '         *DISCLAIMER: ',
 '        *#1 in the world or #1 in Texas references are per the official Stats Surveys Inc. for American built Motorhomes sold at single location. *MINIMUM 25% OFF MSRP DISCOUNT DOES NOT APPLY TO CLASS B RVS, FORES

### NLP PIPELINE

**TO REMOVE STOP WORDS

In [8]:
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.append('n')
stop_words.append('r')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pragati\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**FOR LEMMATIZATION

In [9]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

**FOR STEMMING

In [10]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

**FOR BIGRAM FORMATION

In [11]:
from nltk.util import bigrams

### TEXT PROCESSING

In [12]:
filtered_text = []     #TO STORE THE PROCESSED DATA

bigrams = []    
for t in raw_text:
    filtered_sentence = ""
    stemmed_list = []
    lemmatized_list = []
    
    sentence = str(t)
    
    #Data Cleansing
    sentence = re.sub(r'[^\w\s]', ' ', sentence)
    
    #Tokenization
    words = nltk.word_tokenize(sentence)
    
    #Stop words removal
    words = [w for w in words if not w in stop_words]
    
    #Stemming
    for word in words:
        stemmed_word = stemmer.stem(word)
        stemmed_list.append(stemmed_word)
        
    #Lemmatization
    for s_word in stemmed_list:
        lemmatized_word = lemmatizer.lemmatize(s_word)
        lemmatized_list.append(lemmatized_word)
    
    #Bigram Formation
    bigrams = list(nltk.bigrams(lemmatized_list))
    bigram_text=[" ".join(t) for t in bigrams]
    
    filtered_text.append(bigram_text)  

In [13]:
len(filtered_text)

10

In [14]:
len(filtered_text[9])

1843

CONVERTING DATA TO PANDAS DATAFRAME

In [15]:
table_head = ["Raw Data", "Processed Data"]

In [16]:
df_data = pd.DataFrame(zip(raw_text,filtered_text), columns = table_head)
df_data

Unnamed: 0,Raw Data,Processed Data
0,[2021 Coachmen Sportscoach 402TS Two Full Bath...,"[2021 coachman, coachman sportscoach, sportsco..."
1,[Our Online Privacy Policy Skip to main conten...,"[our onlin, onlin privaci, privaci polici, pol..."
2,[Python: parsing PDF text and tables - usage a...,"[python par, par pdf, pdf text, text tabl, tab..."
3,[Arts vs. Athletics: Two Great Ways You Can Pr...,"[art v, v athlet, athlet two, two great, great..."
4,[Bubble Gum Pop Hat | AllFreeCrochet.com close...,"[bubbl gum, gum pop, pop hat, hat allfreecroch..."
5,[Transitional Care Gets a Room of Its Own - Pa...,"[transit care, care get, get room, room it, it..."
6,[Decorative Objects/Accessories - Page 2 of 12...,"[decor object, object accessori, accessori pag..."
7,[Home About us Approach Family Office Purposef...,"[home about, about u, u approach, approach fam..."
8,[Jemarl Baker Jr. to transfer from Arizona Wil...,"[jemarl baker, baker Jr, Jr transfer, transfer..."
9,"[Publications | The Protect Heritage Corp. \r,...","[public the, the protect, protect heritag, her..."


## EMBEDDING TECHNIQUES

BAG OF WORDS

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
BoW_list = []
for i in range(10):
    x = vec.fit_transform(filtered_text[i])
    BoW_list.append(x)

In [18]:
BoW_list

[<552x292 sparse matrix of type '<class 'numpy.int64'>'
 	with 1058 stored elements in Compressed Sparse Row format>,
 <1070x339 sparse matrix of type '<class 'numpy.int64'>'
 	with 2106 stored elements in Compressed Sparse Row format>,
 <557x252 sparse matrix of type '<class 'numpy.int64'>'
 	with 1064 stored elements in Compressed Sparse Row format>,
 <1961x664 sparse matrix of type '<class 'numpy.int64'>'
 	with 3823 stored elements in Compressed Sparse Row format>,
 <3052x844 sparse matrix of type '<class 'numpy.int64'>'
 	with 5805 stored elements in Compressed Sparse Row format>,
 <1079x470 sparse matrix of type '<class 'numpy.int64'>'
 	with 2140 stored elements in Compressed Sparse Row format>,
 <1091x226 sparse matrix of type '<class 'numpy.int64'>'
 	with 2140 stored elements in Compressed Sparse Row format>,
 <270x164 sparse matrix of type '<class 'numpy.int64'>'
 	with 518 stored elements in Compressed Sparse Row format>,
 <867x292 sparse matrix of type '<class 'numpy.int64

In [19]:
len(BoW_list)

10

In [20]:
table_head.append("Bag of Words")
df_data = pd.DataFrame(zip(raw_text,filtered_text, BoW_list), columns = table_head)
df_data

Unnamed: 0,Raw Data,Processed Data,Bag of Words
0,[2021 Coachmen Sportscoach 402TS Two Full Bath...,"[2021 coachman, coachman sportscoach, sportsco...","(0, 10)\t1\n (0, 67)\t1\n (1, 67)\t1\n (1..."
1,[Our Online Privacy Policy Skip to main conten...,"[our onlin, onlin privaci, privaci polici, pol...","(0, 214)\t1\n (0, 211)\t1\n (1, 211)\t1\n ..."
2,[Python: parsing PDF text and tables - usage a...,"[python par, par pdf, pdf text, text tabl, tab...","(0, 187)\t1\n (0, 169)\t1\n (1, 169)\t1\n ..."
3,[Arts vs. Athletics: Two Great Ways You Can Pr...,"[art v, v athlet, athlet two, two great, great...","(0, 56)\t1\n (1, 63)\t1\n (2, 63)\t1\n (2..."
4,[Bubble Gum Pop Hat | AllFreeCrochet.com close...,"[bubbl gum, gum pop, pop hat, hat allfreecroch...","(0, 136)\t1\n (0, 351)\t1\n (1, 351)\t1\n ..."
5,[Transitional Care Gets a Room of Its Own - Pa...,"[transit care, care get, get room, room it, it...","(0, 424)\t1\n (0, 72)\t1\n (1, 72)\t1\n (..."
6,[Decorative Objects/Accessories - Page 2 of 12...,"[decor object, object accessori, accessori pag...","(0, 73)\t1\n (0, 145)\t1\n (1, 145)\t1\n ..."
7,[Home About us Approach Family Office Purposef...,"[home about, about u, u approach, approach fam...","(0, 67)\t1\n (0, 0)\t1\n (1, 0)\t1\n (2, ..."
8,[Jemarl Baker Jr. to transfer from Arizona Wil...,"[jemarl baker, baker Jr, Jr transfer, transfer...","(0, 143)\t1\n (0, 52)\t1\n (1, 52)\t1\n (..."
9,"[Publications | The Protect Heritage Corp. \r,...","[public the, the protect, protect heritag, her...","(0, 501)\t1\n (0, 601)\t1\n (1, 601)\t1\n ..."


Term Frequency Inverse Document Frequency (TF-IDF)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
Tfidf_list = []
for i in range(10):
    x = vectorizer.fit_transform(filtered_text[i])
    Tfidf_list.append(x)

In [22]:
Tfidf_list

[<552x292 sparse matrix of type '<class 'numpy.float64'>'
 	with 1058 stored elements in Compressed Sparse Row format>,
 <1070x339 sparse matrix of type '<class 'numpy.float64'>'
 	with 2106 stored elements in Compressed Sparse Row format>,
 <557x252 sparse matrix of type '<class 'numpy.float64'>'
 	with 1064 stored elements in Compressed Sparse Row format>,
 <1961x664 sparse matrix of type '<class 'numpy.float64'>'
 	with 3823 stored elements in Compressed Sparse Row format>,
 <3052x844 sparse matrix of type '<class 'numpy.float64'>'
 	with 5805 stored elements in Compressed Sparse Row format>,
 <1079x470 sparse matrix of type '<class 'numpy.float64'>'
 	with 2140 stored elements in Compressed Sparse Row format>,
 <1091x226 sparse matrix of type '<class 'numpy.float64'>'
 	with 2140 stored elements in Compressed Sparse Row format>,
 <270x164 sparse matrix of type '<class 'numpy.float64'>'
 	with 518 stored elements in Compressed Sparse Row format>,
 <867x292 sparse matrix of type '<cl

In [23]:
len(Tfidf_list)

10

In [24]:
table_head.append("Tfidf")
df_data = pd.DataFrame(zip(raw_text,filtered_text, BoW_list, Tfidf_list), columns = table_head)
df_data

Unnamed: 0,Raw Data,Processed Data,Bag of Words,Tfidf
0,[2021 Coachmen Sportscoach 402TS Two Full Bath...,"[2021 coachman, coachman sportscoach, sportsco...","(0, 10)\t1\n (0, 67)\t1\n (1, 67)\t1\n (1...","(0, 67)\t0.6934178959879685\n (0, 10)\t0.72..."
1,[Our Online Privacy Policy Skip to main conten...,"[our onlin, onlin privaci, privaci polici, pol...","(0, 214)\t1\n (0, 211)\t1\n (1, 211)\t1\n ...","(0, 211)\t0.6872192264199813\n (0, 214)\t0...."
2,[Python: parsing PDF text and tables - usage a...,"[python par, par pdf, pdf text, text tabl, tab...","(0, 187)\t1\n (0, 169)\t1\n (1, 169)\t1\n ...","(0, 169)\t0.8221531853387946\n (0, 187)\t0...."
3,[Arts vs. Athletics: Two Great Ways You Can Pr...,"[art v, v athlet, athlet two, two great, great...","(0, 56)\t1\n (1, 63)\t1\n (2, 63)\t1\n (2...","(0, 56)\t1.0\n (1, 63)\t1.0\n (2, 610)\t0...."
4,[Bubble Gum Pop Hat | AllFreeCrochet.com close...,"[bubbl gum, gum pop, pop hat, hat allfreecroch...","(0, 136)\t1\n (0, 351)\t1\n (1, 351)\t1\n ...","(0, 351)\t0.7047811753225802\n (0, 136)\t0...."
5,[Transitional Care Gets a Room of Its Own - Pa...,"[transit care, care get, get room, room it, it...","(0, 424)\t1\n (0, 72)\t1\n (1, 72)\t1\n (...","(0, 72)\t0.6613492853098424\n (0, 424)\t0.7..."
6,[Decorative Objects/Accessories - Page 2 of 12...,"[decor object, object accessori, accessori pag...","(0, 73)\t1\n (0, 145)\t1\n (1, 145)\t1\n ...","(0, 145)\t0.7391952538944243\n (0, 73)\t0.6..."
7,[Home About us Approach Family Office Purposef...,"[home about, about u, u approach, approach fam...","(0, 67)\t1\n (0, 0)\t1\n (1, 0)\t1\n (2, ...","(0, 0)\t0.6914852473085188\n (0, 67)\t0.722..."
8,[Jemarl Baker Jr. to transfer from Arizona Wil...,"[jemarl baker, baker Jr, Jr transfer, transfer...","(0, 143)\t1\n (0, 52)\t1\n (1, 52)\t1\n (...","(0, 52)\t0.6732948905294239\n (0, 143)\t0.7..."
9,"[Publications | The Protect Heritage Corp. \r,...","[public the, the protect, protect heritag, her...","(0, 501)\t1\n (0, 601)\t1\n (1, 601)\t1\n ...","(0, 601)\t0.6590000178652068\n (0, 501)\t0...."
