In [7]:
# Importing required libraries
import json
import pandas as pd
import re
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.collocations import BigramCollocationFinder

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
json_list = [
    'web-1.json',
    'web-10034.json',
    'web-10047.json',
    'web-1005.json',
    'web-10089.json',
    'web-10133.json',
    'web-10165.json',
    'web-2012.json',
    'web-3.json',
    'web-4.json'
]

raw_text_list = []

# Storing text from the json files in a list
for i in json_list:
  f = open(i)

  # returns JSON object as a dictionary
  data = json.load(f)
  
  raw_text = ''
  for text in data['text']:
      raw_text = raw_text + text
    
  f.close()
  raw_text_list.append(raw_text)

raw_text_list[0]

'2021 Coachmen Sportscoach 402TS Two Full Bath, Bunk Beds,Theater Seating, King, W/Dgoing backBackCalifornia Privacy RightsCalifornia Consumer Privacy Act Notice for California Consumers800-335-6054817-790-7771SitemapThis page took too long to load. Please try refreshing or going back to the previous page.All material copyright © Motor Home Specialist ( MHSRV.com ). All rights are reserved. No part of any material on this web site may be reproduced, distributed, or transmitted in any form or by any means without the prior written permission of Motor Home Specialist. *Information deemed reliable, but not guaranteed. Features & options subject to change without notice. Weights & measurements are estimates only. Verify before purchase.\n        *DISCLAIMER:\n        *#1 in the world or #1 in Texas references are per the official Stats Surveys Inc. for American built Motorhomes sold at single location. *MINIMUM 25% OFF MSRP DISCOUNT DOES NOT APPLY TO CLASS B RVS, FORESTER, DYNAMAX OR TWILI

In [9]:
df = pd.DataFrame(raw_text_list, columns = {'Raw_text'})
df

Unnamed: 0,Raw_text
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,..."
1,Our Online Privacy PolicySkip to main contentE...
2,Python: parsing PDF text and tables - usage an...
3,Arts vs. Athletics: Two Great Ways You Can Pr...
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...
5,Transitional Care Gets a Room of Its Own - Pat...
6,Decorative Objects/Accessories - Page 2 of 12 ...
7,\r\n\t \t \t\r\n HomeA...
8,Jemarl Baker Jr. to transfer from Arizona Wild...
9,Publications | The Protect Heritage Corp.\r\n\...


---
#**NLP Pipeline**

**Data Cleaning**

In [10]:
# Removing extra whitespaces
def remove_extra_whitespace(text):
  words_wo_whitespace = re.sub("\s+"," ", text)
  return words_wo_whitespace

df['wo_extra_whitespace']=df['Raw_text'].apply(lambda x: remove_extra_whitespace(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...


In [11]:
# Remove punctuations
def remove_punctuation(text):
  words_wo_punct = re.sub("[^-9A-Za-z ]"," ", text)
  # # OR Using String
  # no_punct = "".join([i for i in text if i not in string.punctuation])
  # words_wo_punct=''.join(no_punct)
  return words_wo_punct

df['wo_punct']=df['wo_extra_whitespace'].apply(lambda x: remove_punctuation(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...


In [12]:
# Converting all text to lower case
def normalise_case(text):
  word_normalised = "".join([i.lower() for i in text if i not in string.punctuation])
  text_normalised=''.join(word_normalised)
  return text_normalised

df['case_normalised']=df['wo_punct'].apply(lambda x: normalise_case(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,transitional care gets a room of its own pati...
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...,decorative objects accessories page of ...
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...,homeabout usapproachfamily officepurposeful i...
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...,jemarl baker jr to transfer from arizona wild...
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...,publications the protect heritage corp home...


**Tokenisation**

In [13]:
# Converting text into tokens
# nltk.download('punkt')
def tokenize(text):
    tokens=re.split("\W+",text)
    # OR Using NLTK
    # tokens = nltk.word_tokenize(text) 
    return tokens
df['tokens']=df['case_normalised'].apply(lambda x: tokenize(x.lower()))
df.head()

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised,tokens
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...,"[, coachmen, sportscoach, ts, two, full, bath,..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...,"[our, online, privacy, policyskip, to, main, c..."
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...,"[python, parsing, pdf, text, and, tables, usag..."
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...,"[, arts, vs, athletics, two, great, ways, you,..."
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...,"[bubble, gum, pop, hat, allfreecrochet, comclo..."


**Stop Words Removal**

In [14]:
# Removing stopwords and empty strings
stopword = nltk.corpus.stopwords.words('english')

def remove_stop_words(text):
  new_text = list(filter(None, [i for i in text if i not in stopword]))
  return new_text

df['stop_words_removed']=df['tokens'].apply(lambda x: remove_stop_words(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised,tokens,stop_words_removed
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...,"[, coachmen, sportscoach, ts, two, full, bath,...","[coachmen, sportscoach, ts, two, full, bath, b..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...,"[our, online, privacy, policyskip, to, main, c...","[online, privacy, policyskip, main, contenteng..."
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...,"[python, parsing, pdf, text, and, tables, usag...","[python, parsing, pdf, text, tables, usage, co..."
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...,"[, arts, vs, athletics, two, great, ways, you,...","[arts, vs, athletics, two, great, ways, prepar..."
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...,"[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubble, gum, pop, hat, allfreecrochet, comclo..."
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,transitional care gets a room of its own pati...,"[transitional, care, gets, a, room, of, its, o...","[transitional, care, gets, room, patient, safe..."
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...,decorative objects accessories page of ...,"[decorative, objects, accessories, page, of, e...","[decorative, objects, accessories, page, engli..."
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...,homeabout usapproachfamily officepurposeful i...,"[, homeabout, usapproachfamily, officepurposef...","[homeabout, usapproachfamily, officepurposeful..."
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...,jemarl baker jr to transfer from arizona wild...,"[jemarl, baker, jr, to, transfer, from, arizon...","[jemarl, baker, jr, transfer, arizona, wildcat..."
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...,publications the protect heritage corp home...,"[publications, the, protect, heritage, corp, h...","[publications, protect, heritage, corp, homeab..."


**Stemming**

In [15]:
# For stemming
ps = nltk.PorterStemmer()

def stem(text):
  new_text = [ps.stem(word) for word in text]
  return new_text

df['stemmed']=df['stop_words_removed'].apply(lambda x: stem(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised,tokens,stop_words_removed,stemmed
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...,"[, coachmen, sportscoach, ts, two, full, bath,...","[coachmen, sportscoach, ts, two, full, bath, b...","[coachmen, sportscoach, ts, two, full, bath, b..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...,"[our, online, privacy, policyskip, to, main, c...","[online, privacy, policyskip, main, contenteng...","[onlin, privaci, policyskip, main, contentengl..."
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...,"[python, parsing, pdf, text, and, tables, usag...","[python, parsing, pdf, text, tables, usage, co...","[python, pars, pdf, text, tabl, usag, comparis..."
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...,"[, arts, vs, athletics, two, great, ways, you,...","[arts, vs, athletics, two, great, ways, prepar...","[art, vs, athlet, two, great, way, prepar, col..."
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...,"[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubbl, gum, pop, hat, allfreecrochet, comclos..."
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,transitional care gets a room of its own pati...,"[transitional, care, gets, a, room, of, its, o...","[transitional, care, gets, room, patient, safe...","[transit, care, get, room, patient, safeti, qu..."
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...,decorative objects accessories page of ...,"[decorative, objects, accessories, page, of, e...","[decorative, objects, accessories, page, engli...","[decor, object, accessori, page, english, acce..."
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...,homeabout usapproachfamily officepurposeful i...,"[, homeabout, usapproachfamily, officepurposef...","[homeabout, usapproachfamily, officepurposeful...","[homeabout, usapproachfamili, officepurpos, in..."
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...,jemarl baker jr to transfer from arizona wild...,"[jemarl, baker, jr, to, transfer, from, arizon...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl, baker, jr, transfer, arizona, wildcat..."
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...,publications the protect heritage corp home...,"[publications, the, protect, heritage, corp, h...","[publications, protect, heritage, corp, homeab...","[public, protect, heritag, corp, homeaboutgiv,..."


**Lemmatization**

In [16]:
# For lemmatization
wn = nltk.WordNetLemmatizer()

def lemmatize(text):
  new_text = [wn.lemmatize(word) for word in text]
  return new_text

df['lemmatized']=df['stemmed'].apply(lambda x: lemmatize(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised,tokens,stop_words_removed,stemmed,lemmatized
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...,"[, coachmen, sportscoach, ts, two, full, bath,...","[coachmen, sportscoach, ts, two, full, bath, b...","[coachmen, sportscoach, ts, two, full, bath, b...","[coachman, sportscoach, t, two, full, bath, bu..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...,"[our, online, privacy, policyskip, to, main, c...","[online, privacy, policyskip, main, contenteng...","[onlin, privaci, policyskip, main, contentengl...","[onlin, privaci, policyskip, main, contentengl..."
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...,"[python, parsing, pdf, text, and, tables, usag...","[python, parsing, pdf, text, tables, usage, co...","[python, pars, pdf, text, tabl, usag, comparis...","[python, par, pdf, text, tabl, usag, compariso..."
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...,"[, arts, vs, athletics, two, great, ways, you,...","[arts, vs, athletics, two, great, ways, prepar...","[art, vs, athlet, two, great, way, prepar, col...","[art, v, athlet, two, great, way, prepar, coll..."
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...,"[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubbl, gum, pop, hat, allfreecrochet, comclos...","[bubbl, gum, pop, hat, allfreecrochet, comclos..."
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,transitional care gets a room of its own pati...,"[transitional, care, gets, a, room, of, its, o...","[transitional, care, gets, room, patient, safe...","[transit, care, get, room, patient, safeti, qu...","[transit, care, get, room, patient, safeti, qu..."
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...,decorative objects accessories page of ...,"[decorative, objects, accessories, page, of, e...","[decorative, objects, accessories, page, engli...","[decor, object, accessori, page, english, acce...","[decor, object, accessori, page, english, acce..."
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...,homeabout usapproachfamily officepurposeful i...,"[, homeabout, usapproachfamily, officepurposef...","[homeabout, usapproachfamily, officepurposeful...","[homeabout, usapproachfamili, officepurpos, in...","[homeabout, usapproachfamili, officepurpos, in..."
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...,jemarl baker jr to transfer from arizona wild...,"[jemarl, baker, jr, to, transfer, from, arizon...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl, baker, jr, transfer, arizona, wildcat..."
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...,publications the protect heritage corp home...,"[publications, the, protect, heritage, corp, h...","[publications, protect, heritage, corp, homeab...","[public, protect, heritag, corp, homeaboutgiv,...","[public, protect, heritag, corp, homeaboutgiv,..."


**Bigram Formation**

In [19]:
def form_bigrams(text):
  bgs_list = []
  finder = BigramCollocationFinder.from_words(text)
  for bigram,freq in finder.ngram_fd.items():
    bgs_list.append(' '.join(bigram))
  return bgs_list

df['bigrams']=df['lemmatized'].apply(lambda x: form_bigrams(x))
df

Unnamed: 0,Raw_text,wo_extra_whitespace,wo_punct,case_normalised,tokens,stop_words_removed,stemmed,lemmatized,bigrams
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...","2021 Coachmen Sportscoach 402TS Two Full Bath,...",Coachmen Sportscoach TS Two Full Bath ...,coachmen sportscoach ts two full bath ...,"[, coachmen, sportscoach, ts, two, full, bath,...","[coachmen, sportscoach, ts, two, full, bath, b...","[coachmen, sportscoach, ts, two, full, bath, b...","[coachman, sportscoach, t, two, full, bath, bu...","[coachman sportscoach, sportscoach t, t two, t..."
1,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,Our Online Privacy PolicySkip to main contentE...,our online privacy policyskip to main contente...,"[our, online, privacy, policyskip, to, main, c...","[online, privacy, policyskip, main, contenteng...","[onlin, privaci, policyskip, main, contentengl...","[onlin, privaci, policyskip, main, contentengl...","[onlin privaci, privaci policyskip, policyskip..."
2,Python: parsing PDF text and tables - usage an...,Python: parsing PDF text and tables - usage an...,Python parsing PDF text and tables - usage an...,python parsing pdf text and tables usage and...,"[python, parsing, pdf, text, and, tables, usag...","[python, parsing, pdf, text, tables, usage, co...","[python, pars, pdf, text, tabl, usag, comparis...","[python, par, pdf, text, tabl, usag, compariso...","[python par, par pdf, pdf text, text tabl, tab..."
3,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs. Athletics: Two Great Ways You Can Pr...,Arts vs Athletics Two Great Ways You Can Pr...,arts vs athletics two great ways you can pr...,"[, arts, vs, athletics, two, great, ways, you,...","[arts, vs, athletics, two, great, ways, prepar...","[art, vs, athlet, two, great, way, prepar, col...","[art, v, athlet, two, great, way, prepar, coll...","[art v, v athlet, athlet two, two great, great..."
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,Bubble Gum Pop Hat AllFreeCrochet comcloseAd...,bubble gum pop hat allfreecrochet comclosead...,"[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubble, gum, pop, hat, allfreecrochet, comclo...","[bubbl, gum, pop, hat, allfreecrochet, comclos...","[bubbl, gum, pop, hat, allfreecrochet, comclos...","[bubbl gum, gum pop, pop hat, hat allfreecroch..."
5,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,Transitional Care Gets a Room of Its Own - Pat...,transitional care gets a room of its own pati...,"[transitional, care, gets, a, room, of, its, o...","[transitional, care, gets, room, patient, safe...","[transit, care, get, room, patient, safeti, qu...","[transit, care, get, room, patient, safeti, qu...","[transit care, care get, get room, room patien..."
6,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects/Accessories - Page 2 of 12 ...,Decorative Objects Accessories - Page of ...,decorative objects accessories page of ...,"[decorative, objects, accessories, page, of, e...","[decorative, objects, accessories, page, engli...","[decor, object, accessori, page, english, acce...","[decor, object, accessori, page, english, acce...","[decor object, object accessori, accessori pag..."
7,\r\n\t \t \t\r\n HomeA...,HomeAbout usApproachFamily OfficePurposeful I...,HomeAbout usApproachFamily OfficePurposeful I...,homeabout usapproachfamily officepurposeful i...,"[, homeabout, usapproachfamily, officepurposef...","[homeabout, usapproachfamily, officepurposeful...","[homeabout, usapproachfamili, officepurpos, in...","[homeabout, usapproachfamili, officepurpos, in...","[homeabout usapproachfamili, usapproachfamili ..."
8,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr. to transfer from Arizona Wild...,Jemarl Baker Jr to transfer from Arizona Wild...,jemarl baker jr to transfer from arizona wild...,"[jemarl, baker, jr, to, transfer, from, arizon...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl, baker, jr, transfer, arizona, wildcat...","[jemarl baker, baker jr, jr transfer, transfer..."
9,Publications | The Protect Heritage Corp.\r\n\...,Publications | The Protect Heritage Corp. Home...,Publications The Protect Heritage Corp Home...,publications the protect heritage corp home...,"[publications, the, protect, heritage, corp, h...","[publications, protect, heritage, corp, homeab...","[public, protect, heritag, corp, homeaboutgiv,...","[public, protect, heritag, corp, homeaboutgiv,...","[public protect, protect heritag, heritag corp..."


In [20]:
# Removing duplicate bigrams
def process_bigrams(text):
  processed = " ".join(word.split()[0] for word in text if len(word.split()) == 2)
  return processed

df['Processed_text']=df['bigrams'].apply(lambda x: process_bigrams(x))
df[['Raw_text','Processed_text']]

Unnamed: 0,Raw_text,Processed_text
0,"2021 Coachmen Sportscoach 402TS Two Full Bath,...",coachman sportscoach t two full bath bunk bed ...
1,Our Online Privacy PolicySkip to main contentE...,onlin privaci policyskip main contentenglishfr...
2,Python: parsing PDF text and tables - usage an...,python par pdf text tabl usag comparison pdfmi...
3,Arts vs. Athletics: Two Great Ways You Can Pr...,art v athlet two great way prepar colleg colle...
4,Bubble Gum Pop Hat | AllFreeCrochet.comcloseAd...,bubbl gum pop hat allfreecrochet comcloseadvan...
5,Transitional Care Gets a Room of Its Own - Pat...,transit care get room patient safeti qualiti h...
6,Decorative Objects/Accessories - Page 2 of 12 ...,decor object accessori page english accent ant...
7,\r\n\t \t \t\r\n HomeA...,homeabout usapproachfamili officepurpos invest...
8,Jemarl Baker Jr. to transfer from Arizona Wild...,jemarl baker jr transfer arizona wildcat per r...
9,Publications | The Protect Heritage Corp.\r\n\...,public protect heritag corp homeaboutgiv backb...


## **Embedding**

**1. Bag of Words**

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
matrix = cv.fit_transform(df['Processed_text']).toarray()
df['bow'] =  list(matrix)
df[['Processed_text','bow']]

Unnamed: 0,Processed_text,bow
0,coachman sportscoach t two full bath bunk bed ...,"[1, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,onlin privaci policyskip main contentenglishfr...,"[0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,python par pdf text tabl usag comparison pdfmi...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,art v athlet two great way prepar colleg colle...,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 1, 0, 1, ..."
4,bubbl gum pop hat allfreecrochet comcloseadvan...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,transit care get room patient safeti qualiti h...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, ..."
6,decor object accessori page english accent ant...,"[2, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
7,homeabout usapproachfamili officepurpos invest...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,jemarl baker jr transfer arizona wildcat per r...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, ..."
9,public protect heritag corp homeaboutgiv backb...,"[21, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,..."


**2. TF-IDF**

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_list = []
for i in range(10):
  data = df['Processed_text'].values[i].split()
  x = vectorizer.fit_transform(data)
  tfidf_list.append(x)
df['tf-idf'] = tfidf_list
df[['Processed_text','tf-idf']]

Unnamed: 0,Processed_text,tf-idf
0,coachman sportscoach t two full bath bunk bed ...,"(0, 30)\t1.0\n (1, 191)\t1.0\n (3, 216)\t1..."
1,onlin privaci policyskip main contentenglishfr...,"(0, 234)\t1.0\n (1, 278)\t1.0\n (2, 264)\t..."
2,python par pdf text tabl usag comparison pdfmi...,"(0, 147)\t1.0\n (1, 130)\t1.0\n (2, 134)\t..."
3,art v athlet two great way prepar colleg colle...,"(0, 51)\t1.0\n (2, 68)\t1.0\n (3, 648)\t1...."
4,bubbl gum pop hat allfreecrochet comcloseadvan...,"(0, 102)\t1.0\n (1, 340)\t1.0\n (2, 585)\t..."
5,transit care get room patient safeti qualiti h...,"(0, 417)\t1.0\n (1, 59)\t1.0\n (2, 161)\t1..."
6,decor object accessori page english accent ant...,"(0, 69)\t1.0\n (1, 153)\t1.0\n (2, 4)\t1.0..."
7,homeabout usapproachfamili officepurpos invest...,"(0, 59)\t1.0\n (1, 135)\t1.0\n (2, 87)\t1...."
8,jemarl baker jr transfer arizona wildcat per r...,"(0, 118)\t1.0\n (1, 21)\t1.0\n (2, 122)\t1..."
9,public protect heritag corp homeaboutgiv backb...,"(0, 366)\t1.0\n (1, 364)\t1.0\n (2, 204)\t..."
