In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [2]:

corpus = '''These are the Terms and Conditions governing the use of this Service and the agreement 
that operates between You and the Company. These Terms and Conditions set out the rights and obligations of 
all users regarding the use of the Service. Your access to and use of the Service is conditioned on Your 
acceptance of and compliance with these Terms and Conditions. These Terms and Conditions apply to all visitors, 
users and others who access or use the Service. By accessing or using the Service You agree to be bound by these 
Terms and Conditions. If You disagree with any part of these Terms and Conditions then You may not access the Service. 
You represent that you are over the age of 18. The Company does not permit those under 18 to use the Service. 
Your access to and use of the Service is also conditioned on Your acceptance of and compliance with the Privacy Policy 
of the Company. Our Privacy Policy describes Our policies and procedures on the collection, use and disclosure of 
Your personal information when You use the Application or the Website and tells You about Your privacy rights and 
how the law protects You. Please read Our Privacy Policy carefully before using Our Service. Intellectual Property 
The Service and its original content (excluding Content provided by You or other users), features and functionality 
are and will remain the exclusive property of the Company and its licensors. The Service is protected by copyright, 
trademark, and other laws of both the Country and foreign countries. Our trademarks and trade dress may not be 
used in connection with any product or service without the prior written consent of the Company.'''

In [3]:
#tokenize --> para to sentense 
import nltk
import os

# Optional: clear existing nltk_data (if corrupted)
nltk.data.path.clear()

# Force path to a clean directory
nltk_data_path = os.path.abspath('nltk_data')
nltk.download('punkt_tab', download_dir=nltk_data_path)
nltk.download('wordnet', download_dir=nltk_data_path)
nltk.download('stopwords', download_dir=nltk_data_path)
nltk.data.path.append(nltk_data_path)

[nltk_data] Downloading package punkt_tab to k:\NLP\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to k:\NLP\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to k:\NLP\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
documents = nltk.sent_tokenize(corpus, language='english')

In [5]:
import os
import nltk
nltk.data.path.append(os.path.abspath('./nltk_data'))

In [6]:
documents

['These are the Terms and Conditions governing the use of this Service and the agreement \nthat operates between You and the Company.',
 'These Terms and Conditions set out the rights and obligations of \nall users regarding the use of the Service.',
 'Your access to and use of the Service is conditioned on Your \nacceptance of and compliance with these Terms and Conditions.',
 'These Terms and Conditions apply to all visitors, \nusers and others who access or use the Service.',
 'By accessing or using the Service You agree to be bound by these \nTerms and Conditions.',
 'If You disagree with any part of these Terms and Conditions then You may not access the Service.',
 'You represent that you are over the age of 18.',
 'The Company does not permit those under 18 to use the Service.',
 'Your access to and use of the Service is also conditioned on Your acceptance of and compliance with the Privacy Policy \nof the Company.',
 'Our Privacy Policy describes Our policies and procedures on t

In [7]:
#stemming - reducing words to its base root word
stemmer = PorterStemmer()

In [8]:
#Lemmetization -- more accuracy breaking of sentences into base root words
from nltk.stem import WordNetLemmatizer

lemme = WordNetLemmatizer()

In [9]:
stemmer.stem('goes')

'goe'

In [10]:
lemme.lemmatize('goes')

'go'

In [11]:
import re
cleaned_corpus =[]
for i in range(len(documents)):
    review = re.sub('[^a-zA-Z]',' ', documents[i])
    review = review.lower()
    cleaned_corpus.append(review)

In [12]:
cleaned_corpus

['these are the terms and conditions governing the use of this service and the agreement  that operates between you and the company ',
 'these terms and conditions set out the rights and obligations of  all users regarding the use of the service ',
 'your access to and use of the service is conditioned on your  acceptance of and compliance with these terms and conditions ',
 'these terms and conditions apply to all visitors   users and others who access or use the service ',
 'by accessing or using the service you agree to be bound by these  terms and conditions ',
 'if you disagree with any part of these terms and conditions then you may not access the service ',
 'you represent that you are over the age of    ',
 'the company does not permit those under    to use the service ',
 'your access to and use of the service is also conditioned on your acceptance of and compliance with the privacy policy  of the company ',
 'our privacy policy describes our policies and procedures on the col

In [13]:
for i in cleaned_corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(stemmer.stem(word))

term
condit
govern
use
servic
agreement
oper
compani
term
condit
set
right
oblig
user
regard
use
servic
access
use
servic
condit
accept
complianc
term
condit
term
condit
appli
visitor
user
other
access
use
servic
access
use
servic
agre
bound
term
condit
disagre
part
term
condit
may
access
servic
repres
age
compani
permit
use
servic
access
use
servic
also
condit
accept
complianc
privaci
polici
compani
privaci
polici
describ
polici
procedur
collect
use
disclosur
person
inform
use
applic
websit
tell
privaci
right
law
protect
pleas
read
privaci
polici
care
use
servic
intellectu
properti
servic
origin
content
exclud
content
provid
user
featur
function
remain
exclus
properti
compani
licensor
servic
protect
copyright
trademark
law
countri
foreign
countri
trademark
trade
dress
may
use
connect
product
servic
without
prior
written
consent
compani


In [14]:
for i in cleaned_corpus:
    words = nltk.word_tokenize(i)
    for word in words:
        if word not in set(stopwords.words('english')):
            print(lemme.lemmatize(word))

term
condition
governing
use
service
agreement
operates
company
term
condition
set
right
obligation
user
regarding
use
service
access
use
service
conditioned
acceptance
compliance
term
condition
term
condition
apply
visitor
user
others
access
use
service
accessing
using
service
agree
bound
term
condition
disagree
part
term
condition
may
access
service
represent
age
company
permit
use
service
access
use
service
also
conditioned
acceptance
compliance
privacy
policy
company
privacy
policy
describes
policy
procedure
collection
use
disclosure
personal
information
use
application
website
tell
privacy
right
law
protects
please
read
privacy
policy
carefully
using
service
intellectual
property
service
original
content
excluding
content
provided
user
feature
functionality
remain
exclusive
property
company
licensors
service
protected
copyright
trademark
law
country
foreign
country
trademark
trade
dress
may
used
connection
product
service
without
prior
written
consent
company


In [16]:
#stopwrod lemmetize 
for i in range(len(documents)):
    review = re.sub('[^A-Za-z]',' ', documents[i])
    review = review.lower()
    review = review.split()
    review = [lemme.lemmatize(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    cleaned_corpus.append(review)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary= True, ngram_range=(2,3), max_features= 2)

In [18]:
x = cv.fit_transform(cleaned_corpus)

In [19]:
cv.vocabulary_

{'the service': np.int64(0), 'use service': np.int64(1)}

In [20]:


'''Notes : REMOVE STOPWORDS
TURN PARAS INTO SENTENCE 
GET FREQUENCY OF VOCABS'''

'Notes : REMOVE STOPWORDS\nTURN PARAS INTO SENTENCE \nGET FREQUENCY OF VOCABS'

In [21]:
'''TFIDF - term freq inverse document freq
tf = no of repetitions of a word in a sentence/ total words in a sentence
idf = log(no of sentence/ no of sentence with that specific word) 

tf*idf

'''

'TFIDF - term freq inverse document freq\ntf = no of repetitions of a word in a sentence/ total words in a sentence\nidf = log(no of sentence/ no of sentence with that specific word) \n\ntf*idf\n\n'

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer()

y = cv.fit_transform(cleaned_corpus)

In [None]:
'''experimenting with a data set'''

import pandas as pd 

df = pd.read_csv('data/clean_jobs.csv')

In [24]:
df.head(5)

Unnamed: 0,id,title,company,location,link,source,date_posted,work_type,employment_type,description
0,1,Data Analyst,Meta,"New York, NY",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
1,2,Data Analyst,Meta,"San Francisco, CA",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
2,3,Data Analyst,Meta,"Los Angeles, CA",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
3,4,Data Analyst,Meta,"Washington, DC",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-14,,,The Social Measurement team is a growing team ...
4,5,Data Analyst II,Pinterest,"Chicago, IL",https://www.linkedin.com/jobs/view/data-analys...,LinkedIn,2025-04-16,,,About Pinterest\n\nMillions of people around t...


In [25]:
df.describe

<bound method NDFrame.describe of       id                          title                 company  \
0      1                   Data Analyst                    Meta   
1      2                   Data Analyst                    Meta   
2      3                   Data Analyst                    Meta   
3      4                   Data Analyst                    Meta   
4      5                Data Analyst II               Pinterest   
..   ...                            ...                     ...   
322  691  Data Engineer- Python Pyspark                 Virtusa   
323  692     Data Engineer with Pyspark               Cognizant   
324  693                  Data Engineer  Mercedes-Benz Malaysia   
325  740                Data Engineer I                IntePros   
326  741                  Data Engineer               Snap Inc.   

                              location  \
0                         New York, NY   
1                    San Francisco, CA   
2                      Los Angeles,

In [26]:
corpus = df['description']
corpus.head()

0    The Social Measurement team is a growing team ...
1    The Social Measurement team is a growing team ...
2    The Social Measurement team is a growing team ...
3    The Social Measurement team is a growing team ...
4    About Pinterest\n\nMillions of people around t...
Name: description, dtype: object

In [27]:

def clean_text(text):
    text = re.sub('[^A-Za-z1-9]',' ',text)
    text = text.lower().split()
    text = [lemme.lemmatize(word) for word in text if word not in set(stopwords.words('english'))]
    return ' '.join(text)



In [28]:
cleaned_corpus = corpus.apply(clean_text)

In [29]:
cleaned_corpus[:5]

0    social measurement team growing team high visi...
1    social measurement team growing team high visi...
2    social measurement team growing team high visi...
3    social measurement team growing team high visi...
4    pinterest million people around world come pla...
Name: description, dtype: object

BOW and TFIDF dont capture semantic meaning only capture the frequency of occurance

Sparse Matrix created by BOW and TFIDF in simple words huge matrices like the one below this isnt pratical for huge datasets 
my dataset is in Megabytes and this matrix is for one column of that dataset then imagine terabytes of data

In [32]:
y[0].toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.23493031, 0.        , 0.        , 0.40587926,
        0.        , 0.        , 0.        , 0.21425372, 0.        ,
        0.        , 0.26407234, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.14154686, 0.        , 0.        ,
        0.        , 0.17403242, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.23493031,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.14154686,
        0.        , 0.23493031, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.  