In [39]:
import pandas as pd
import sklearn as sk
import math

In [40]:
first_sentence = "Data Science is the good job of the 21st century"
second_sentence = "machine learning is the key for data science"

#split so each word have their own string
first_sentence = first_sentence.split(" ")
second_sentence = second_sentence.split(" ")

#join them to remove common duplicate words
total= set(first_sentence).union(set(second_sentence))

print(total)

{'good', '21st', 'century', 'of', 'for', 'is', 'machine', 'Data', 'Science', 'key', 'science', 'data', 'the', 'job', 'learning'}


In [45]:
wordDictA = dict.fromkeys(total, 0) 
wordDictB = dict.fromkeys(total, 0)



for word in first_sentence:
    wordDictA[word]+=1
    
for word in second_sentence:
    wordDictB[word]+=1
    
print(wordDictA)
print(wordDictB)

{'good': 1, '21st': 1, 'century': 1, 'of': 1, 'for': 0, 'is': 1, 'machine': 0, 'Data': 1, 'Science': 1, 'key': 0, 'science': 0, 'data': 0, 'the': 2, 'job': 1, 'learning': 0}
{'good': 0, '21st': 0, 'century': 0, 'of': 0, 'for': 1, 'is': 1, 'machine': 1, 'Data': 0, 'Science': 0, 'key': 1, 'science': 1, 'data': 1, 'the': 1, 'job': 0, 'learning': 1}


In [46]:
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,good,21st,century,of,for,is,machine,Data,Science,key,science,data,the,job,learning
0,1,1,1,1,0,1,0,1,1,0,0,0,2,1,0
1,0,0,0,0,1,1,1,0,0,1,1,1,1,0,1


In [63]:
def computeTF(wordDict, doc):   
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

#running our sentences through the tf function:
tfFirst = computeTF(wordDictA, first_sentence)
tfSecond = computeTF(wordDictB, second_sentence)

#Converting to dataframe for visualization
tf = pd.DataFrame([tfFirst, tfSecond])
tf

Unnamed: 0,good,21st,century,of,for,is,machine,Data,Science,key,science,data,the,job,learning
0,0.1,0.1,0.1,0.1,0.0,0.1,0.0,0.1,0.1,0.0,0.0,0.0,0.2,0.1,0.0
1,0.0,0.0,0.0,0.0,0.125,0.125,0.125,0.0,0.0,0.125,0.125,0.125,0.125,0.0,0.125


In [68]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

filtered_sentence = [w for w in wordDictA if not w in stop_words]
print(filtered_sentence)

['good', '21st', 'century', 'machine', 'Data', 'Science', 'key', 'science', 'data', 'job', 'learning']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vikki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
def computeIDF(docList):
    idfDict = {}
    N = len(docList)
    
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))
        
    return(idfDict)
#inputing our sentences in the log file
idfs = computeIDF([wordDictA, wordDictB])
idfs

{'good': 0.3010299956639812,
 '21st': 0.3010299956639812,
 'century': 0.3010299956639812,
 'of': 0.3010299956639812,
 'for': 0.3010299956639812,
 'is': 0.3010299956639812,
 'machine': 0.3010299956639812,
 'Data': 0.3010299956639812,
 'Science': 0.3010299956639812,
 'key': 0.3010299956639812,
 'science': 0.3010299956639812,
 'data': 0.3010299956639812,
 'the': 0.3010299956639812,
 'job': 0.3010299956639812,
 'learning': 0.3010299956639812}

In [73]:
def computeTFIDF(tfBow, idfs):
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)


#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)

#putting it in a dataframe
idf= pd.DataFrame([idfFirst, idfSecond])
print(idf)

       good      21st   century        of       for        is   machine  \
0  0.030103  0.030103  0.030103  0.030103  0.000000  0.030103  0.000000   
1  0.000000  0.000000  0.000000  0.000000  0.037629  0.037629  0.037629   

       Data   Science       key   science      data       the       job  \
0  0.030103  0.030103  0.000000  0.000000  0.000000  0.060206  0.030103   
1  0.000000  0.000000  0.037629  0.037629  0.037629  0.037629  0.000000   

   learning  
0  0.000000  
1  0.037629  


# TF-IDF using skleran library


In [None]:
#first step is to import the library
from sklearn.feature_extraction.text import TfidfVectorizer

#for the sentence, make sure all words are lowercase or you will run #into error. for simplicity, I just made the same sentence all #lowercase
firstV= "Data Science is the sexiest job of the 21st century"
secondV= "machine learning is the key for data science"

#calling the TfidfVectorizer
vectorize= TfidfVectorizer()

#fitting the model and passing our sentences right away:
response= vectorize.fit_transform([firstV, secondV])