In [1]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import prepare

In [10]:
codeup_df = prepare.prep_clean_codeup_data()
codeup_df.head()

Unnamed: 0,title,link,original,clean,stemmed,lemmatized
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...,happy announce vet tec benefits available used...,happi announc vet tec benefit avail use campu ...,happy announce vet tec benefit available used ...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...,happy announce dallas campus reopened better y...,happi announc dalla campu reopen better yet ne...,happy announce dallas campus reopened better y...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...,looking best data science bootcamp world best ...,look best data scienc bootcamp world best code...,looking best data science bootcamp world best ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...,podcast enthusiasts pleased announce release c...,podcast enthusiast pleas announc releas codeup...,podcast enthusiast pleased announce release co...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...


In [16]:
# combine all words in lemmatized column
words = pd.Series(' '.join(codeup_df.lemmatized).split())
words

0             happy
1          announce
2               vet
3               tec
4           benefit
           ...     
5016          first
5017          apply
5018          apply
5019          pride
5020    scholarship
Length: 5021, dtype: object

In [17]:
(pd.DataFrame({'raw_count': words.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

Unnamed: 0,raw_count,frequency,augmented_frequency
tech,65,0.012946,1.000000
codeup,63,0.012547,0.969231
program,62,0.012348,0.953846
career,56,0.011153,0.861538
system,42,0.008365,0.646154
...,...,...,...
shift,1,0.000199,0.015385
vary,1,0.000199,0.015385
plenty,1,0.000199,0.015385
mitchell,1,0.000199,0.015385


In [8]:
news_df = prepare.prep_clean_news_data()
news_df.head()

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,business,rbi cancelled licence maharashtrabased indepen...,rbi cancel licenc maharashtrabas independ coop...,rbi cancelled licence maharashtrabased indepen...
1,Boost to EVs a big step: Windmill Capital,"Increased use of EVs in public transport, spec...",business,increased use evs public transport special mob...,increas use ev public transport special mobil ...,increased use ev public transport special mobi...
2,Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,business,facebook ' s parent meta ' s shares plunged 27...,facebook ' s parent meta ' s share plung 27 th...,facebook ' s parent meta ' s share plunged 27 ...
3,"Tesla co-worker used N-word, threw a hot tool ...",A former Tesla worker has filed a lawsuit agai...,business,former tesla worker filed lawsuit company fail...,former tesla worker file lawsuit compani fail ...,former tesla worker filed lawsuit company fail...
4,Mark Zuckerberg loses $31 bn in one of the big...,Meta CEO Mark Zuckerberg's wealth dropped by $...,business,meta ceo mark zuckerberg ' s wealth dropped 31...,meta ceo mark zuckerberg ' s wealth drop 31 bi...,meta ceo mark zuckerberg ' s wealth dropped 31...


In [18]:
# combine all lemmatized words by category
entertainment_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'entertainment'].astype(str)).split())
business_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'business'].astype(str)).split())
technology_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'technology'].astype(str)).split())
sports_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'sports'].astype(str)).split())
all_words = pd.Series(' '.join(news_df.lemmatized).split())
entertainment_words

0       picture
1            tv
2       actress
3      karishma
4         tanna
         ...   
917       wrong
918         way
919    approach
920        film
921       added
Length: 922, dtype: object

In [23]:
# our 3 example documents
documents = {
    'news': 'Codeup announced last thursday that they just launched a new data science program. It is 18 weeks long.',
    'description': 'Codeup\'s data science program teaches hands on skills using Python and pandas.',
    'context': 'Codeup\'s data science program was created in response to a percieved lack of data science talent, and growing demand.'
}
pprint(documents)

print('\nCleaning and lemmatizing...\n')

documents = {topic: prepare.lemmatize(prepare.basic_clean(documents[topic])) for topic in documents}
pprint(documents)

{'context': "Codeup's data science program was created in response to a "
            'percieved lack of data science talent, and growing demand.',
 'description': "Codeup's data science program teaches hands on skills using "
                'Python and pandas.',
 'news': 'Codeup announced last thursday that they just launched a new data '
         'science program. It is 18 weeks long.'}

Cleaning and lemmatizing...

{'context': "codeup's data science program created response percieved lack "
            'data science talent growing demand',
 'description': "codeup's data science program teach hand skill using python "
                'panda',
 'news': 'codeup announced last thursday launched new data science program 18 '
         'week long'}


In [30]:
def idf(word):
    n_occurences = sum([1 for words in entertainment_words if word in words])
    return len(entertainment_words) / n_occurences

In [39]:
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))

Unnamed: 0_level_0,idf
word,Unnamed: 1_level_1
teach,3.0
announced,3.0
growing,3.0
talent,3.0
lack,3.0


In [32]:
unique_words

array(['codeup', 'announced', 'last', 'thursday', 'launched', 'new',
       'data', 'science', 'program', '18', 'week', 'long', "codeup's",
       'teach', 'hand', 'skill', 'using', 'python', 'panda', 'created',
       'response', 'percieved', 'lack', 'talent', 'growing', 'demand'],
      dtype=object)

In [20]:
e_unique_words = entertainment_words.unique
e_unique_words

<bound method Series.unique of 0       picture
1            tv
2       actress
3      karishma
4         tanna
         ...   
917       wrong
918         way
919    approach
920        film
921       added
Length: 922, dtype: object>

In [None]:
(pd.DataFrame(dict(word=e_unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))

In [19]:
entertainment = (pd.DataFrame({'raw_count': entertainment_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
business = (pd.DataFrame({'raw_count': business_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
technology = (pd.DataFrame({'raw_count': technology_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
sports = (pd.DataFrame({'raw_count': sports_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [None]:
# create a function that calculates inverse document frequency for each word
def idf(word):
    n_occurences = sum([1 for doc in documents.values() if word in doc])
    return len(documents) / n_occurences

# Get a list of the unique words
unique_words = pd.Series(' '.join(documents.values()).split()).unique()

# put the unique words into a data frame
(pd.DataFrame(dict(word=unique_words))
 # calculate the idf for each word
 .assign(idf=lambda df: df.word.apply(idf))
 # sort the data for presentation purposes
 .set_index('word')
 .sort_values(by='idf', ascending=False)
 .head(5))