In [192]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import prepare

# Exercise 1

Take the work we did in the lessons further:

In [129]:
# use function to pull in prepped codeup blog data
codeup_df = prepare.prep_clean_codeup_data()
codeup_df.head()

Unnamed: 0,title,link,original,clean,stemmed,lemmatized
0,VET TEC Funding Now Available For Dallas Veterans,https://codeup.com/codeup-news/vet-tec-funding...,We are so happy to announce that VET TEC benef...,happy announce vet tec benefits available used...,happi announc vet tec benefit avail use campu ...,happy announce vet tec benefit available used ...
1,Dallas Campus Re-opens With New Grant Partner,https://codeup.com/codeup-news/dallas-campus-r...,We are happy to announce that our Dallas campu...,happy announce dallas campus reopened better y...,happi announc dalla campu reopen better yet ne...,happy announce dallas campus reopened better y...
2,Is Codeup the Best Bootcamp in San Antonio…or ...,https://codeup.com/codeup-news/is-codeup-the-b...,Looking for the best data science bootcamp in ...,looking best data science bootcamp world best ...,look best data scienc bootcamp world best code...,looking best data science bootcamp world best ...
3,Codeup Launches First Podcast: Hire Tech,https://codeup.com/codeup-news/codeup-launches...,Any podcast enthusiasts out there? We are plea...,podcast enthusiasts pleased announce release c...,podcast enthusiast pleas announc releas codeup...,podcast enthusiast pleased announce release co...
4,Codeup Start Dates for March 2022,https://codeup.com/codeup-news/codeup-start-da...,As we approach the end of January we wanted to...,approach end january wanted look forward next ...,approach end januari want look forward next st...,approach end january wanted look forward next ...


In [130]:
# combine all words in lemmatized column
words = pd.Series(' '.join(codeup_df.lemmatized).split())
words

0             happy
1          announce
2               vet
3               tec
4           benefit
           ...     
5016          first
5017          apply
5018          apply
5019          pride
5020    scholarship
Length: 5021, dtype: object

In [131]:
# calculate term frequency for blog data
(pd.DataFrame({'raw_count': words.value_counts()})
 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

Unnamed: 0,raw_count,frequency,augmented_frequency
tech,65,0.012946,1.000000
codeup,63,0.012547,0.969231
program,62,0.012348,0.953846
career,56,0.011153,0.861538
system,42,0.008365,0.646154
...,...,...,...
hundred,1,0.000199,0.015385
generation,1,0.000199,0.015385
prework,1,0.000199,0.015385
skillful,1,0.000199,0.015385


In [132]:
# create vectorizer object & apply
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(codeup_df.lemmatized)
tfidfs

<22x1517 sparse matrix of type '<class 'numpy.float64'>'
	with 3261 stored elements in Compressed Sparse Row format>

In [133]:
# check out feature extraction results
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names()).head()

Unnamed: 0,01,10,100,101,11222,12,1218,13,13week,13weeks,...,wwwworkintexascom,year,yes,yet,youd,youll,youre,youve,zero,zoom
0,0.0,0.0,0.0,0.0,0.038066,0.0,0.0,0.0,0.0,0.0,...,0.0,0.016323,0.0,0.0,0.0,0.027445,0.04356,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.100994,0.086617,0.0,0.084006,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.049849,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.021376,0.0,0.0,0.045127,0.0,0.085566,0.038472,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.077844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.080909,0.0,0.080909,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.110174,0.0,0.0


## Moving to news article data so we can try to predict category

In [134]:
# use function to pull in prepped news data
news_df = prepare.prep_clean_news_data()
news_df.head()

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,RBI cancels licence of Maha-based Independence...,RBI has cancelled licence of Maharashtra-based...,business,rbi cancelled licence maharashtrabased indepen...,rbi cancel licenc maharashtrabas independ coop...,rbi cancelled licence maharashtrabased indepen...
1,Boost to EVs a big step: Windmill Capital,"Increased use of EVs in public transport, spec...",business,increased use evs public transport special mob...,increas use ev public transport special mobil ...,increased use ev public transport special mobi...
2,Facebook parent Meta's $230-billion wipeout bi...,Facebook's parent Meta's shares plunged 27% an...,business,facebook ' s parent meta ' s shares plunged 27...,facebook ' s parent meta ' s share plung 27 th...,facebook ' s parent meta ' s share plunged 27 ...
3,"Tesla co-worker used N-word, threw a hot tool ...",A former Tesla worker has filed a lawsuit agai...,business,former tesla worker filed lawsuit company fail...,former tesla worker file lawsuit compani fail ...,former tesla worker filed lawsuit company fail...
4,Mark Zuckerberg loses $31 bn in one of the big...,Meta CEO Mark Zuckerberg's wealth dropped by $...,business,meta ceo mark zuckerberg ' s wealth dropped 31...,meta ceo mark zuckerberg ' s wealth drop 31 bi...,meta ceo mark zuckerberg ' s wealth dropped 31...


In [135]:
# combine all lemmatized words by category
entertainment_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'entertainment'].astype(str)).split())
business_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'business'].astype(str)).split())
technology_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'technology'].astype(str)).split())
sports_words = pd.Series(' '.join(news_df.lemmatized[news_df.category == 'sports'].astype(str)).split())
all_words = pd.Series(' '.join(news_df.lemmatized).split())

In [136]:
# calculate term frequency by category for news data
entertainment_tf = (pd.DataFrame({'raw_count': entertainment_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
business_tf = (pd.DataFrame({'raw_count': business_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
technology_tf = (pd.DataFrame({'raw_count': technology_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))
sports_tf = (pd.DataFrame({'raw_count': sports_words.value_counts()})
                 .assign(frequency=lambda df: df.raw_count / df.raw_count.sum())
                 .assign(augmented_frequency=lambda df: df.frequency / df.frequency.max()))

In [137]:
# check top entertainment term frequencies
entertainment_tf.head(10)

Unnamed: 0,raw_count,frequency,augmented_frequency
',57,0.061822,1.0
said,16,0.017354,0.280702
added,14,0.015184,0.245614
actor,13,0.0141,0.22807
s,12,0.013015,0.210526
film,11,0.011931,0.192982
actress,11,0.011931,0.192982
wrote,7,0.007592,0.122807
asked,6,0.006508,0.105263
instagram,5,0.005423,0.087719


In [138]:
# check top business term frequencies
business_tf.head(10)

Unnamed: 0,raw_count,frequency,augmented_frequency
',42,0.040816,1.0
s,31,0.030126,0.738095
said,21,0.020408,0.5
company,13,0.012634,0.309524
billion,11,0.01069,0.261905
facebook,8,0.007775,0.190476
fell,7,0.006803,0.166667
added,7,0.006803,0.166667
value,6,0.005831,0.142857
india,6,0.005831,0.142857


In [139]:
# check top technology term frequencies
technology_tf.head(10)

Unnamed: 0,raw_count,frequency,augmented_frequency
',43,0.041992,1.0
s,34,0.033203,0.790698
billion,20,0.019531,0.465116
said,15,0.014648,0.348837
facebook,11,0.010742,0.255814
meta,10,0.009766,0.232558
fell,9,0.008789,0.209302
revenue,8,0.007812,0.186047
u,8,0.007812,0.186047
user,7,0.006836,0.162791


In [140]:
# check top sports term frequencies
sports_tf.head(10)

Unnamed: 0,raw_count,frequency,augmented_frequency
',21,0.022082,1.0
said,18,0.018927,0.857143
added,14,0.014721,0.666667
team,11,0.011567,0.52381
s,11,0.011567,0.52381
india,8,0.008412,0.380952
world,8,0.008412,0.380952
coach,6,0.006309,0.285714
positive,6,0.006309,0.285714
match,6,0.006309,0.285714


It appears that business and technology top words used are pretty aligned so it may be difficult to differentiate between these categories using only top words.

In [141]:
# ask for help getting the next two code blocks to work

# def idf(word, cat):
#     n_occurences = news_df[news_df.category == cat].lemmatized.str.contains(word).sum()
#     return len(news_df[news_df.category ==  cat]) / n_occurences

In [142]:
# ask for help with this

# unique_words = entertainment_words.unique()

# a = pd.DataFrame(dict(word=unique_words))
# a = a.assign(idf = lambda x: idf(x.word, 'entertainment'))
# # a = a.assign(idf = lambda df: df.word.apply(idf, args))
# # sort the data for presentation purposes
# # a = a.set_index('word')
# a   
    
# #  .sort_values(by='idf', ascending=False)
# #  .head(5))

In [143]:
# create vectorizer object & apply to entertainment category
tfidf = TfidfVectorizer()
tfidfs = tfidf.fit_transform(news_df.lemmatized[news_df.category ==  'entertainment'])
tfidfs

<25x562 sparse matrix of type '<class 'numpy.float64'>'
	with 733 stored elements in Compressed Sparse Row format>

In [144]:
# check out feature extraction results from entertainment category
pd.DataFrame(tfidfs.todense(), columns=tfidf.get_feature_names()).head()

Unnamed: 0,100,10th,11,15,19,2007,2020,2022,20yearold,21,...,would,wrapup,written,wrong,wrote,year,yet,yo,youi,zapkeycom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.220266,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.123156,0.0,0.152882,0.0,0.099157,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.14501,0.0,0.0,0.14501,0.0,0.128517,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.14501
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.145624,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
# fit and transform tfidf object on news_df articles
X = tfidf.fit_transform(news_df.lemmatized)
y = news_df.category

# split the data into X train and test, and y train and test, stratifying on category
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [148]:
# fit the logistic regression model
lm = LogisticRegression().fit(X_train, y_train)

# make predictions
train['lm_predicted'] = lm.predict(X_train)
test['lm_predicted'] = lm.predict(X_test)

In [153]:
# evaluate results for train dataset
print('Accuracy: {:.2%}'.format(accuracy_score(train.actual, train.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.lm_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.lm_predicted))

Accuracy: 91.25%
---
Confusion Matrix
actual         business  entertainment  sports  technology
lm_predicted                                              
business             15              0       0           2
entertainment         0             20       0           0
sports                0              0      20           0
technology            5              0       0          18
---
               precision    recall  f1-score   support

     business       0.88      0.75      0.81        20
entertainment       1.00      1.00      1.00        20
       sports       1.00      1.00      1.00        20
   technology       0.78      0.90      0.84        20

     accuracy                           0.91        80
    macro avg       0.92      0.91      0.91        80
 weighted avg       0.92      0.91      0.91        80



In [154]:
# evaluate results for test dataset
print('Accuracy: {:.2%}'.format(accuracy_score(test.actual, test.lm_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.lm_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.lm_predicted))

Accuracy: 60.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
lm_predicted                                              
business              2              0       0           4
entertainment         2              4       0           0
sports                0              1       5           0
technology            1              0       0           1
---
               precision    recall  f1-score   support

     business       0.33      0.40      0.36         5
entertainment       0.67      0.80      0.73         5
       sports       0.83      1.00      0.91         5
   technology       0.50      0.20      0.29         5

     accuracy                           0.60        20
    macro avg       0.58      0.60      0.57        20
 weighted avg       0.58      0.60      0.57        20



### Accuracy for the train dataset was pretty high at 93% but there is a pretty significant drop for the test dataset to 70%. Most of the misclassification looks to be from the business and technology categories which is not that surprising since they had many of the same top words due to the recent news about Meta/Facebook.

# Exercise 1a

What other types of models (i.e. different classifcation algorithms) could you use?

### I will try a random forest model next

In [188]:
# create the random forest model object setting min samples per leaf and max depth
rf = RandomForestClassifier(class_weight=None, 
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=369)
# fit the random forest model
rf = rf.fit(X_train, y_train)

# make predictions
train['rf_predicted'] = rf.predict(X_train)
test['rf_predicted'] = rf.predict(X_test)

In [189]:
# evaluate results for train dataset
print('Accuracy of random forest classifier on training set: {:.2%}'
      .format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Accuracy of random forest classifier on training set: 86.25%
---
Confusion Matrix
actual         business  entertainment  sports  technology
rf_predicted                                              
business             15              0       0           4
entertainment         0             19       0           0
sports                2              1      20           1
technology            3              0       0          15
---
               precision    recall  f1-score   support

     business       0.79      0.75      0.77        20
entertainment       1.00      0.95      0.97        20
       sports       0.83      1.00      0.91        20
   technology       0.83      0.75      0.79        20

     accuracy                           0.86        80
    macro avg       0.86      0.86      0.86        80
 weighted avg       0.86      0.86      0.86        80



In [190]:
# evaluate results for test dataset
print('Accuracy of random forest classifier on test set: {:.2%}'
      .format(accuracy_score(test.actual, test.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.rf_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.rf_predicted))

Accuracy of random forest classifier on test set: 55.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
rf_predicted                                              
business              1              0       0           3
entertainment         1              5       1           1
sports                2              0       4           0
technology            1              0       0           1
---
               precision    recall  f1-score   support

     business       0.25      0.20      0.22         5
entertainment       0.62      1.00      0.77         5
       sports       0.67      0.80      0.73         5
   technology       0.50      0.20      0.29         5

     accuracy                           0.55        20
    macro avg       0.51      0.55      0.50        20
 weighted avg       0.51      0.55      0.50        20



### With an in-sample accuracy of 86% and an out-of-sample accuracy of 55% by the random forest model , the linear regression model produces the best results for this classification.

# Exercise 1b

How do the models compare when trained on term frequency data alone, instead of TF-IDF values?

### Next I will try the random forest model with bag of words rather than TF-IDF.

In [193]:
# use count vectorizer object to create bag of words by fit and transform news_df data, assign category column to y variable
cv = CountVectorizer()
X_bag_of_words = cv.fit_transform(news_df.lemmatized)
y = news_df.category

In [195]:
# create dataframe of words
words = pd.DataFrame(X_bag_of_words.todense(), columns=cv.get_feature_names())
words

Unnamed: 0,10,100,1000,107,10th,11,11th,12,120000,1206,...,yuvraj,yuzvendra,zalmi,zaman,zapkeycom,zero,zipping,zone,zucker,zuckerberg
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [196]:
# split the data into X train and test, and y train and test, stratifying on category
X_train, X_test, y_train, y_test = train_test_split(X_bag_of_words, y, stratify=y, test_size=.2)

train = pd.DataFrame(dict(actual=y_train))
test = pd.DataFrame(dict(actual=y_test))

In [197]:
# create the random forest model object setting min samples per leaf and max depth
rf = RandomForestClassifier(class_weight=None, 
                            min_samples_leaf=3,
                            n_estimators=100,
                            max_depth=5, 
                            random_state=369)
# fit the random forest model
rf = rf.fit(X_train, y_train)

# make predictions
train['rf_predicted'] = rf.predict(X_train)
test['rf_predicted'] = rf.predict(X_test)

In [198]:
# evaluate results for train dataset
print('Accuracy of random forest classifier on training set: {:.2%}'
      .format(accuracy_score(train.actual, train.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(train.rf_predicted, train.actual))
print('---')
print(classification_report(train.actual, train.rf_predicted))

Accuracy of random forest classifier on training set: 85.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
rf_predicted                                              
business             13              1       0           3
entertainment         1             18       0           0
sports                0              1      20           0
technology            6              0       0          17
---
               precision    recall  f1-score   support

     business       0.76      0.65      0.70        20
entertainment       0.95      0.90      0.92        20
       sports       0.95      1.00      0.98        20
   technology       0.74      0.85      0.79        20

     accuracy                           0.85        80
    macro avg       0.85      0.85      0.85        80
 weighted avg       0.85      0.85      0.85        80



In [199]:
# evaluate results for test dataset
print('Accuracy of random forest classifier on test set: {:.2%}'
      .format(accuracy_score(test.actual, test.rf_predicted)))
print('---')
print('Confusion Matrix')
print(pd.crosstab(test.rf_predicted, test.actual))
print('---')
print(classification_report(test.actual, test.rf_predicted))

Accuracy of random forest classifier on test set: 60.00%
---
Confusion Matrix
actual         business  entertainment  sports  technology
rf_predicted                                              
business              3              1       1           2
entertainment         0              4       1           0
sports                0              0       3           1
technology            2              0       0           2
---
               precision    recall  f1-score   support

     business       0.43      0.60      0.50         5
entertainment       0.80      0.80      0.80         5
       sports       0.75      0.60      0.67         5
   technology       0.50      0.40      0.44         5

     accuracy                           0.60        20
    macro avg       0.62      0.60      0.60        20
 weighted avg       0.62      0.60      0.60        20



### The random forest model does perform better on bag of words than TF-IDF