Modeling exercises for NLP. What other types of algorithms can be used? How do models compare when training on term frequency data alone instead of TF-IDF values alone.

In [120]:
import acquire
import prepare

from pprint import pprint
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB

from env import get_db_url

In [2]:
df = pd.read_sql("SELECT * FROM spam", get_db_url("spam_db"))

In [3]:
df.head()
train, validate, test = prepare.train_validate_test_split(df, target = 'label')

In [4]:
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train.text)
X_validate = tfidf.transform(validate.text)
X_test = tfidf.transform(test.text)
y_train = train.label
y_validate = validate.label
y_test = test.label

train_results_tfidf=pd.DataFrame(dict(actual = y_train))
validate_results_tfidf = pd.DataFrame(dict(actual = y_validate))
test_results_tfidf = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_tfidf['predicted_log_reg'] = lm.predict(X_train)
validate_results_tfidf['predicted_log_reg'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_log_reg)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_tfidf.predicted_log_reg, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_log_reg))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_log_reg)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_tfidf.predicted_log_reg, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_log_reg))


Accuracy: 96.95%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2700    94
spam                  1   324
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      2701
        spam       1.00      0.78      0.87       418

    accuracy                           0.97      3119
   macro avg       0.98      0.89      0.93      3119
weighted avg       0.97      0.97      0.97      3119

Accuracy: 95.67%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1158    58
spam                  0   122
---
              precision    recall  f1-score   support

         ham       0.95      1.00      0.98      1158
        spam       1.00      0.68      0.81       180

    accuracy                           0.96      1338
   macro avg       0.98      0.84      0.89      1338
weighted avg       0.96      0.96      0.95      1338



Great accuracy here is misleading - the model never actually predicts spam! This means the model would send every message to the inbox which is not very useful but is better than sending ham to the spam box. This is likely happening because the data set is quite unbalanced so the model has few chances to learn from a spam instance. We'll optimize for precision and be ok being a little annoyed by the spam that makes it in the inbox.

Will try using a Random Forest Classifier with the tf-idf scoring

In [5]:
rf = RandomForestClassifier(max_depth = 40).fit(X_train, y_train)

train_results_tfidf['predicted_rf'] = rf.predict(X_train)
validate_results_tfidf['predicted_rf'] = rf.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

In [6]:
print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_rf)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_tfidf.predicted_rf, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_rf))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_rf)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_tfidf.predicted_rf, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_rf))



Accuracy: 99.07%
---
Train Confusion Matrix
actual         ham  spam
predicted_rf            
ham           2701    29
spam             0   389
---
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      2701
        spam       1.00      0.93      0.96       418

    accuracy                           0.99      3119
   macro avg       0.99      0.97      0.98      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 96.49%
---
Validate Confusion Matrix
actual         ham  spam
predicted_rf            
ham           1158    47
spam             0   133
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1158
        spam       1.00      0.74      0.85       180

    accuracy                           0.96      1338
   macro avg       0.98      0.87      0.91      1338
weighted avg       0.97      0.96      0.96      1338



Random Forest with max depth of 40 does fairly well accuracy wise and precision is 1 once again.

Will try using count vectorizer data alone

In [7]:
cv = CountVectorizer()

X_train = cv.fit_transform(train.text)
X_validate = cv.transform(validate.text)
X_test = cv.transform(test.text)
y_train = train.label
y_validate = validate.label
y_test = test.label

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_log_reg)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_cv.predicted_log_reg, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_log_reg))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_log_reg)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_cv.predicted_log_reg, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_log_reg))



Accuracy: 99.71%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2701     9
spam                  0   409
---
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      2701
        spam       1.00      0.98      0.99       418

    accuracy                           1.00      3119
   macro avg       1.00      0.99      0.99      3119
weighted avg       1.00      1.00      1.00      3119

Accuracy: 97.38%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1157    34
spam                  1   146
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1158
        spam       0.99      0.81      0.89       180

    accuracy                           0.97      1338
   macro avg       0.98      0.91      0.94      1338
weighted avg       0.97      0.97      0.97      1338



Better performance for all metrics! Will try Random Forest with Count Vectorizer

In [8]:
cv = CountVectorizer()

X_train = cv.fit_transform(train.text)
X_validate = cv.transform(validate.text)
X_test = cv.transform(test.text)
y_train = train.label
y_validate = validate.label
y_test = test.label

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

rf = RandomForestClassifier(max_depth=40).fit(X_train, y_train)

train_results_cv['predicted_rf'] = rf.predict(X_train)
validate_results_cv['predicted_rf'] = rf.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_rf)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_cv.predicted_rf, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_rf))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_rf)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_cv.predicted_rf, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_rf))




Accuracy: 98.91%
---
Train Confusion Matrix
actual         ham  spam
predicted_rf            
ham           2701    34
spam             0   384
---
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      2701
        spam       1.00      0.92      0.96       418

    accuracy                           0.99      3119
   macro avg       0.99      0.96      0.98      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 96.11%
---
Validate Confusion Matrix
actual         ham  spam
predicted_rf            
ham           1158    52
spam             0   128
---
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1158
        spam       1.00      0.71      0.83       180

    accuracy                           0.96      1338
   macro avg       0.98      0.86      0.90      1338
weighted avg       0.96      0.96      0.96      1338



Not as good performance as the Logistic Regression model. Slightly worse performance than tf-idf data.

Does it make a difference if the data is cleaned/lemmatized first?

In [9]:
clean_and_lem_df = df.copy()
clean_and_lem_df['lem'] = df.text.apply(prepare.basic_clean).apply(prepare.tokenize).apply(prepare.lemmatize).apply(prepare.remove_stopwords,
                                                       extra_words = [],
                                                       exclude_words = [])

In [10]:
df.head()

Unnamed: 0,id,label,text
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [11]:
clean_and_lem_df.head()

Unnamed: 0,id,label,text,lem
0,0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,4,ham,"Nah I don't think he goes to usf, he lives aro...",nah ' think go usf life around though


In [13]:
train, validate, test = prepare.train_validate_test_split(clean_and_lem_df, target = 'label')

In [14]:
cv = CountVectorizer()

X_train = cv.fit_transform(train.lem)
X_validate = cv.transform(validate.lem)
X_test = cv.transform(test.lem)
y_train = train.label
y_validate = validate.label
y_test = test.label

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_log_reg)))
print('---')
print('Train Confusion Matrix')
print(pd.crosstab(train_results_cv.predicted_log_reg, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_log_reg))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_log_reg)))
print('---')
print('Validate Confusion Matrix')
print(pd.crosstab(validate_results_cv.predicted_log_reg, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_log_reg))




Accuracy: 99.36%
---
Train Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                2700    19
spam                  1   399
---
              precision    recall  f1-score   support

         ham       0.99      1.00      1.00      2701
        spam       1.00      0.95      0.98       418

    accuracy                           0.99      3119
   macro avg       1.00      0.98      0.99      3119
weighted avg       0.99      0.99      0.99      3119

Accuracy: 97.01%
---
Validate Confusion Matrix
actual              ham  spam
predicted_log_reg            
ham                1157    39
spam                  1   141
---
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1158
        spam       0.99      0.78      0.88       180

    accuracy                           0.97      1338
   macro avg       0.98      0.89      0.93      1338
weighted avg       0.97      0.97      0.97      1338



Slightly worse performance but still quite good.

# Classification of category with news data

Import news data and try different modeling techniques for determining category of news article.

In [18]:
news = prepare.create_prepared_news_df()
target = 'category'

Importing from csv


In [19]:
train, validate, test = prepare.train_validate_test_split(news, target = 'category')

Simple term frequency with Logistic regression

In [32]:
cv = CountVectorizer()

X_train = cv.fit_transform(train.lemmatized)
X_validate = cv.transform(validate.lemmatized)
X_test = cv.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_cv['predicted_log_reg'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_log_reg)))
# print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_cv.predicted_log_reg, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_log_reg))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_log_reg)))
# print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_cv.predicted_log_reg, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_log_reg))





Accuracy: 95.21%
---
               precision    recall  f1-score   support

   automobile       0.88      1.00      0.93        14
     business       1.00      0.79      0.88        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       0.93      1.00      0.97        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.92      0.86      0.89        14
   technology       0.92      0.85      0.88        13
        world       0.93      0.93      0.93        14

     accuracy                           0.95       167
    macro avg       0.95      0.95      0.95       167
 weighted avg       0.95      0.95      0.95       167

Accuracy: 55.56%
---
               precision    recall  f1-score   support

   automobile      

Pretty horrible job of predicting category of news based on Logistic Regression and count vectorizer. Overfit. Will try with bigrams.

In [30]:
cv = CountVectorizer(ngram_range=(2,2))

X_train = cv.fit_transform(train.lemmatized)
X_validate = cv.transform(validate.lemmatized)
X_test = cv.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = LogisticRegression().fit(X_train, y_train)

train_results_cv['predicted_log_reg_bigrams'] = lm.predict(X_train)
validate_results_cv['predicted_log_reg_bigrams'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_log_reg_bigrams)))
# print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_cv.predicted_log_reg_bigrams, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_log_reg_bigrams))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_log_reg_bigrams)))
# print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_cv.predicted_log_reg_bigrams, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_log_reg_bigrams))






Accuracy: 95.21%
---
               precision    recall  f1-score   support

   automobile       0.93      0.93      0.93        14
     business       0.92      0.86      0.89        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       0.93      1.00      0.97        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.87      0.93      0.90        14
   technology       1.00      0.77      0.87        13
        world       0.93      0.93      0.93        14

     accuracy                           0.95       167
    macro avg       0.95      0.95      0.95       167
 weighted avg       0.95      0.95      0.95       167

Accuracy: 30.56%
---
               precision    recall  f1-score   support

   automobile      

Even worse!!! Will try with naive Bayes without bigrams.

In [28]:
cv = CountVectorizer()

X_train = cv.fit_transform(train.lemmatized)
X_validate = cv.transform(validate.lemmatized)
X_test = cv.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_cv=pd.DataFrame(dict(actual = y_train))
validate_results_cv = pd.DataFrame(dict(actual = y_validate))
test_results_cv = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_cv['predicted_nb'] = lm.predict(X_train)
validate_results_cv['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_cv.actual, train_results_cv.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_cv.predicted_nb, train_results_cv.actual))
print('---')
print(classification_report(train_results_cv.actual, train_results_cv.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_cv.actual, validate_results_cv.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_cv.predicted_nb, validate_results_cv.actual))
print('---')
print(classification_report(validate_results_cv.actual, validate_results_cv.predicted_nb))







Accuracy: 95.21%
---
---
               precision    recall  f1-score   support

   automobile       0.88      1.00      0.93        14
     business       1.00      0.79      0.88        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       1.00      0.93      0.96        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.87      0.93      0.90        14
   technology       1.00      0.77      0.87        13
        world       0.88      1.00      0.93        14

     accuracy                           0.95       167
    macro avg       0.96      0.95      0.95       167
 weighted avg       0.96      0.95      0.95       167

Accuracy: 51.39%
---
---
               precision    recall  f1-score   support

   automobi

Inferior to Logistic regression. Will try with tf-idf.

In [29]:
tfidf = TfidfVectorizer()

X_train = tfidf.fit_transform(train.lemmatized)
X_validate = tfidf.transform(validate.lemmatized)
X_test = tfidf.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_tfidf=pd.DataFrame(dict(actual = y_train))
validate_results_tfidf = pd.DataFrame(dict(actual = y_validate))
test_results_tfidf = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_tfidf.predicted_nb, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_tfidf.predicted_nb, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb))








Accuracy: 95.21%
---
---
               precision    recall  f1-score   support

   automobile       0.88      1.00      0.93        14
     business       0.92      0.86      0.89        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       1.00      0.93      0.96        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.87      0.93      0.90        14
   technology       1.00      0.77      0.87        13
        world       0.93      0.93      0.93        14

     accuracy                           0.95       167
    macro avg       0.95      0.95      0.95       167
 weighted avg       0.95      0.95      0.95       167

Accuracy: 55.56%
---
---
               precision    recall  f1-score   support

   automobi

Not great but slight improvement over term frequency, same performance as logistic regression. Looks like severe overfitting. Will try with bigrams.

In [31]:
tfidf = TfidfVectorizer(ngram_range=(2,2))

X_train = tfidf.fit_transform(train.lemmatized)
X_validate = tfidf.transform(validate.lemmatized)
X_test = tfidf.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_tfidf=pd.DataFrame(dict(actual = y_train))
validate_results_tfidf = pd.DataFrame(dict(actual = y_validate))
test_results_tfidf = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_tfidf.predicted_nb, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_tfidf.predicted_nb, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb))









Accuracy: 95.21%
---
---
               precision    recall  f1-score   support

   automobile       0.88      1.00      0.93        14
     business       1.00      0.79      0.88        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       0.93      1.00      0.97        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.87      0.93      0.90        14
   technology       1.00      0.77      0.87        13
        world       0.93      0.93      0.93        14

     accuracy                           0.95       167
    macro avg       0.96      0.95      0.95       167
 weighted avg       0.96      0.95      0.95       167

Accuracy: 43.06%
---
---
               precision    recall  f1-score   support

   automobi

Bigrams don't help at all. Will combine words with bigrams to see what happens.

In [33]:
tfidf = TfidfVectorizer(ngram_range=(1,2))

X_train = tfidf.fit_transform(train.lemmatized)
X_validate = tfidf.transform(validate.lemmatized)
X_test = tfidf.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_tfidf=pd.DataFrame(dict(actual = y_train))
validate_results_tfidf = pd.DataFrame(dict(actual = y_validate))
test_results_tfidf = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_tfidf.predicted_nb, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_tfidf.predicted_nb, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb))










Accuracy: 95.21%
---
---
               precision    recall  f1-score   support

   automobile       0.88      1.00      0.93        14
     business       0.92      0.86      0.89        14
entertainment       1.00      1.00      1.00        14
        hatke       1.00      1.00      1.00        14
miscellaneous       1.00      0.93      0.96        14
     national       1.00      1.00      1.00        14
     politics       0.93      1.00      0.97        14
      science       0.93      1.00      0.97        14
       sports       1.00      1.00      1.00        14
      startup       0.87      0.93      0.90        14
   technology       1.00      0.77      0.87        13
        world       0.93      0.93      0.93        14

     accuracy                           0.95       167
    macro avg       0.95      0.95      0.95       167
 weighted avg       0.95      0.95      0.95       167

Accuracy: 55.56%
---
---
               precision    recall  f1-score   support

   automobi

Same performance as just words and tf-idf

### Overall performance is pretty awful for classifying between all categories. Severe overfitting from train to validate. Perhaps attempting to classify fewer categories would help.

In [46]:
train, validate, test = prepare.train_validate_test_split(news[news.category.isin(['sports','world','business'])], target = 'category')

In [47]:
tfidf = TfidfVectorizer(ngram_range=(2,2))

X_train = tfidf.fit_transform(train.lemmatized)
X_validate = tfidf.transform(validate.lemmatized)
X_test = tfidf.transform(test.lemmatized)
y_train = train[target]
y_validate = validate[target]
y_test = test[target]

train_results_tfidf=pd.DataFrame(dict(actual = y_train))
validate_results_tfidf = pd.DataFrame(dict(actual = y_validate))
test_results_tfidf = pd.DataFrame(dict(actual = y_test))

lm = MultinomialNB().fit(X_train, y_train)

train_results_tfidf['predicted_nb'] = lm.predict(X_train)
validate_results_tfidf['predicted_nb'] = lm.predict(X_validate)
# test_results['predicted'] = lm.predict(X_test)

print('Accuracy: {:.2%}'.format(accuracy_score(train_results_tfidf.actual, train_results_tfidf.predicted_nb)))
print('---')
# print('Train Confusion Matrix')
# print(pd.crosstab(train_results_tfidf.predicted_nb, train_results_tfidf.actual))
print('---')
print(classification_report(train_results_tfidf.actual, train_results_tfidf.predicted_nb))


print('Accuracy: {:.2%}'.format(accuracy_score(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb)))
print('---')
# print('Validate Confusion Matrix')
# print(pd.crosstab(validate_results_tfidf.predicted_nb, validate_results_tfidf.actual))
print('---')
print(classification_report(validate_results_tfidf.actual, validate_results_tfidf.predicted_nb))










Accuracy: 100.00%
---
---
              precision    recall  f1-score   support

    business       1.00      1.00      1.00        14
      sports       1.00      1.00      1.00        14
       world       1.00      1.00      1.00        14

    accuracy                           1.00        42
   macro avg       1.00      1.00      1.00        42
weighted avg       1.00      1.00      1.00        42

Accuracy: 77.78%
---
---
              precision    recall  f1-score   support

    business       0.62      0.83      0.71         6
      sports       0.83      0.83      0.83         6
       world       1.00      0.67      0.80         6

    accuracy                           0.78        18
   macro avg       0.82      0.78      0.78        18
weighted avg       0.82      0.78      0.78        18



A lot better performance when classifying fewer categories.

### Put into a function which returns classification reports:

In [124]:
def model_words(vectorizer, class_model, ngrams_range_value, train, validate, target, print_results = True):
    """Performs classification modeling of lemmatized data. Outputs (and returns) classification reports for train and validate/test.
    
    vectorizer: the type of feature extraction method, such as Count Vectorizer or tf-idf
    class_model: the classification model to use
    ngrams_range_value: whether to use unigram, bigrams, etc. for the feature extraction
    train and test sets as well as the target variable"""
    
    feature_extraction_method = vectorizer(ngram_range=ngrams_range_value)

    X_train = feature_extraction_method.fit_transform(train.lemmatized)
    X_validate = feature_extraction_method.transform(validate.lemmatized)
    X_test = feature_extraction_method.transform(test.lemmatized)
    y_train = train[target]
    y_validate = validate[target]
    # y_test = test[target]

    train_results=pd.DataFrame(dict(actual = y_train))
    validate_results = pd.DataFrame(dict(actual = y_validate))
    # test_results = pd.DataFrame(dict(actual = y_test))

    model_to_use = class_model.fit(X_train, y_train)

    train_results['predicted'] = model_to_use.predict(X_train)
    validate_results['predicted'] = model_to_use.predict(X_validate)
    # test_results['predicted'] = model_to_use.predict(X_test)
    train_class_report = classification_report(train_results.actual, train_results.predicted, output_dict = True)
    validate_class_report = classification_report(validate_results.actual, validate_results.predicted,output_dict=True)
    if print_results:
        print('Accuracy: {:.2%}'.format(accuracy_score(train_results.actual, train_results.predicted)))
        print('---')
        # print('Train Confusion Matrix')
        # print(pd.crosstab(train_results_tfidf.predicted, train_results_tfidf.actual))
        print('---')
        print(pd.DataFrame(train_class_report))


        print('Accuracy: {:.2%}'.format(accuracy_score(validate_results.actual, validate_results.predicted)))
        print('---')
        # print('Validate Confusion Matrix')
        # print(pd.crosstab(validate_results_tfidf.predicted, validate_results_tfidf.actual))
        print('---')
        print(pd.DataFrame(validate_class_report))
    
    return train_class_report, validate_class_report

In [127]:
train_class_report, validate_class_report = model_words(CountVectorizer, RandomForestClassifier(n_estimators=75, random_state=123), (1,1), train, validate, target, True)

Accuracy: 100.00%
---
---
           business  sports  world  accuracy  macro avg  weighted avg
precision       1.0     1.0    1.0       1.0        1.0           1.0
recall          1.0     1.0    1.0       1.0        1.0           1.0
f1-score        1.0     1.0    1.0       1.0        1.0           1.0
support        14.0    14.0   14.0       1.0       42.0          42.0
Accuracy: 72.22%
---
---
           business  sports     world  accuracy  macro avg  weighted avg
precision  0.666667     1.0  0.555556  0.722222   0.740741      0.740741
recall     0.333333     1.0  0.833333  0.722222   0.722222      0.722222
f1-score   0.444444     1.0  0.666667  0.722222   0.703704      0.703704
support    6.000000     6.0  6.000000  0.722222  18.000000     18.000000
