# Modeling Exercises for NLP

In [39]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

import re
import unicodedata
import nltk

import acquire
import prepare

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

## Using the inshorts article data, practice using the modeling tools for NLP

**Acquire**

In [2]:
# Pull the inshorts articles for every news category through the project's acquire module
categories = [
    "business",
    "sports",
    "technology",
    "entertainment",
    "science",
    "world",
]
news_df = acquire.get_all_news_articles(categories)
news_df.head()

Unnamed: 0,title,content,category
0,"Godrej, PwC, Deloitte India give extra offs to...",Several companies in India have been offering ...,business
1,"Bill Gates' company Cascade transfers ₹13,300 ...","Bill Gates' Cascade Investment, a holding comp...",business
2,Infosys commits additional ₹100 crore for COVI...,Infosys has committed additional ₹100 crore fo...,business
3,China flight halt in India may hurt pharma sup...,The Indian Drug Manufacturers' Association (ID...,business
4,RIL may soon fly in Israeli experts to install...,Reliance Industries has sought permission to f...,business


**Prepare**

In [3]:
# Add a 'clean' column: each article's content run through basic_clean, then
# tokenize, then remove_stopwords (applied innermost call first)
news_df['clean'] = news_df['content'].apply(
    lambda text: prepare.remove_stopwords(
        prepare.tokenize(
            prepare.basic_clean(text)
        )
    )
)

In [4]:
# Subset the data into a df with just the category and clean columns:
# category becomes the label to predict and clean is the text fed to the models
df = news_df[['category', 'clean']]
df.head()

Unnamed: 0,category,clean
0,business,several companies india offering extra holiday...
1,business,bill gates ' cascade investment holding compan...
2,business,infosys committed additional 100 crore covid19...
3,business,indian drug manufacturers ' association idma w...
4,business,reliance industries sought permission fly isra...


In [5]:
#Create a function to further clean up
#def clean(text):
#    'A simple function to cleanup text data'
#    wnl = nltk.stem.WordNetLemmatizer()
#    words = re.sub(r'[^\w\s]', '', text).split()
#    return [wnl.lemmatize(word) for word in words]

In [6]:
#Breakdown words by the article category
#biz_words = clean(' '.join(df.clean[df.category == 'business']))
#sports_words = clean(' '.join(df.clean[df.category == 'sports']))
#tech_words = clean(' '.join(df.clean[df.category == 'technology']))
#ent_words = clean(' '.join(df.clean[df.category == 'entertainment']))
#science_words = clean(' '.join(df.clean[df.category == 'science']))
#world_words = clean(' '.join(df.clean[df.category == 'world']))

In [7]:
# We'll use this split function later to create in-sample and out-of-sample datasets for modeling
def split(df, stratify_by=None):
    """
    Split a dataframe three ways into train, validate, and test datasets.

    20% of the rows are held out as test; the remaining rows are then split
    70/30 into train and validate. Both splits use random_state=123 so the
    result is reproducible.

    Parameters
    ----------
    df : DataFrame
        The data to split.
    stratify_by : str, optional
        Name of the column to stratify on. When None (the default) the
        splits are not stratified.

    Returns
    -------
    tuple
        (train, validate, test) dataframes.
    """
    # Only index the dataframe when a column name was actually supplied;
    # df[None] would raise a KeyError and make the default unusable.
    test_stratify = None if stratify_by is None else df[stratify_by]
    train, test = train_test_split(
        df, test_size=.2, random_state=123, stratify=test_stratify
    )

    # Stratify the second split on the reduced train frame, not the full df,
    # so the label array lines up with the rows being split.
    validate_stratify = None if stratify_by is None else train[stratify_by]
    train, validate = train_test_split(
        train, test_size=.3, random_state=123, stratify=validate_stratify
    )

    return train, validate, test

In [9]:
# Three-way split, stratified on the article category
train, validate, test = split(df, stratify_by='category')
train.shape

(83, 2)

In [10]:
# Features: the cleaned article text of each split
X_train, X_validate, X_test = (
    subset['clean'] for subset in (train, validate, test)
)

# Targets: the article category of each split
y_train, y_validate, y_test = (
    subset['category'] for subset in (train, validate, test)
)

## Modeling

**Logistic Regression**

In [21]:
# Establish the baseline from the class counts: always guessing the largest class
# is the score to beat (in the saved output that is 25 of 149 articles, about 17%)
df['category'].value_counts()

technology       25
entertainment    25
world            25
business         25
sports           25
science          24
Name: category, dtype: int64

In [13]:
# Create the tfidf vectorizer and fit it on the training text only,
# so validate and test are transformed with what was learned from train
tfidf = TfidfVectorizer().fit(X_train)

# Transform every split with the train-fitted vectorizer
X_train_vectorized, X_validate_vectorized, X_test_vectorized = (
    tfidf.transform(documents) for documents in (X_train, X_validate, X_test)
)

In [15]:
#Using the vectorized data, make a logistic regression model (no hyperparameters overridden)
lm = LogisticRegression()

#Fit the lm object to the vectorized training data only; validate and test are not used here
lm.fit(X_train_vectorized, y_train)

LogisticRegression()

In [16]:
# Build one results dataframe per split, holding the true category of each article.
# NOTE: this rebinds train/validate/test, which until now held the split article
# dataframes returned by split(); afterwards they only carry the label columns.
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

In [17]:
#Form predictions using the lm model and store them next to the actual labels of each split
train['predicted'] = lm.predict(X_train_vectorized)
validate["predicted"] = lm.predict(X_validate_vectorized)
test['predicted'] = lm.predict(X_test_vectorized)

In [19]:
# Per-category precision/recall/f1 of the lm model on the in-sample (train) data
print(classification_report(train['actual'], train['predicted']))

               precision    recall  f1-score   support

     business       1.00      1.00      1.00        14
entertainment       1.00      1.00      1.00        14
      science       0.86      0.92      0.89        13
       sports       1.00      1.00      1.00        14
   technology       1.00      0.93      0.96        14
        world       0.93      0.93      0.93        14

     accuracy                           0.96        83
    macro avg       0.96      0.96      0.96        83
 weighted avg       0.97      0.96      0.96        83



In [20]:
#Review how the lm model performed on the out-of-sample (validate) data
print(classification_report(validate.actual, validate.predicted))

               precision    recall  f1-score   support

     business       0.25      0.33      0.29         6
entertainment       0.62      0.83      0.71         6
      science       1.00      0.50      0.67         6
       sports       0.50      0.50      0.50         6
   technology       0.67      0.67      0.67         6
        world       0.60      0.50      0.55         6

     accuracy                           0.56        36
    macro avg       0.61      0.56      0.56        36
 weighted avg       0.61      0.56      0.56        36



**Takeaways:**
- For science articles, the lm model had higher precision on the validate data (1.00) than on the train data (0.86), although its recall dropped from 0.92 to 0.50
- Overall, the lm model was best able to predict technology articles, with precision, recall, and f1-score all at 0.67 on the validate data
- The lm model was not effective at predicting business articles

---

**KNN Model**

In [24]:
# KNN classifier with k = 5 neighbors, every neighbor weighted equally
knn = KNeighborsClassifier(weights='uniform', n_neighbors=5)

# Train it on the vectorized training articles
knn.fit(X_train_vectorized, y_train)

# Rebuild the per-split results dataframes with the true categories
# (this replaces the frames that held the previous model's predictions)
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

# Store the knn predictions next to the actual labels of each split
train['predicted'] = knn.predict(X_train_vectorized)
validate['predicted'] = knn.predict(X_validate_vectorized)
test['predicted'] = knn.predict(X_test_vectorized)

In [25]:
# Per-category precision/recall/f1 of the knn model on the in-sample (train) data
print(classification_report(train['actual'], train['predicted']))

               precision    recall  f1-score   support

     business       0.63      0.86      0.73        14
entertainment       0.71      0.86      0.77        14
      science       0.73      0.62      0.67        13
       sports       0.67      0.71      0.69        14
   technology       0.88      0.50      0.64        14
        world       0.69      0.64      0.67        14

     accuracy                           0.70        83
    macro avg       0.72      0.70      0.69        83
 weighted avg       0.72      0.70      0.69        83



In [26]:
# Per-category precision/recall/f1 of the knn model on the out-of-sample (validate) data
print(classification_report(validate['actual'], validate['predicted']))

               precision    recall  f1-score   support

     business       0.22      0.33      0.27         6
entertainment       0.43      0.50      0.46         6
      science       0.60      0.50      0.55         6
       sports       0.50      0.33      0.40         6
   technology       0.60      0.50      0.55         6
        world       0.33      0.33      0.33         6

     accuracy                           0.42        36
    macro avg       0.45      0.42      0.43        36
 weighted avg       0.45      0.42      0.43        36



**Takeaways:**
- The KNN model was best able to predict the category for science and tech articles
- Business predictions were still the worst
- Overall, the KNN model performed worse than the logistic regression model

### How does the KNN model do with a higher k? k = 10?

In [36]:
# KNN classifier with k = 10 neighbors, every neighbor weighted equally
knn = KNeighborsClassifier(weights='uniform', n_neighbors=10)

# Train it on the vectorized training articles
knn.fit(X_train_vectorized, y_train)

# Rebuild the per-split results dataframes with the true categories
# (this replaces the frames that held the previous model's predictions)
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

# Store the knn predictions next to the actual labels of each split
train['predicted'] = knn.predict(X_train_vectorized)
validate['predicted'] = knn.predict(X_validate_vectorized)
test['predicted'] = knn.predict(X_test_vectorized)

In [37]:
# Per-category precision/recall/f1 of the k = 10 knn model on the in-sample (train) data
print(classification_report(train['actual'], train['predicted']))

               precision    recall  f1-score   support

     business       0.45      0.64      0.53        14
entertainment       0.67      0.71      0.69        14
      science       0.82      0.69      0.75        13
       sports       0.58      0.79      0.67        14
   technology       0.71      0.36      0.48        14
        world       0.73      0.57      0.64        14

     accuracy                           0.63        83
    macro avg       0.66      0.63      0.63        83
 weighted avg       0.66      0.63      0.62        83



In [38]:
# Per-category precision/recall/f1 of the k = 10 knn model on the out-of-sample (validate) data
print(classification_report(validate['actual'], validate['predicted']))

               precision    recall  f1-score   support

     business       0.45      0.83      0.59         6
entertainment       0.44      0.67      0.53         6
      science       1.00      0.67      0.80         6
       sports       1.00      0.50      0.67         6
   technology       0.75      0.50      0.60         6
        world       0.60      0.50      0.55         6

     accuracy                           0.61        36
    macro avg       0.71      0.61      0.62        36
 weighted avg       0.71      0.61      0.62        36



**Takeaways:**
- Overall, the KNN model with k = 10 performed better than the logistic model
    - It also performed better at predicting business articles than any of the previous models on out-of-sample data
- KNN with k = 10 did not perform better than logistic model for predicting ent articles

---

**Random Forest Model**

In [76]:
# Random forest classifier: 100 gini trees, each limited to depth 15 with at
# least 3 samples per leaf; seeded so repeated runs build the same forest
rf = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=15,
    min_samples_leaf=3,
    bootstrap=True,
    class_weight=None,
    random_state=123,
)

In [77]:
# Train the forest on the vectorized training articles
rf.fit(X_train_vectorized, y_train)

# In-sample predictions: categories predicted for the same training articles
y_pred = rf.predict(X_train_vectorized)

# Report the forest's accuracy on the training set
print(
    'Accuracy of random forest classifier on training set: {:.2f}'.format(
        rf.score(X_train_vectorized, y_train)
    )
)

Accuracy of random forest classifier on training set: 0.93


In [78]:
# Per-category precision/recall/f1 of the random forest on the in-sample (train) data
print(classification_report(y_true=y_train, y_pred=y_pred))

               precision    recall  f1-score   support

     business       0.93      1.00      0.97        14
entertainment       1.00      0.93      0.96        14
      science       0.90      0.69      0.78        13
       sports       0.93      1.00      0.97        14
   technology       0.88      1.00      0.93        14
        world       0.93      0.93      0.93        14

     accuracy                           0.93        83
    macro avg       0.93      0.92      0.92        83
 weighted avg       0.93      0.93      0.92        83



In [79]:
# Out-of-sample predictions: y_pred now holds the forest's guesses for validate
y_pred = rf.predict(X_validate_vectorized)

# Per-category precision/recall/f1 of the random forest on the validate data
print(classification_report(y_true=y_validate, y_pred=y_pred))

               precision    recall  f1-score   support

     business       0.28      0.83      0.42         6
entertainment       0.50      0.17      0.25         6
      science       1.00      0.33      0.50         6
       sports       1.00      0.17      0.29         6
   technology       0.30      0.50      0.37         6
        world       0.33      0.17      0.22         6

     accuracy                           0.36        36
    macro avg       0.57      0.36      0.34        36
 weighted avg       0.57      0.36      0.34        36



**Takeaways:**
- RF does not do well on out-of-sample data, even when changing max_depth and min_samples_leaf

---

**Evaluate the Model That Performed Best on the Validate DF Using the Test DF**

In [80]:
# Recreate the model that did best on validate: KNN with k = 10, uniform weights
knn = KNeighborsClassifier(weights='uniform', n_neighbors=10)

# Train it on the vectorized training articles
knn.fit(X_train_vectorized, y_train)

# Rebuild the per-split results dataframes with the true categories
train = pd.DataFrame({'actual': y_train})
validate = pd.DataFrame({'actual': y_validate})
test = pd.DataFrame({'actual': y_test})

# Store the knn predictions next to the actual labels of each split
train['predicted'] = knn.predict(X_train_vectorized)
validate['predicted'] = knn.predict(X_validate_vectorized)
test['predicted'] = knn.predict(X_test_vectorized)

# Per-category precision/recall/f1 of the knn model on the held-out test data
print(classification_report(test['actual'], test['predicted']))

               precision    recall  f1-score   support

     business       0.40      0.80      0.53         5
entertainment       1.00      0.40      0.57         5
      science       1.00      0.80      0.89         5
       sports       0.44      0.80      0.57         5
   technology       0.50      0.40      0.44         5
        world       0.00      0.00      0.00         5

     accuracy                           0.53        30
    macro avg       0.56      0.53      0.50        30
 weighted avg       0.56      0.53      0.50        30



**Takeaways:**
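- The KNN model with k = 10 has 100% precision for ent and science articles on the test data (every article it assigned to those categories was correct), but it only recalled 40% of the ent articles and 80% of the science articles.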
- It does not perform as well as it did on the validate data for the other categories, and it correctly classifies 0% of the world articles (precision and recall are both 0.00).
- All in all, it does do better than just a shot in the dark, which is the equivalent of the baseline model performance.