# NLP Modeling

- [Begin Here](#Begin-Here)
- [Remember to do this](#TODO)

In [15]:
# import personal modules
import src.acquire as ac

# import modules from libraries
#from prepare import basic_clean, lemmatize
from pprint import pprint

#import datascience libraries
import pandas as pd
import numpy as np

# import vizualization libraries
import matplotlib.pyplot as plt

# Sklearn modules including classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier  # Gradient Boosting Classifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB  # Naive Bayes Classifier


# Sklearn testing, evaluating, and managing model
from sklearn.datasets import make_multilabel_classification
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, f_regression
from sklearn.multioutput import MultiOutputClassifier as MOC 
from sklearn.pipeline import Pipeline as pipeline 

# more classifiers
from xgboost import XGBClassifier  # XG Boost Classifier
from lightgbm import LGBMClassifier # Light Gradient Boost Classifier


import nltk #Natural Language Tool Kit
import re   #Regular Expressions

# NLP related modules
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from sklearn.xgboost import XGBClassifier

indian_news = ac.scrape_news()

indian_news.head()

In [None]:
words = indian_news['content'][0]
words = words.lower().replace(',', '').replace('.', '')
words = pd.Series(words.split())

In [None]:
words_df = (pd.DataFrame({'count': words.value_counts()})
             .assign(frequency=lambda df: words_df['count'] / words_df['count'].sum())
             .assign(augmented_frequency=lambda df: df['frequency'] / df['frequency'].max()))

In [None]:
words_df.head()
 

## Begin Here

In [None]:
def cleaner(text: str) -> list:
    'Another simple text cleaning function'
    
    

In [2]:
def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = set(nltk.corpus.stopwords.words('english'))
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

In [3]:
indian_news = ac.scrape_news()
indian_news.head()

Unnamed: 0,title,content,category
0,"Once I'm drunk, then don't blame me: Man to wo...",A woman claimed that a man kept forcing her to...,business
1,Man says SpiceJet locked flyers in boarding ga...,A vlogger named Soumil Agarwal took to Instagr...,business
2,CBI books Amrapali Group's MD Anil Sharma and ...,The CBI has booked Amrapali Group's MD Anil Sh...,business
3,Non-residents from 10 nations can soon use UPI...,National Payments Corporation of India (NPCI) ...,business
4,I want people to stay for 10 yrs: US firm's bo...,"Mark Neilson, a 35-year-old senior partner at ...",business


In [4]:
indian_news['clean_text'] = indian_news['content'].apply(clean).apply(' '.join)

In [5]:
indian_news.head()

Unnamed: 0,title,content,category,clean_text
0,"Once I'm drunk, then don't blame me: Man to wo...",A woman claimed that a man kept forcing her to...,business,woman claimed man kept forcing move aisle seat...
1,Man says SpiceJet locked flyers in boarding ga...,A vlogger named Soumil Agarwal took to Instagr...,business,vlogger named soumil agarwal took instagram cl...
2,CBI books Amrapali Group's MD Anil Sharma and ...,The CBI has booked Amrapali Group's MD Anil Sh...,business,cbi booked amrapali group md anil sharma six o...
3,Non-residents from 10 nations can soon use UPI...,National Payments Corporation of India (NPCI) ...,business,national payment corporation india npci permit...
4,I want people to stay for 10 yrs: US firm's bo...,"Mark Neilson, a 35-year-old senior partner at ...",business,mark neilson 35yearold senior partner life ins...


In [None]:
x = indian_news['clean_text']
y = indian_news['category']

x_text = dataset.text_stem.values
vectorizer = TfidfVectorizer()
vectorizer.fit(x_text)
x_vectorized = vectorizer.transform(x_text)

In [19]:
#########################################################################
           ############       Random Forest       ##############     
  ######  Creates N number of trees using random starting values  ######
########################################################################

def random_forest_model(x, y):
    
    rf_classifier = RandomForestClassifier(
        min_samples_leaf=10,
        n_estimators=200,
        max_depth=5, 
        bootstrap=True,
        oob_score=True,
        n_jobs=-1,
        max_features='auto'
    )

    rf_classifier.fit(x, y)

    y_preds = rf_classifier.predict(x)
    
    return y_preds


#############################################################################
    ############       Gradient Boosting Classifier       ##############     
######  Creates a random forest where each tree learns from the last  ######
############################################################################

def gradient_booster_model(x, y):
    
    gradient_booster = GradientBoostingClassifier(
                            learning_rate=0.1,
                            max_depth = 5,
                            n_estimators=200)

    gradient_booster.fit(x, y)
    
    y_preds = gradient_booster.predict(x)
    
    return y_preds


#################################################################
############         XG Boosting Classifier       ##############     
    #######       Uses XG Boosting Algorthm       #######
#################################################################

def xgboost_model(x, y):
    
    xgboost = MOC(
                XGBClassifier(
                        base_score=None,
                        booster=None,
                        n_estimators=200,
                        learning_rate=0.1,
                        max_depth = 5
                        ))

    xgboost.fit(x, y)
    
    y_preds = xgboost.predict(x)
    
    return y_preds


#################################################################
#########         LightGMB Boosting Classifier       ###########     
#######       Uses Light Gradient Boosting Algorthm       #######
#################################################################

def lgmboost_model(x, y):
    
    lgmboost = MOC(
                LGBMClassifier(
                learning_rate=0.1,
                max_depth = 5,
                n_estimators=200))

    lgmboost.fit(x, y)
    
    y_preds = lgmboost.predict(x)
    
    return y_preds


####################################################################
#########         Multinomial Naive Bayes Classifier     ###########     
#######     Uses Naive Bayes as Classification Algorithm     #######
####################################################################

def nb_model(x, y):
    
    naive_bayes = MultinomialNB()
    
    naive_bayes.fit(x, y)
    
    y_preds = naive_bayes.predict(x)
    
    return y_preds

In [None]:
#############################################################################
    ############       Model Evaluation       ##############     
   ######  Easily evaluate models for accuracy or any other metric  ######
############################################################################

def evaluate_classification_model(model, y_train, y_preds, df=False, full= False):
    TN, FP, FN, TP = confusion_matrix(y_train, y_preds).ravel()
    ALL = TP + TN + FP + FN

    accuracy = (TP + TN)/ALL
    true_positive_rate = TP/(TP+FN)
    false_positive_rate = FP/(FP+TN)
    true_negative_rate = TN/(TN+FP)
    false_negative_rate = FN/(FN+TP)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    
    if df == False:
        return accuracy

    f1_score = 2*(precision*recall)/(precision+recall)
    
    if full == True:
        performance_df = pd.DataFrame(
                                         {'model' : [model],
                                          'accuracy' : [accuracy],
                                          'f1_score' : [f1_score],
                                          'precision' : [precision], 
                                          'recall' : [recall],
                                          'true_positive_rate' : [true_positive_rate],
                                          'false_positive_rate': [false_positive_rate], 
                                          'true_negative_rate' : [true_negative_rate], 
                                          'false_negative_rate': [false_negative_rate]
                                          })
        return performance_df

    
    
    if full == False:
        performance_df = pd.DataFrame(
                                         {'model' : [model],
                                          'accuracy' : [accuracy],
                                          'f1_score' : [f1_score],
                                          'precision' : [precision], 
                                          'recall' : [recall]
                                         })

    if df == True:
        return performance_df


def get_models(x_train, y_train): #, x_validate, y_validate):


    rf_y_preds_train = random_forest_model(x_train, y_train)
    #rf_y_preds_val = random_forest_model(x_validate, y_validate)

    gb_y_preds_train = gradient_booster_model(x_train, y_train)
    #gb_y_preds_val = gradient_booster_model(x_validate, y_validate)

    xg_y_preds_train = xgboost_model(x_train, y_train)
    #xg_y_preds_val = xgboost_booster_model(x_validate, y_validate)
    
    #lgm_y_preds_train = lgmboost_model(x_train, y_train)
    #lgm_y_preds_val = lgmboost_booster_model(x_validate, y_validate)
    
    NB_y_preds_train = nb_model(x_train, y_train)
    #nb_y_preds_val = nb_model(x_validate, y_validate)

    performance_df = evaluate_classification_model('random_forest', y_train, rf_y_preds_train, df=True)
    #performance_df = performance_df.append(evaluate_classification_model('rf_validate', y_validate, rf_y_preds_val, df=True))
    performance_df = performance_df.append(evaluate_classification_model('gradient_booster', y_train, gb_y_preds_train, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('gb_validate', y_validate, gb_y_preds_val, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('xg_boost', y_train, xg_y_preds_train, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('xg_validate', y_validate, xg_y_preds_val, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('lgm_boost', y_train, lgm_y_preds_train, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('lgm_validate', y_validate, lgm_y_preds_val, df=True))
    performance_df = performance_df.append(evaluate_classification_model('naive_bayes', y_train, NB_y_preds_train, df=True))
    #performance_df = performance_df.append(evaluate_classification_model('NA_validate', y_validate, NB_y_preds_val, df=True))


    
    return performance_df

### TODO 
- set up LGBMClassifier and XGBClassifier model functions

In [9]:
x = indian_news['clean_text']
y = indian_news['category']

cv = CountVectorizer()
x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size = 0.3, random_state = 7)

In [12]:
y_train

65       world
54       world
27      sports
58       world
33      sports
        ...   
83    politics
67       world
25      sports
68       world
47      sports
Name: category, Length: 70, dtype: object

In [20]:
gb_y_preds_train = gradient_booster_model(x_train, y_train)
report = classification_report(y_train, gb_y_preds_train)

In [23]:
type(report)

str

In [22]:
print(report)

              precision    recall  f1-score   support

    business       1.00      0.93      0.97        15
    politics       1.00      1.00      1.00        17
      sports       1.00      1.00      1.00        19
       world       0.95      1.00      0.97        19

    accuracy                           0.99        70
   macro avg       0.99      0.98      0.98        70
weighted avg       0.99      0.99      0.99        70



In [11]:
#rf_y_preds_train = random_forest_model(x_train, y_train)
gb_y_preds_train = gradient_booster_model(x_train, y_train)
TN, FP, FN, TP = confusion_matrix(y_train, gb_y_preds_train).ravel()
ALL = TP + TN + FP + FN

accuracy = (TP + TN)/ALL
true_positive_rate = TP/(TP+FN)
false_positive_rate = FP/(FP+TN)
true_negative_rate = TN/(TN+FP)
false_negative_rate = FN/(FN+TP)
precision = TP/(TP+FP)
recall = TP/(TP+FN)

ValueError: too many values to unpack (expected 4)

In [None]:
gb_y_preds_train = gradient_booster_model(x_train, y_train)
gradient

In [None]:
cv = CountVectorizer()
x_bow = cv.fit_transform(x_train)


In [None]:
x = indian_news['clean_text']
y = indian_news['category']

cv = CountVectorizer()
x_vectorized = cv.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_vectorized, y, test_size = 0.3, random_state = 7)

In [None]:
%%time 
performance_df = get_models(x_train, y_train)