### Building a Text Classification Model

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn
# Render estimators and pipelines as HTML diagrams in notebook output.
# `display` has to be passed by keyword: the first positional parameter of
# set_config is `assume_finite`, so a bare positional string would switch off
# input finiteness validation instead of configuring the display.
sklearn.set_config(display="diagram")

### Loading the Data

In [3]:
# The file is tab-separated and has no header row, so the columns are named in a later cell.
df = pd.read_csv('data/SMSSpamCollection.txt', sep='\t', header=None)

In [None]:
df.head()

In [4]:
# Name the two columns: the ham/spam label and the raw message text.
df.columns = ['spam', 'text']

In [5]:
df.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


- in this case, we only have text to predict whether a message is spam
- the binary bag of words takes all of these words and splits them out into a group
- we're basically destroying the order of the text, just looking at the words as independent entities
- then we create a matrix of 0/1 values to denote whether a specific word is in a phrase
- so we take the column of text and transform it into a 0/1 matrix that becomes our X, which we can then use to fit a model with X and y.


### Binary Bag of Words


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Vectorizer with default settings.
# NOTE(review): by default CountVectorizer stores word counts, not 0/1 flags;
# CountVectorizer(binary=True) would give a strictly binary bag of words.
cvect= CountVectorizer()

In [9]:
## Features are usually selected with double brackets (giving a 2D DataFrame), but that
## is not needed here because the count vectorizer expects a single column of text.
X = df['text']
y = df['spam']

In [11]:
# Hold out a test set; stratifying on y keeps the ham/spam ratio the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [12]:
X_train[:3]

4747           Orh i tot u say she now still dun believe.
5295    Alex says he's not ok with you not being ok wi...
5568                 Will ü b going to esplanade fr home?
Name: text, dtype: object

In [14]:
cvect.fit_transform(X_train)
## The message below is a condensed summary: the matrix is really wide and contains
## a lot of zeros, so it is stored in a sparse format.

<4179x7387 sparse matrix of type '<class 'numpy.int64'>'
	with 55514 stored elements in Compressed Sparse Row format>

In [15]:
X_train_vectorized = cvect.fit_transform(X_train)

In [17]:
# to convert it back to a dense matrix/array, do this:
# only using train data bc at this point we wouldn't know the test data
X_train_vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
# Vocabulary learned from the training messages (one entry per matrix column).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns a NumPy array of strings.
cvect.get_feature_names_out()

['00',
 '000',
 '000pes',
 '008704050406',
 '0089',
 '0121',
 '01223585236',
 '01223585334',
 '02',
 '0207',
 '02072069400',
 '02073162414',
 '02085076972',
 '021',
 '03',
 '04',
 '0430',
 '05',
 '050703',
 '0578',
 '06',
 '07',
 '07008009200',
 '07046744435',
 '07099833605',
 '07123456789',
 '0721072',
 '07732584351',
 '07734396839',
 '07742676969',
 '07753741225',
 '07781482378',
 '07786200117',
 '077xxx',
 '078',
 '07801543489',
 '07808',
 '07808247860',
 '07808726822',
 '07815296484',
 '07821230901',
 '078498',
 '0789xxxxxxx',
 '0796xxxxxx',
 '07xxxxxxxxx',
 '0800',
 '08000407165',
 '08000839402',
 '08000930705',
 '08000938767',
 '08001950382',
 '08002888812',
 '08002986030',
 '08002986906',
 '08002988890',
 '08006344447',
 '0808',
 '08081263000',
 '08081560665',
 '0825',
 '083',
 '0844',
 '08448350055',
 '08448714184',
 '0845',
 '08450542832',
 '08452810073',
 '08452810075over18',
 '0870',
 '08700469649',
 '08700621170150p',
 '08701213186',
 '08701417012',
 '08701417012150p',
 '08

In [21]:
# Make this into a DataFrame - but it's really large (one column per vocabulary word).
# It gets its own name so the raw `df` is not overwritten: rebinding `df` here would
# break re-running the cells that read df['text'] and df['spam'].
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
bow_df = pd.DataFrame(X_train_vectorized.toarray(), columns=cvect.get_feature_names_out())

In [29]:
# To manage the size, pass additional parameters to the CountVectorizer:

cvect2 = CountVectorizer(stop_words='english', max_features=500)
# stop_words='english' excludes common words that carry little meaning - the, it, at, etc.
# max_features=500 limits the vocabulary to the 500 most common words

In [30]:
X_train_vectorized2 = cvect2.fit_transform(X_train)


In [31]:
# Dense view of the reduced matrix with the vocabulary words as column names.
# get_feature_names_out() replaces get_feature_names(), which was removed in scikit-learn 1.2.
df2 = pd.DataFrame(X_train_vectorized2.toarray(), columns=cvect2.get_feature_names_out())

In [33]:
# We only transform the test set - we do not fit on it - so it is encoded with the
# vectorizer that was fit on the training data.
X_test_vectorized = cvect2.transform(X_test)

### Model with Results

In [35]:
# Instantiate a logistic regression estimator. This is a classification problem, so we
# could have used KNN, logistic regression, or decision trees.
lgr = LogisticRegression()

In [37]:
# fit on the vectorized training data
lgr.fit(X_train_vectorized2, y_train)

LogisticRegression()

In [39]:
lgr.score(X_train_vectorized2, y_train)

0.9873175400813592

In [40]:
# Accuracy on the held-out test set, using the matrix produced by cvect2.transform(X_test).
lgr.score(X_test_vectorized, y_test)

0.9770279971284996

In [41]:
# This is the baseline: the share of each class in the labels. Always predicting the
# majority class would match that share, so we're at least doing better than that!
y.value_counts(normalize=True)

ham     0.865937
spam    0.134063
Name: spam, dtype: float64

In [42]:
# What mattered? Which words drove the classifications? We can look at the coefficients
# to see which words were the most important.
lgr.coef_
# This is a 2D array: a single row holding one coefficient per vocabulary word
# (the vectorizer was limited to max_features=500).

array([[ 0.5182677 ,  0.07780361,  0.88587794,  1.02862686,  0.88687662,
         1.08305093,  0.3612345 ,  0.17246992,  2.25878932,  0.56863936,
         0.7378356 ,  1.18503667,  0.42542975,  0.24929651,  0.34640318,
         0.11869043,  1.94408496,  0.90391011,  0.59359128,  0.89792513,
         0.76632536, -0.15876073, -0.34741149,  1.0282181 , -0.53548162,
        -0.18513611, -0.34507671, -0.27338362, -0.23351989, -0.71880748,
        -0.16870081,  0.28971677,  0.62042597, -0.29843707, -0.75473379,
         0.08037753,  0.58371316,  1.15489101,  0.36591538,  1.00655455,
         1.26995376, -0.15508011, -0.22456728,  0.30976008,  0.06437246,
         0.29665407, -0.3045126 , -0.38323333, -0.03716611, -0.38664391,
         0.20330831,  0.14024517, -0.06436504, -0.60978921,  0.40312931,
         0.36479306, -0.1411452 ,  0.78890597, -0.17570199,  0.07962368,
        -0.32358893, -0.15617987, -0.06617278, -0.46080455, -0.24452967,
         0.91949319,  0.49132103,  0.36433934,  0.5

In [47]:
# Pair every coefficient with the word it belongs to.
# coef_[0] is the single row of coefficients; get_feature_names_out() replaces
# get_feature_names(), which was removed in scikit-learn 1.2.
feature_df = pd.DataFrame({
    'coef': lgr.coef_[0],
    'word': cvect2.get_feature_names_out(),
})

In [50]:
# get the 10 largest coefficients
feature_df.nlargest(10, 'coef')

Unnamed: 0,coef,word
442,2.935393,uk
8,2.258789,150p
441,2.200919,txt
369,2.200092,service
267,2.092713,min
16,1.944085,50
308,1.923611,order
82,1.920826,claim
79,1.888759,chat
102,1.813572,customer


### Pipeline and Grid Search

In [None]:
# we use a pipeline to simplify the preprocessing of the data
# we use a gridsearch to find the best combo

In [51]:
# Build a pipeline that count-vectorizes the text and then fits a logistic regression.
# Step order matters: the pipeline starts with transformers and ends with an estimator.
# Any number of transformers is allowed, but the last step has to be an estimator.
# The pipeline can simply be fit on its own, with no cross-validation.
# Because it transforms the data and builds the model in one step, each parameter
# combination listed in the parameters section can be cross-validated as a whole,
# and the best model can then be chosen based on the mean score.
pipe = Pipeline(
    steps=[
        ('count_vectorizer', CountVectorizer()),
        ('lgr', LogisticRegression()),
    ]
)

In [52]:
# Parameter values to try out in the grid search.
# Each key is "<step name>__<parameter name>": the double underscore separates the
# pipeline step from the parameter being set on it.
params = dict(
    count_vectorizer__max_features=[100, 500, 1000, 2000],
    count_vectorizer__stop_words=['english', None],
)

In [53]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('count_vectorizer', CountVectorizer()),
                ('lgr', LogisticRegression())])

In [54]:
pipe.score(X_train, y_train)

0.9976070830342187

In [55]:
pipe.score(X_test, y_test)

0.9784637473079684

In [None]:
# the score on the train set is higher than on the test set, which suggests the model is somewhat overfit.

### Evaluate

In [56]:
# This takes the pipeline, tries every combination in params (4 max_features values
# x 2 stop_words settings = 8 models), and cross-validates each on 5 slices of data.
grid = GridSearchCV(pipe, param_grid=params, cv=5)

In [58]:
# this does all of the transformation and the cross validation in one step!!
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('count_vectorizer', CountVectorizer()),
                                       ('lgr', LogisticRegression())]),
             param_grid={'count_vectorizer__max_features': [100, 500, 1000,
                                                            2000],
                         'count_vectorizer__stop_words': ['english', None]})

In [59]:
grid.best_params_

{'count_vectorizer__max_features': 2000, 'count_vectorizer__stop_words': None}

In [61]:
grid.best_score_

0.9813359883104604

In [62]:
grid.score(X_test, y_test)

0.9798994974874372

In [65]:
# This is scikit-learn's own built-in English stop-word list (returned as a frozenset),
# used because cvect2 was created with stop_words='english'. It does not come from nltk.
cvect2.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

In [None]:
# the score here is pretty good - but we could also try KNN or Random Forest
# also - which metric do we care about? the scores above are accuracy; would precision or recall be the right one?
# we'd likely want to avoid missing spam rather than over-categorizing it.

### Exercise

Build a model to classify a person's status in WhatsApp.  The data is in the files `Emotion(happy).csv` and `Emotion(angry).csv`.  Join the datasets and use the count vectorizer to transform the data, then compare the performance of different classifiers on the test data.  Finally, perform a grid search over parameters of the `CountVectorizer` to see if adjusting how the text is presented improves the scoring.