### Building a Text Classification Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import sklearn
# Render estimators as HTML diagrams in notebook output.
# `display` has to be passed by keyword: the first positional parameter of
# set_config is `assume_finite`, so set_config("display") would bind the string
# there (a truthy value that disables finite-value input validation) and leave
# the display mode unchanged.
sklearn.set_config(display="diagram")

### Loading the Data

In [2]:
# Read the tab-separated SMS file. It has no header row, so pandas labels the
# two columns 0 and 1 (renamed in a later cell).
# NOTE(review): with the default quoting, a '"' inside a message is treated as
# a quote character and can merge adjacent lines into one row -- confirm the
# row count against the raw file and consider quoting=csv.QUOTE_NONE.
df = pd.read_csv(
    'data/SMSSpamCollection.txt',
    sep='\t',
    header=None,
)

In [3]:
df.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# The file was read without a header row, so give the two columns meaningful
# names: the label ('ham' / 'spam') and the message body.
df.columns = ['spam', 'text']

In [5]:
df.head()

Unnamed: 0,spam,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


### Binary Bag of Words


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words vectorizer:
#   stop_words='english' -> drop common low-information words (it, of, the, at, in, ...)
#   max_features=500     -> keep only the 500 most frequent terms in the corpus
# `binary` is left at its default, so cells hold term counts rather than 0/1 flags.
cvect = CountVectorizer(max_features=500, stop_words='english')

In [20]:
# Features are the raw message strings; the target is the ham/spam label.
X, y = df['text'], df['spam']

In [21]:
# Hold out a test set (default split size). Stratifying on y keeps the
# ham/spam proportions the same in both parts; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42
)

In [22]:
X_train[:3]

4747           Orh i tot u say she now still dun believe.
5295    Alex says he's not ok with you not being ok wi...
5568                 Will ü b going to esplanade fr home?
Name: text, dtype: object

In [23]:
X_train_vectorized = cvect.fit_transform(X_train)

In [24]:
X_train_vectorized.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [26]:
cvect.get_feature_names_out()[:5]

array(['000', '08000930705', '10', '100', '1000'], dtype=object)

In [28]:
pd.DataFrame(X_train_vectorized.toarray(), 
             columns = cvect.get_feature_names_out()).head(2)

Unnamed: 0,000,08000930705,10,100,1000,10p,11,150,150p,150ppm,...,xmas,xxx,ya,yeah,year,years,yes,yesterday,yo,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Reuse the vocabulary already learned from the training text: transform only,
# never refit on test data.
# Note: this name has no trailing "d" (unlike X_train_vectorized); the test
# scoring cell further down refers to it by this exact spelling.
X_test_vectorize = cvect.transform(X_test)

### Model with Results

In [30]:
#instantiate a logistic regression estimator -- lgr
lgr = LogisticRegression()

In [31]:
#fit on the vectorized training data
lgr.fit(X_train_vectorized, y_train)

LogisticRegression()

In [32]:
lgr.score(X_train_vectorized, y_train)

0.9873175400813592

In [33]:
lgr.score(X_test_vectorize, y_test)

0.9770279971284996

In [34]:
y.value_counts(normalize = True)

ham     0.865937
spam    0.134063
Name: spam, dtype: float64

In [40]:
# Which words mattered? Pair every vocabulary term with its fitted coefficient.
# coef_[0] is the single row of weights for this two-class (ham/spam) problem.
feature_df = pd.DataFrame({
    'coef': lgr.coef_[0],
    'word': cvect.get_feature_names_out(),
})

In [42]:
feature_df.nlargest(10, 'coef') #10 "most important" features

Unnamed: 0,coef,word
442,2.935393,uk
8,2.258789,150p
441,2.200919,txt
369,2.200092,service
267,2.092713,min
16,1.944085,50
308,1.923611,order
82,1.920826,claim
79,1.888759,chat
102,1.813572,customer


### Pipeline and Grid Search

In [55]:
# Chain text vectorization and classification into a single estimator.
# A pipeline may hold any number of transformer steps; the final step is the
# estimator that actually gets fit and used for predictions.
pipe = Pipeline(steps=[
    ('word_data_maker', CountVectorizer()),  # transformer: raw text -> token counts
    ('lgr', LogisticRegression()),           # final estimator
])

In [56]:
# Search space for the grid search. Keys follow the `<step name>__<parameter>`
# convention so each list of values is routed to the matching pipeline step.
params = {
    # vocabulary size kept by the CountVectorizer step
    'word_data_maker__max_features': [100, 500, 1000, 2000],
    # drop the built-in English stop words, or keep every token
    'word_data_maker__stop_words': ['english', None],
    # inverse regularization strength of the LogisticRegression step
    'lgr__C': [0.001, 1.0],
}

In [57]:
pipe.fit(X_train, y_train) #fitting the pipeline with no cross validation

Pipeline(steps=[('word_data_maker', CountVectorizer()),
                ('lgr', LogisticRegression())])

In [58]:
pipe.score(X_train, y_train)

0.9976070830342187

In [59]:
pipe.score(X_test, y_test)

0.9784637473079684

In [60]:
# 5-fold cross-validated search over every parameter combination in `params`.
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [61]:
grid.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('word_data_maker', CountVectorizer()),
                                       ('lgr', LogisticRegression())]),
             param_grid={'lgr__C': [0.001, 1.0],
                         'word_data_maker__max_features': [100, 500, 1000,
                                                           2000],
                         'word_data_maker__stop_words': ['english', None]})

In [70]:
# Size of the stop-word list used by the standalone `cvect` created earlier with
# stop_words='english'; this is not the vectorizer inside the pipeline / grid.
len(cvect.get_stop_words())

318

In [62]:
grid.best_params_

{'lgr__C': 1.0,
 'word_data_maker__max_features': 2000,
 'word_data_maker__stop_words': None}

In [63]:
grid.best_score_

0.9813359883104604

In [65]:
grid.best_estimator_

Pipeline(steps=[('word_data_maker', CountVectorizer(max_features=2000)),
                ('lgr', LogisticRegression())])

In [64]:
grid.score(X_test, y_test)

0.9798994974874372

In [66]:
grid

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('word_data_maker', CountVectorizer()),
                                       ('lgr', LogisticRegression())]),
             param_grid={'lgr__C': [0.001, 1.0],
                         'word_data_maker__max_features': [100, 500, 1000,
                                                           2000],
                         'word_data_maker__stop_words': ['english', None]})

### Evaluate

### Exercise

Build a model to classify a person's status in WhatsApp.  The data is in the files `Emotion(happy).csv` and `Emotion(angry).csv`.  Join the datasets and use the count vectorizer to transform the data, then compare the performance of different classifiers on the test data.  Finally, perform a grid search over parameters of the `CountVectorizer` to see if adjusting how the text is presented improves the scoring.