### Text Regularization

In [1]:
from sklearn.datasets import fetch_20newsgroups

news_data = fetch_20newsgroups(subset='all', random_state=156)

In [2]:
print(news_data.keys())

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


In [5]:
import pandas as pd

print('target class and distribution\n', pd.Series(news_data.target).value_counts().sort_index())
print('target class name:\n', news_data.target_names)

target class and distribution
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
target class name:
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
# It would be necessary to delete all non-important info such as author, subject...etc
# Just collect the contents only.
# If the header or footer is inclued, any ML algorithm would work well... 

print(news_data.data[0])

From: egreen@east.sun.com (Ed Green - Pixel Cruncher)
Subject: Re: Observation re: helmets
Organization: Sun Microsystems, RTP, NC
Lines: 21
Distribution: world
Reply-To: egreen@east.sun.com
NNTP-Posting-Host: laser.east.sun.com

In article 211353@mavenry.altcit.eskimo.com, maven@mavenry.altcit.eskimo.com (Norman Hamer) writes:
> 
> The question for the day is re: passenger helmets, if you don't know for 
>certain who's gonna ride with you (like say you meet them at a .... church 
>meeting, yeah, that's the ticket)... What are some guidelines? Should I just 
>pick up another shoei in my size to have a backup helmet (XL), or should I 
>maybe get an inexpensive one of a smaller size to accomodate my likely 
>passenger? 

If your primary concern is protecting the passenger in the event of a
crash, have him or her fitted for a helmet that is their size.  If your
primary concern is complying with stupid helmet laws, carry a real big
spare (you can put a big or small head in a big helmet, bu

In [11]:
from sklearn.datasets import fetch_20newsgroups

# subset = 'train' to collect train data only, remove = ('headers', 'footers', 'quotes')
train_news = fetch_20newsgroups(subset = 'train', remove=('headers', 'footers', 'quotes'), random_state=156)
X_train = train_news.data
y_train = train_news.target
print(type(X_train))

# subset = 'test' to collect test data only
test_news = fetch_20newsgroups(subset = 'test', remove=('headers', 'footers', 'quotes'), random_state=156)
X_test = test_news.data
y_test = test_news.target
print('Training data size: {0}, Testing data size: {1}'.format(len(X_train), len(X_test)))

<class 'list'>
Training data size: 11314, Testing data size: 7532


In [14]:
type(X_train)

list

### Feature vectorization, ML model training/prediction/evaluation

In [18]:
# CountVectorizer -> When fitting test data, must use the object fitted by training data
# Don't using fit_transform
from sklearn.feature_extraction.text import CountVectorizer

# CounterVectorization
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)

# Test data transform using same vectorizer object
X_test_cnt_vect = cnt_vect.transform(X_test)

print("Training data shape:", X_train_cnt_vect.shape)
print("Testing data shape:", X_test_cnt_vect.shape)

Training data shape: (11314, 101631)
Testing data shape: (7532, 101631)


In [21]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression(max_iter=20000)
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('CountVectorized Logistic Regression Accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

CountVectorized Logistic Regression Accuracy: 0.597


In [23]:
# Term Frequency - Inverse Document Frequency (TfidfVectorizer)
from sklearn.feature_extraction.text import TfidfVectorizer

# apply TF-IDF Vectorization 
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

# LogisticRegression
lr_clf = LogisticRegression(max_iter=20000)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF_IDF Logistic Regression Accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

TF_IDF Logistic Regression Accuracy: 0.674


In [29]:
# stopwords removal, ngram (1,2)
tfidf_vect = TfidfVectorizer(stop_words = 'english', ngram_range=(1,2), max_df = 300)
tfidf_vect.fit(X_train)
X_train_tfidf_vect = tfidf_vect.transform(X_train)
X_test_tfidf_vect = tfidf_vect.transform(X_test)

lr_clf = LogisticRegression(max_iter=20000, n_jobs=-1)
lr_clf.fit(X_train_tfidf_vect, y_train)
pred = lr_clf.predict(X_test_tfidf_vect)
print('TF_IDF Logistic Regression with stopwords, ngram, max_df adjustment:\n {0:.3f}'.format(accuracy_score(y_test, pred)))

TF_IDF Logistic Regression with stopwords, ngram, max_df adjustment:
 0.692


In [30]:
from sklearn.model_selection import GridSearchCV

# Optimizing C : Regularization default=1.0, the smaller the stronger regularization
params = {'C':[0.01, 0.1, 1, 5, 10]}
grid_cv_lr = GridSearchCV(lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_lr.fit(X_train_tfidf_vect, y_train)
print('Logistic Regression best C parameter:', grid_cv_lr.best_params_)

# Testing with best parameter
pred = grid_cv_lr.predict(X_test_tfidf_vect)
print('TF-IDF Vectorized optimized Logistic Regression Accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Logistic Regression best C parameter: {'C': 10}
TF-IDF Vectorized optimized Logistic Regression Accuracy: 0.701


### Scikit-learn Pipeline and Integration with GridSearchCV

In [33]:
from sklearn.pipeline import Pipeline
#TFidfVectorizer : tfidf_vect, LogisticRegression: lr_clf
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=300)),
    ('lr_clf', LogisticRegression(C=10))
])

pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
print('Pipeline (TF-IDF, LogisticRegression) accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline (TF-IDF, LogisticRegression) accuracy: 0.701


In [35]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('lr_clf', LogisticRegression(n_jobs=-1))
])

# Pipeline object name + __parameter name -> for GridSearchCV
params = { 'tfidf_vect__ngram_range': [(1,1), (1,2), (1,3)],
           'tfidf_vect__max_df': [100, 300, 700],
           'lr_clf__C': [1, 5, 10]    
}

grid_cv_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv_pipe.fit(X_train, y_train)
print(grid_cv_pipe.best_params_, grid_cv_pipe.best_score_)
pred = grid_cv_pipe.predict(X_test)

print('Pipeline+GridSearchCV accuracy: {0:.3f}'.format(accuracy_score(y_test, pred)))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
{'lr_clf__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 2)} 0.7536687914006531
Pipeline+GridSearchCV accuracy: 0.701
