In [37]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import timeit
import warnings
from sklearn.metrics import roc_curve, auc
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',
     remove=(['headers', 'footers', 'quotes']), shuffle=True, random_state=42)

In [3]:
twenty_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(11314, 101631)

In [8]:
X = X_train_tfidf
y = twenty_train.target

In [9]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, train_size = 0.75
)

In [28]:
times = []
results = []
train_results = []
val_results = []

for c in [0.5, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]:
    
    lr = LogisticRegression(C=c)
    t1 = timeit.default_timer()
    lr.fit(X_train, y_train)
    train_time = timeit.default_timer() - t1
    times.append(train_time)
    print ("Accuracy for C=%s: %s in %ssec" 
           % (c, accuracy_score(y_val, lr.predict(X_val)), train_time))

Accuracy for C=0.5: 0.7013078826440439 in 6.9224375389999295sec
Accuracy for C=1: 0.7218098267939201 in 10.300244334000013sec
Accuracy for C=2: 0.7355956168257335 in 8.646985895999933sec
Accuracy for C=3: 0.7405443619653588 in 9.54251289900003sec
Accuracy for C=4: 0.7465535524920467 in 8.621900561999837sec
Accuracy for C=5: 0.7462000706963592 in 8.978548381999872sec
Accuracy for C=6: 0.7454931071049841 in 9.755048044999967sec
Accuracy for C=7: 0.7469070342877342 in 12.970619820000138sec
Accuracy for C=8: 0.7469070342877342 in 15.227809146000027sec
Accuracy for C=9: 0.7465535524920467 in 15.42684946899999sec
Accuracy for C=10: 0.744079179922234 in 16.510610856999847sec
Accuracy for C=11: 0.7444326617179216 in 15.795522042999892sec
Accuracy for C=12: 0.744079179922234 in 15.564518612000029sec


In [15]:
final_model = LogisticRegression(C=7)
final_model.fit(X, y)



LogisticRegression(C=7, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
twenty_test = fetch_20newsgroups(subset='test',
     remove=(['headers', 'footers', 'quotes']), shuffle=True, random_state=42)

In [23]:
count_vect = CountVectorizer()
X_test_counts = count_vect.fit_transform(twenty_test.data)
X_test_counts.shape
tf_transformer = TfidfTransformer(use_idf=False).fit(X_test_counts)
X_test_tf = tf_transformer.transform(X_test_counts)
tfidf_transformer = TfidfTransformer()
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)
X_test_tfidf.shape

(7532, 71018)

In [34]:
text_clf = Pipeline([
     ('vect', CountVectorizer()),
     ('tfidf', TfidfTransformer()),
     ('lr', LogisticRegression(C=7))])

text_clf.fit(twenty_train.data, twenty_train.target)




Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('lr',
                 LogisticRegression(C=7, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
     

In [36]:
twenty_test = fetch_20newsgroups(subset='test',
        remove=(['headers', 'footers', 'quotes']), shuffle=True, random_state=42)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)

0.6887944768985661

In [38]:
from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
     target_names=twenty_test.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.51      0.48      0.49       319
           comp.graphics       0.65      0.71      0.68       389
 comp.os.ms-windows.misc       0.65      0.62      0.63       394
comp.sys.ibm.pc.hardware       0.65      0.65      0.65       392
   comp.sys.mac.hardware       0.75      0.69      0.72       385
          comp.windows.x       0.83      0.69      0.75       395
            misc.forsale       0.77      0.80      0.79       390
               rec.autos       0.71      0.71      0.71       396
         rec.motorcycles       0.77      0.77      0.77       398
      rec.sport.baseball       0.54      0.82      0.65       397
        rec.sport.hockey       0.88      0.87      0.87       399
               sci.crypt       0.83      0.69      0.76       396
         sci.electronics       0.60      0.61      0.60       393
                 sci.med       0.77      0.75      0.76       396
         