# Newsgroups project

## Load data

In [2]:
from sklearn.datasets import fetch_20newsgroups

# Work with four of the newsgroups only.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Fetch the training split for those categories, shuffled with a fixed seed.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
# Check what kind of object the loader returned.
type(twenty_train)

sklearn.utils.Bunch

In [5]:
import pandas as pd

In [109]:
#twenty_train.data

In [10]:
# Show which fields the dataset object exposes.
twenty_train.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [20]:
# Report how many entries each field of the training set holds.
for field in ('data', 'filenames', 'target_names', 'target'):
    print(f'Length of {field} val = {len(twenty_train[field])}')

Length of data val = 2257
Length of filenames val = 2257
Length of target_names val = 4
Length of target val = 2257


In [22]:
# Category names; an integer label is used as an index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [23]:
# Integer class label of every training document (indexes into target_names).
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2])

## CountVectorizer() produces one row of token counts per document (email, message, etc.); the number of columns equals the number of unique tokens in the corpus

In [25]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training documents and transform them in the same call.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# Shape of the resulting count matrix.
X_train_counts.shape

(2257, 35788)

In [33]:
# Look up the vocabulary entry recorded for the token 'algorithm'.
count_vect.vocabulary_.get('algorithm')

4690

## Apply a more appropriate transform than raw counts: term frequency, optionally weighted by inverse document frequency (TF-IDF)

In [34]:
from sklearn.feature_extraction.text import TfidfTransformer

# use_idf=False switches off the inverse-document-frequency weighting,
# so this step works from term frequencies only.
tf_transformer = TfidfTransformer(use_idf=False)
tf_transformer.fit(X_train_counts)

X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

## Fit and transform in one step with fit_transform

In [36]:
# Default TfidfTransformer settings; fit and transform the counts in a single call.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# The shape is displayed so it can be compared with the count matrix.
X_train_tfidf.shape

(2257, 35788)

In [37]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features and integer labels.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

## Try some new data

In [90]:
# 
# Two unseen sentences to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Reuse the already-fitted vectorizer and TF-IDF transformer: transform only, no refit.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Map each predicted integer label back to its newsgroup name.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


In [91]:
# From the docs
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Six random integer vectors, each given its own class label.
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])

_clf = MultinomialNB()
_clf.fit(X, y)

# Class probabilities for the third sample, then for the same sample scaled by 1.21.
print(_clf.predict_proba(X[2:3]))
print(_clf.predict_proba(X[2:3]*1.21))

[[2.13554626e-30 4.43326891e-31 1.00000000e+00 5.44795018e-34
  1.18017781e-37 4.41781155e-37]]
[[1.25517960e-36 1.87299290e-37 1.00000000e+00 5.63432746e-41
  2.07518303e-45 1.02494532e-44]]


In [93]:
# Test transformers
from sklearn import preprocessing

# Small 3x3 toy matrix used to exercise a scaler.
X_tmpTrain = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

min_max_scaler = preprocessing.MinMaxScaler()
print(min_max_scaler)

# Kept for reference: fit_transform variant and the result it is expected to give.
#X_tmpTrain = min_max_scaler.fit_transform(X_tmpTrain)
#X_tmpTrain
#array([[0.5       , 0.        , 1.        ],
#       [1.        , 0.5       , 0.33333333],
#       [0.        , 1.        , 0.        ]])

# Check what fit() hands back by comparing it with the scaler itself.
ret_val = min_max_scaler.fit(X_tmpTrain)
ret_val == min_max_scaler

MinMaxScaler(copy=True, feature_range=(0, 1))


True

## Make a preprocessing pipeline

In [105]:
from sklearn.pipeline import Pipeline

# Chain vectorizer -> TF-IDF -> classifier so that raw text can be passed in directly.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)

# The pipeline handles vectorizing and weighting, so predict() takes the raw strings.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
predicted = text_clf.predict(docs_new)

for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


## Evaluate on the larger held-out test set

In [106]:
# Fetch the test split for the same categories, shuffled with the same seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

0.8348868175765646

# Language project

In [115]:
import os

from sklearn.datasets import load_files

# Directory holding the language-identification corpus.
# Set the LANG_DATA_DIR environment variable to point at a local copy; when it is
# unset, the original author's checkout path is used so existing runs are unchanged.
data_dir = os.environ.get(
    'LANG_DATA_DIR',
    '/home/jchoward/programming/gits/sklearn/scikit-learn/doc/'
    'tutorial/text_analytics/data/languages/short_paragraphs',
)
raw_data_lang = load_files(data_dir)

## CountVectorizer (character analyzer with a 2–4 n-gram range)

In [122]:
# Single characters (1-grams) are probably not enough, so count 2- to 4-character n-grams.
char_vect = CountVectorizer(analyzer='char', ngram_range=(2,4))
# NOTE(review): this rebinds X_train_counts, which earlier held the newsgroup counts.
X_train_counts = char_vect.fit_transform(raw_data_lang.data)
X_train_counts.shape

(8734, 77690)

In [124]:
# Report how many entries each field of the language corpus holds.
for field in ('data', 'filenames', 'target_names', 'target'):
    print(f'Length of {field} val = {len(raw_data_lang[field])}')

Length of data val = 8734
Length of filenames val = 8734
Length of target_names val = 11
Length of target val = 8734


In [128]:
# Label names of the language corpus; integer targets index into this list.
raw_data_lang['target_names']

['ar', 'de', 'en', 'es', 'fr', 'it', 'ja', 'nl', 'pl', 'pt', 'ru']

In [129]:
# TF-IDF transform of the character n-gram counts (IDF weighting enabled explicitly).
tf_transformer = TfidfTransformer(use_idf=True)
tf_transformer.fit(X_train_counts)

X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(8734, 77690)

# Put steps into a pipeline and test on a few sentences

In [131]:
from sklearn.pipeline import Pipeline

# Character n-gram counts -> TF-IDF -> multinomial naive Bayes, trained on the language corpus.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char', ngram_range=(2, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(raw_data_lang.data, raw_data_lang.target)

# A few hand-written sentences as a quick sanity check.
docs_new = ['God is love', 'OpenGL on the GPU is fast', 'Dónde estás?']
predicted = text_clf.predict(docs_new)

# Map each predicted integer label back to its language code.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, raw_data_lang.target_names[category]))

'God is love' => en
'OpenGL on the GPU is fast' => en
'Dónde estás?' => es


In [146]:
from sklearn.model_selection import GridSearchCV

# Fresh, unfitted naive Bayes pipeline to hand to the grid search.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char', ngram_range=(2, 4))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Search space: each character n-gram range, with and without IDF weighting.
# Keys follow the '<step name>__<parameter>' convention of the pipeline above.
parameters = {
    'vect__ngram_range': [
        (1, 1), (1, 2), (1, 3),
        (2, 3), (2, 4), (2, 5),
        (3, 5), (3, 6),
    ],
    'tfidf__use_idf': (True, False),
    #'clf__alpha': (1e-2, 1e-3),
}


In [147]:
# 5-fold cross-validated grid search over `parameters`; n_jobs=-1 requests all available cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [148]:
# Run the search on the first 1000 samples only (presumably to keep it fast -- TODO confirm).
gs_clf = gs_clf.fit(raw_data_lang.data[:1000], raw_data_lang.target[:1000])

## Best cross-validated score was 83% with the MultinomialNB classifier

In [149]:
# Score achieved by the best parameter combination found by the search.
print(gs_clf.best_score_)

# Winning value of every searched hyper-parameter, in alphabetical order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

0.8320000000000001
tfidf__use_idf: True
vect__ngram_range: (2, 4)


In [153]:
from sklearn.linear_model import SGDClassifier

# Same character-level front end, but the final step is an SGD-trained
# linear classifier with hinge loss and L2 penalty.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(analyzer='char', ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

# Search space: n-gram range, IDF weighting on/off, and the classifier's alpha.
parameters = {
    'vect__ngram_range': [
        (1, 1), (1, 2), (1, 3),
        (2, 3), (2, 4), (2, 5),
        (3, 5), (3, 6),
    ],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4),
}

In [154]:
# 5-fold cross-validated grid search for the SGD pipeline; n_jobs=-1 requests all available cores.
gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=-1)

In [155]:
# Run the search on the first 1000 samples only (presumably to keep it fast -- TODO confirm).
gs_clf = gs_clf.fit(raw_data_lang.data[:1000], raw_data_lang.target[:1000])

In [None]:
# Score achieved by the best parameter combination found by the search.
print(gs_clf.best_score_)

# Winning value of every searched hyper-parameter, in alphabetical order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))