In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the 20 Newsgroups dataset with the loader's default arguments
# (no category filter is applied at this point).
newsgroups = fetch_20newsgroups()

In [None]:
# List the fields exposed by the loaded dataset object.
newsgroups.keys()

In [None]:
#print(newsgroups.DESCR)

In [None]:
# Category names of the loaded dataset.
newsgroups.target_names

In [None]:
# Print the raw text of the first post.
print(newsgroups.data[0])

In [None]:
# Translate the first post's integer label into its category name.
newsgroups.target_names[newsgroups.target[0]]

In [None]:
# Narrow the dataset down to four newsgroups.
categories = [
    'talk.politics.guns',
    'talk.religion.misc',
    'comp.graphics',
    'rec.autos',
]

# Reload with the category filter; this rebinds `newsgroups` to the subset.
newsgroups = fetch_20newsgroups(categories=categories)

# Label names as reported by the loader itself; integer targets are looked up
# in this list. NOTE(review): its order may differ from `categories` above --
# confirm before indexing `categories` with a target value.
target_names = newsgroups.target_names

In [None]:
# Shape of the label array after restricting to the selected categories.
newsgroups.target.shape

In [None]:
# Print the raw text of the first post in the filtered dataset.
print(newsgroups.data[0])

In [None]:
# Category name of the first post in the filtered dataset.
target_names[newsgroups.target[0]]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the posts for evaluation; the fixed random_state keeps the
# split identical between runs.
data_train, data_test, target_train, target_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shape of the full label array, for comparison with the split sizes below.
newsgroups.target.shape

In [None]:
# Shape of the training-label array.
target_train.shape

In [None]:
# Shape of the test-label array.
target_test.shape

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy example: learn a vocabulary from two short documents.
vectorizer = CountVectorizer()
vectorizer.fit(['I am teapot', 'Short and stout'])

# Show the learned vocabulary. `get_feature_names()` was removed from
# scikit-learn (1.2); `get_feature_names_out()` is its replacement.
vectorizer.get_feature_names_out()

In [None]:
# Count matrix for two new sentences, expressed over the vocabulary fitted on
# the toy documents. Column labels come from `get_feature_names_out()`, the
# replacement for the removed `get_feature_names()`.
pd.DataFrame(
    data=vectorizer.transform(['I am a little teapot', 'Short and short']).toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [None]:
# Learn the vocabulary from the training posts only and build their
# document-term count matrix in one pass.
tf_vectorizer = CountVectorizer()
tf_train = tf_vectorizer.fit_transform(data_train)
# Display the shape of the resulting matrix.
tf_train.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes classifier on the training count matrix.
# `fit` returns the estimator itself, so construction and training are chained.
clf = MultinomialNB().fit(tf_train, target_train)
clf

In [None]:
from sklearn import metrics

# Accuracy on the very data the classifier was trained on (an optimistic
# figure; the held-out evaluation follows in the next cell).
pred = clf.predict(tf_train)
metrics.accuracy_score(y_true=target_train, y_pred=pred)

In [None]:
# Accuracy on the held-out posts. The test text is only transformed with the
# vocabulary learned from the training split (no re-fitting).
pred = clf.predict(
    tf_vectorizer.transform(data_test)
)
metrics.accuracy_score(y_true=target_test, y_pred=pred)

In [None]:
def show_most_informative_features(classifier, vectorizer, categories, n_top=10):
    """Print the highest-weighted vocabulary terms for each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose a 2-D array of per-class feature weights, either as
        ``feature_log_prob_`` (naive Bayes models) or, when that attribute is
        absent, as ``coef_``. Row ``i`` is used for ``categories[i]``.
    vectorizer : fitted vectorizer
        Supplies the vocabulary through ``get_feature_names_out()``; the older
        ``get_feature_names()`` is used only when the newer method is missing.
    categories : sequence of str
        Display names, in the same order as the classifier's weight rows.
    n_top : int, default 10
        Number of terms printed per category.

    Returns
    -------
    None
        One line per category is written to stdout in the form
        ``"<category>: <term>, <term>, ..."``; terms are in ascending weight
        order, so the strongest term comes last.
    """
    # `MultinomialNB.coef_` was deprecated and later removed from scikit-learn;
    # `feature_log_prob_` holds the per-class weights. Fall back to `coef_`
    # so estimators that only define `coef_` keep working.
    weights = getattr(classifier, "feature_log_prob_", None)
    if weights is None:
        weights = classifier.coef_
    weights = np.asarray(weights)

    # `get_feature_names()` was removed from scikit-learn; prefer the `_out`
    # variant and fall back only for older vectorizers.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Build the vocabulary array once instead of once per category.
    feature_names = np.asarray(get_names())

    for i, c in enumerate(categories):
        order = np.argsort(weights[i])
        # Keep the last `n_top` indices; slicing from a clamped start avoids
        # the `[-0:]` pitfall, which would select every term when n_top == 0.
        top = order[max(len(order) - n_top, 0):]
        print("%s: %s" % (c, ", ".join(feature_names[top])))

In [None]:
# Top terms per category for the count-vectorizer + naive Bayes model trained above.
show_most_informative_features(clf, tf_vectorizer, target_names)

In [None]:
from sklearn.pipeline import make_pipeline

# Bundle vectorisation and classification into one estimator so raw text can be
# passed straight to fit/predict/score. English stop words are dropped this time.
model = make_pipeline(CountVectorizer(stop_words='english'), MultinomialNB())
model.fit(data_train, target_train)

In [None]:
# Top terms per category for the stop-word-filtered pipeline; the fitted steps
# are retrieved by the names make_pipeline generated for them.
show_most_informative_features(
    classifier=model.named_steps['multinomialnb'],
    vectorizer=model.named_steps['countvectorizer'],
    categories=target_names,
)

In [None]:
# Score the pipeline on the held-out raw posts.
model.score(data_test, target_test)

In [None]:
# Same four categories, but with headers, footers and quoted replies stripped
# from every post by the loader.
newsgroups_clean = fetch_20newsgroups(
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Print the first stripped post for inspection.
print(newsgroups_clean.data[0])

In [None]:
# Split the stripped posts with the same test_size and random_state that were
# used for the raw posts.
data_train_clean, data_test_clean, target_train_clean, target_test_clean = train_test_split(
    newsgroups_clean.data,
    newsgroups_clean.target,
    test_size=0.2,
    random_state=42)

In [None]:
# Inspect the first stripped test post: its text, its integer label, and the
# corresponding category name.
print(data_test_clean[0])
print(target_test_clean[0])
print(target_names[target_test_clean[0]])

In [None]:
# At this point `model` is still the pipeline fitted on the raw posts;
# predict the label of a single stripped post with it.
model.predict([data_test_clean[0]])

In [None]:
# Score the raw-trained pipeline on the stripped test posts.
model.score(data_test_clean, target_test_clean)

In [None]:
#pred = model.predict(data_test_clean)
#metrics.f1_score(target_test_clean, pred, average='macro')

In [None]:
# Rebuild the same stop-word-filtered pipeline and train it on the stripped
# posts; this rebinds `model`. `fit` returns the pipeline, so the trailing
# `model` displays the same object the original `model.fit(...)` line showed.
model = make_pipeline(
    CountVectorizer(stop_words='english'), MultinomialNB()
).fit(data_train_clean, target_train_clean)
model

In [None]:
# Score the pipeline retrained on stripped posts against the stripped test posts.
model.score(data_test_clean, target_test_clean)

In [None]:
# Top terms per category for the pipeline trained on stripped posts.
show_most_informative_features(
    classifier=model.named_steps['multinomialnb'],
    vectorizer=model.named_steps['countvectorizer'],
    categories=target_names,
)

In [None]:
# Dependencies for the topic-model visualisation.
# NOTE(review): newer pyLDAvis releases (3.4+) reportedly renamed
# `pyLDAvis.sklearn` to `pyLDAvis.lda_model` -- confirm against the installed
# version; both this import and the later `pyLDAvis.sklearn.prepare` call
# would need to change together.
import pyLDAvis
import pyLDAvis.sklearn
# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# Bag-of-words counts over all stripped posts (no train/test split here), with
# English stop words removed. This rebinds `tf_vectorizer`.
# Optional CountVectorizer settings that were left disabled:
#   token_pattern = r'\b[a-zA-Z]{3,}\b', max_df = 0.5, min_df = 10
tf_vectorizer = CountVectorizer(stop_words='english')
tf = tf_vectorizer.fit_transform(newsgroups_clean.data)

# Fit a 10-component LDA model; the fixed random_state keeps runs repeatable.
lda = LatentDirichletAllocation(n_components=10, random_state=42).fit(tf)

# Last expression of the cell: the prepared pyLDAvis visualisation.
# NOTE(review): `pyLDAvis.sklearn` may not exist in newer pyLDAvis releases --
# confirm against the installed version.
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

In [None]:
# Category names of the four-newsgroup subset, shown for reference.
target_names