In [88]:
# !pip install scikit-learn



In [109]:
from sklearn.datasets import fetch_20newsgroups
# Number of newsgroup categories to work with. It is used as the key into
# categories_dict (which defines entries for 2, 4 and 20) and is embedded in
# the names of the exported CSV files.
num_categories = 2


In [110]:
# Every category name of the 20-newsgroups dataset, one per line.
newsgroup_all_categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Category subsets keyed by the number of classes. The 2- and 4-class subsets
# are small versions kept for simplicity and as a sanity check; the 20-class
# entry is the complete list above (same list object, not a copy).
categories_dict = {
    2: [
        'soc.religion.christian',
        'sci.med',
    ],
    4: [
        'alt.atheism',
        'soc.religion.christian',
        'comp.graphics',
        'sci.med',
    ],
    20: newsgroup_all_categories,
}

In [111]:
# Load the training split restricted to the selected categories. Shuffling
# uses a fixed seed so the document order is the same on every run.
train_set = fetch_20newsgroups(
    subset='train',
    categories=categories_dict[num_categories],
    shuffle=True,
    random_state=42,
)


In [112]:
# Category names of the loaded subset; the integer labels in train_set.target
# are used as indices into this list.
train_set.target_names


['sci.med', 'soc.religion.christian']

In [113]:
# Number of training documents in the selected categories.
len(train_set.data)


1193

In [114]:
# Sanity check: there must be exactly one source file path per document.
# (A bare `len(...)` here would be silently discarded, because a cell only
# displays the value of its last expression.)
assert len(train_set.filenames) == len(train_set.data)

# Source file path of the first (shuffled) training document.
train_set.filenames[0]


'/Users/M919315/scikit_learn_data/20news_home/20news-bydate-train/soc.religion.christian/20619'

In [115]:
import pandas as pd

# Training split as a table: raw message text, integer label, and the
# human-readable category name looked up from the label.
df = pd.DataFrame({
    'text': train_set.data,
    'label': train_set.target,
})
df['label_name'] = df['label'].map(lambda idx: train_set.target_names[idx])
df

Unnamed: 0,text,label,label_name
0,From: mcovingt@aisun3.ai.uga.edu (Michael Covi...,1,soc.religion.christian
1,From: geb@cs.pitt.edu (Gordon Banks)\nSubject:...,0,sci.med
2,From: maridai@comm.mot.com (Marida Ignacio)\nS...,1,soc.religion.christian
3,From: cpage@two-step.seas.upenn.edu (Carter C....,1,soc.religion.christian
4,From: jkellett@netcom.com (Joe Kellett)\nSubje...,1,soc.religion.christian
...,...,...,...
1188,From: jerryb@eskimo.com (Jerry Kaufman)\nSubje...,1,soc.religion.christian
1189,From: ab961@Freenet.carleton.ca (Robert Alliso...,0,sci.med
1190,From: cmgrawbu@eos.ncsu.edu (CHRISTOPHER M GRA...,1,soc.religion.christian
1191,From: evanh@sco.COM (Evan Hunt)\nSubject: Re: ...,0,sci.med


In [116]:
import os

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists; otherwise this cell fails on a run where data/
# has not been created yet.
os.makedirs("data", exist_ok=True)

# NOTE: the file is tab-separated despite its .csv extension, so it must be
# read back with sep='\t'.
df.to_csv(f"data/newsgroup_train_{num_categories}_classes.csv", index=False, sep='\t')

In [117]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vectorizer on the training texts and convert them to a count matrix
# in one step. count_vect is kept so new documents can be transformed with the
# same fitted vectorizer later.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_set.data)
# One row per training document (see the output below).
X_train_counts.shape

(1193, 24219)

In [118]:
from sklearn.feature_extraction.text import TfidfTransformer
# Variant with inverse-document-frequency weighting switched off
# (use_idf=False), fitted and applied in two separate steps.
# NOTE(review): X_train_tf is not read by any later cell visible here; the
# classifier is trained on X_train_tfidf instead.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(1193, 24219)

In [119]:
# TF-IDF weighting with the transformer's default settings. Both the fitted
# transformer and the resulting matrix are used by later cells (classifier
# training and transforming new documents).
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1193, 24219)

In [120]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier (default hyperparameters) on the
# TF-IDF features, using the integer category labels as targets.
clf = MultinomialNB().fit(X_train_tfidf, train_set.target)

In [121]:
# Two ad-hoc sentences to classify. NOTE: comp.graphics is only part of the
# 4- and 20-class subsets, so when num_categories is 2 the OpenGL sentence
# cannot receive a graphics label and falls into one of the two loaded classes.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no re-fitting) so the features line up with what the classifier was trained on.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Print each sentence next to the name of its predicted category.
for text, label_idx in zip(docs_new, predicted):
    predicted_name = train_set.target_names[label_idx]
    print('%r => %s' % (text, predicted_name))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => sci.med


In [122]:
from sklearn.pipeline import Pipeline

# Chain the same three stages used step by step above (counting, TF-IDF
# weighting, Naive Bayes) into one estimator that accepts raw text directly.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [123]:
# Fit every stage of the pipeline on the raw training texts and their labels.
text_clf.fit(train_set.data, train_set.target)


In [124]:
import numpy as np

# Load the test split with the same categories and shuffle seed as the
# training split.
test_set = fetch_20newsgroups(
    subset='test',
    categories=categories_dict[num_categories],
    shuffle=True,
    random_state=42,
)

# Classify the raw test texts with the fitted pipeline.
docs_test = test_set.data
predicted = text_clf.predict(docs_test)

# Accuracy: fraction of test documents whose predicted label equals the true one.
np.mean(predicted == test_set.target)


0.9256926952141058

In [125]:
import os

# Build the test-split table with the same columns as the training export:
# raw text, integer label, and category name.
# NOTE: this rebinds `df`, which held the training frame until now.
df = pd.DataFrame()
df['text'] = test_set.data
df['label'] = test_set.target
df['label_name'] = df['label'].apply(lambda x: test_set.target_names[x])

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("data", exist_ok=True)

# Tab-separated despite the .csv extension; read back with sep='\t'.
df.to_csv(f"data/newsgroup_test_{num_categories}_classes.csv", index=False, sep='\t')

In [126]:
# Label indices predicted by the pipeline for the test documents.
predicted

array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,

In [127]:
# True label indices of the test documents, for comparison with `predicted`.
test_set.target

array([1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0,

In [128]:

import os

import numpy as np  # needed for np.mean below; previously relied on an earlier cell's import
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Round-trip check: reload the exported train/test files from disk, retrain
# the same pipeline on them, and measure accuracy again. The result should
# match the accuracy obtained from the in-memory data.
# `num_categories` is still taken from the configuration cell near the top.
data_dir = os.path.join(os.getcwd(), 'data')

# The exported files are tab-separated despite their .csv extension.
train_df = pd.read_csv(os.path.join(data_dir, f"newsgroup_train_{num_categories}_classes.csv"), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, f"newsgroup_test_{num_categories}_classes.csv"), sep='\t')

# Fresh, unfitted pipeline with the same three stages as before.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(train_df["text"], train_df["label"])

docs_test = test_df["text"]
predicted = text_clf.predict(docs_test)

# Accuracy on the reloaded test split.
np.mean(predicted == test_df["label"])



0.9256926952141058