In [5]:
!pip install scikit-learn



In [67]:
import sklearn
from sklearn.datasets import fetch_20newsgroups
# Number of newsgroup classes to work with; it is used as a key into
# categories_dict (defined below), which has entries for 2, 4 and 20.
num_categories = 20


In [68]:
# Names of all 20 newsgroups.
newsgroup_all_categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

# Several versions of the task: the small subsets are handy as a quick sanity
# check, the full list is the complete 20-class problem.
# Every subset is keyed by its own size, so categories_dict[n] is an n-class task.
categories_dict = {
    len(subset): subset
    for subset in (
        ['soc.religion.christian', 'sci.med'],
        ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'],
        newsgroup_all_categories,
    )
}

In [69]:
# Load the training split, restricted to the categories selected by
# num_categories; the fixed random_state keeps the shuffled order repeatable.
train_set = fetch_20newsgroups(
    subset='train',
    categories=categories_dict[num_categories],
    shuffle=True,
    random_state=42,
)


In [70]:
# Category names of the loaded split; the integer labels index into this list.
train_set.target_names


['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [71]:
# Number of training documents.
len(train_set.data)


11314

In [72]:
# There is one source-file path per document, so the two sequences must line
# up. A bare `len(...)` here would be evaluated and then thrown away, because
# a cell only displays its final expression; asserting makes the line useful.
assert len(train_set.filenames) == len(train_set.data)

# Path of the file the first training document was read from.
train_set.filenames[0]


'/Users/M919315/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994'

In [73]:
import pandas as pd

# One row per training document: the raw text next to its integer class label.
df = pd.DataFrame({'text': train_set.data, 'label': train_set.target})

df

Unnamed: 0,text,label
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14
...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [74]:
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs("data", exist_ok=True)

# Written tab-separated and without the index column; the reload cell further
# down reads the file back with the same separator.
df.to_csv(f"data/newsgroup_train_{num_categories}_classes.csv", index=False, sep='\t')

In [75]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a bag-of-words vectorizer on the training documents and turn them into
# a document-term count matrix (one row per document, one column per term).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train_set.data)
X_train_counts.shape

(11314, 130107)

In [76]:
from sklearn.feature_extraction.text import TfidfTransformer

# Term frequencies only: the IDF re-weighting is switched off here.
# NOTE: X_train_tf is not used by any later cell visible in this notebook.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(11314, 130107)

In [77]:
# Full TF-IDF weighting, learned from the same count matrix and then applied
# to it; the shape is unchanged because only the cell values are re-weighted.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(11314, 130107)

In [78]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features, using
# the integer newsgroup labels of the training split as targets.
clf = MultinomialNB().fit(X_train_tfidf, train_set.target)

In [79]:
# Quick smoke test of the classifier on two hand-written sentences.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
# New text goes through the already-fitted vectorizer and TF-IDF transformer
# (transform only, no refitting) so its columns match the training features.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Predictions are integer class indices; map them back to newsgroup names.
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, train_set.target_names[category]))


'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => rec.autos


In [80]:
from sklearn.pipeline import Pipeline
# Chain the three steps used above (counts -> TF-IDF -> naive Bayes) into one
# estimator so raw text can be passed straight to fit/predict.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [81]:
# Fit the whole pipeline on the raw training documents and their labels.
text_clf.fit(train_set.data, train_set.target)


In [82]:
import numpy as np
# Load the held-out test split for the same categories, with the same
# shuffling seed as the training split.
test_set = fetch_20newsgroups(
    subset='test',
    categories=categories_dict[num_categories],
    shuffle=True,
    random_state=42,
)

# Classify every test document with the fitted pipeline.
docs_test = test_set.data
predicted = text_clf.predict(docs_test)

# Accuracy: the fraction of documents whose predicted label equals the truth.
np.mean(predicted == test_set.target)


0.7738980350504514

In [83]:
import os

# One row per test document: the raw text next to its integer class label.
# NOTE: this rebinds `df`, which held the training frame until now.
df = pd.DataFrame({'text': test_set.data, 'label': test_set.target})

# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing (a no-op when it is already there).
os.makedirs("data", exist_ok=True)
df.to_csv(f"data/newsgroup_test_{num_categories}_classes.csv", index=False, sep='\t')

In [84]:
# Predicted label indices for the test documents, as produced by the pipeline.
predicted

array([ 7, 11,  0, ...,  9,  3, 15])

In [85]:
# Ground-truth label indices of the test documents, for comparison with the
# predictions shown in the previous cell.
test_set.target

array([ 7,  5,  0, ...,  9,  6, 15])

In [86]:

import os

# numpy is imported here as well: this cell declares every other dependency it
# uses, and np.mean below would otherwise only work because an earlier cell
# happened to import numpy.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Round-trip check: rebuild and score the classifier from the exported TSV
# files instead of the in-memory dataset. The result should match the accuracy
# of the in-memory run above.
# NOTE: num_categories still comes from the configuration cell near the top.
data_dir = os.path.join(os.getcwd(), 'data')


train_df = pd.read_csv(os.path.join(data_dir, f"newsgroup_train_{num_categories}_classes.csv"), sep='\t')
test_df = pd.read_csv(os.path.join(data_dir, f"newsgroup_test_{num_categories}_classes.csv"), sep='\t')

# Same three-step pipeline as before; this rebinds text_clf to a fresh,
# unfitted instance.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(train_df["text"], train_df["label"])

docs_test = test_df["text"]
predicted = text_clf.predict(docs_test)

# Accuracy on the reloaded test split.
np.mean(predicted == test_df["label"])



0.7738980350504514