### 1. Import each file into its own row in a dataframe…

In [198]:
import glob
x = 0
err = 0
content_list = []
for file in glob.glob('20_newsgroups/*/*'):
    theme = file.split('/')[1]
    with open(file, 'r') as f:
        try:
            content_list.append({'Theme':theme, 'Content':f.read()})
        except ValueError as e:
            err += 1
            #print('Error with file', file, "->", e)
    x += 1
#    if x > 1500:
#        break
print(x, "files read,", err, "errors.")

20417 files read, 55 errors.


In [145]:
import pandas as pd
df = pd.DataFrame(content_list)

In [146]:
df.tail()

Unnamed: 0,Content,Theme
20357,In article <930426.140835.4f1.rusnews.w165w@ma...,talk.religion.misc
20358,In article <1993Apr27.073723.18577@csis.dit.cs...,talk.religion.misc
20359,In article <1rc1f3INN7rl@emx.cc.utexas.edu> \n...,talk.religion.misc
20360,In article <1993Apr26.231845.13843@digi.lonest...,talk.religion.misc
20361,In article <C64H4w.BFH@darkside.osrhe.uoknor.e...,talk.religion.misc


### 2. Use the LabelEncoder to convert the group names to numeric labels 


In [147]:
from sklearn import preprocessing
themes_list = list(set(df['Theme']))
le = preprocessing.LabelEncoder()
le.fit(themes_list)
df['Theme_id'] = df['Theme'].apply(le.transform)
df.tail()

Unnamed: 0,Content,Theme,Theme_id
20357,In article <930426.140835.4f1.rusnews.w165w@ma...,talk.religion.misc,19
20358,In article <1993Apr27.073723.18577@csis.dit.cs...,talk.religion.misc,19
20359,In article <1rc1f3INN7rl@emx.cc.utexas.edu> \n...,talk.religion.misc,19
20360,In article <1993Apr26.231845.13843@digi.lonest...,talk.religion.misc,19
20361,In article <C64H4w.BFH@darkside.osrhe.uoknor.e...,talk.religion.misc,19


### 3. Pick out 10 words or phrases to use as manually created features. Doing an 80/20 train/test split, how well does a Naive Bayes classifier do?


In [148]:
features = [
    "God",
    "gun",
    "car",
    "sci",
    "space",
    "society",
    "mac",
    "computer",
    "chip",
    "game"
]

feature_cols = []
for word in features:
    df['has_' + word] = df['Content'].str.contains(word)
    feature_cols.append('has_' + word)
    print(word, '>', len(df) - df['has_' + word].value_counts()[0])
print("Total:", len(df))

God > 1379
gun > 1061
car > 4958
sci > 1700
space > 1028
society > 409
mac > 1651
computer > 1020
chip > 777
game > 1267
Total: 20362


In [185]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df['Theme_id'],
    test_size=.2
)

In [138]:
from sklearn import naive_bayes
clf = naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
clf.predict(X_test)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.197311068819
0.19641541861


### 4. Use a CountVectorizer to automatically create your list of features. Doing an 80/20 train/test split, how well can a Naive Bayes classifier do?


In [139]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2))

In [140]:
%time vectorizer.fit(df['Content'])

CPU times: user 21.5 s, sys: 608 ms, total: 22.1 s
Wall time: 22.1 s


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [141]:
every_single_word_features = vectorizer.transform(df['Content'])

In [186]:
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols],
    df['Theme_id'],
    test_size=.2
)
clf = naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
clf.predict(X_test)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.199152802505
0.199116130616


### 5. PUSH THAT SCORE UP! You can adjust ngrams, max_features and any other options of the vectorizer, or try a decision tree or any other type of classifier.


```
ngram_range : tuple (min_n, max_n)
    The lower and upper boundary of the range of n-values for different
    n-grams to be extracted. All values of n such that min_n <= n <= max_n
    will be used.
    
stop_words : string {'english'}, list, or None (default)
    If 'english', a built-in stop word list for English is used.  
    
max_features : int or None, default=None
    If not None, build a vocabulary that only consider the top
    max_features ordered by term frequency across the corpus.

    This parameter is ignored if vocabulary is not None.
```

In [220]:
vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=20000, stop_words='english')
# 200 -> 0.41, 0.39
# 500 -> 0.55, 0.51
# 5000 -> 0.75, 0.7
# 10000 -> 0.777, 0.71

In [221]:
%time vectorizer.fit(df['Content'])
%time every_single_word_features = vectorizer.transform(df['Content'])

CPU times: user 31.1 s, sys: 1.07 s, total: 32.1 s
Wall time: 32.2 s
CPU times: user 11.7 s, sys: 69.1 ms, total: 11.7 s
Wall time: 11.8 s


In [222]:
every_single_word_features.shape

(20362, 20000)

In [223]:
X_train, X_test, y_train, y_test = train_test_split(
    every_single_word_features,
    df['Theme_id'],
    test_size=.2
)
clf = naive_bayes.BernoulliNB()
clf.fit(X_train, y_train)
clf.predict(X_test)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
print(clf.predict(X_test))

0.78691141261
0.719616989934
[16 14  5 ..., 12 18 17]


# YEAH!!

In [230]:
sentences = [
    "TSN Sportsdesk just reported that the OTTAWA SUN has reported that",
    "I love science fiction",
    "Car, cars are my wheels",
    "Do you believe in God?",
    "Jet Set Willy is the best game ever.",
    "I wish my mum boughts me a Hummer",
    "Windows sucks, man, my computer freezed again!",
    "The probability of dying from toe cancer is higher than you think.",
    "Move hard and break stuff",
    "Basketball is the best sport ever",
    "Guns should be banned, I think",
    "Windows Mac OS hardware",
    "Basketball goals football hockey stuff"
]

In [231]:
sentences_features = vectorizer.transform(sentences)
print('Shape:', sentences_features.shape, "\n")
results = clf.predict(sentences_features)
for sentence, theme in zip(sentences, results):
    print(sentence, '->', le.inverse_transform(theme))


Shape: (13, 20000) 

TSN Sportsdesk just reported that the OTTAWA SUN has reported that -> misc.forsale
I love science fiction -> misc.forsale
Car, cars are my wheels -> misc.forsale
Do you believe in God? -> misc.forsale
Jet Set Willy is the best game ever. -> misc.forsale
I wish my mum boughts me a Hummer -> misc.forsale
Windows sucks, man, my computer freezed again! -> misc.forsale
The probability of dying from toe cancer is higher than you think. -> misc.forsale
Move hard and break stuff -> misc.forsale
Basketball is the best sport ever -> misc.forsale
Guns should be banned, I think -> misc.forsale
Windows Mac OS hardware -> misc.forsale
Basketball goals football hockey stuff -> misc.forsale


(Maybe 20000 isn't enough...)