# Loading data

We use the Brown corpus, whose documents are categorized by genre, as the data for text classification.

In [None]:
import nltk
from nltk.corpus import brown
# One (words, categories) pair per Brown corpus file.
corpus = [
    (brown.words(file_id), brown.categories(file_id))
    for file_id in brown.fileids()
]

In [None]:
# Display the (words, categories) pairs as loaded, before any shuffling.
corpus

In [None]:
import random
# Fix the seed so the shuffled order is the same on every run.
random.seed(0)
# Shuffle in place; the slices taken later for training and test data
# depend on this order.
random.shuffle(corpus)

In [None]:
# Display the corpus again to see the order after shuffling.
corpus

In [None]:
# Number of documents (one entry per Brown file id).
len(corpus)

### Prepare a list of documents and a list of correct *classes* (categories in this example)

Each document should be a `str` that can be separated into words by splitting on whitespace.

In [None]:
# Join each document's tokens into one whitespace-separated string.
docs = [' '.join(tokens) for tokens, _ in corpus]
# Join each document's category list into a single label string.
cats = [' '.join(labels) for _, labels in corpus]

In [None]:
# First document, as a single space-joined string.
docs[0]

In [None]:
# Category label of the first document.
cats[0]

# Vectorization

We use the machine learning library `scikit-learn`.
`CountVectorizer` transforms a document into a simple word frequency vector.

`max_df` and `min_df` are parameters for limiting vocabulary based on *document frequency (DF)*.
`max_df` specifies the upper limit on the ratio of DF to the total number of documents, because word types that appear in too many documents are unlikely to be useful for classification.
`min_df` specifies the lower limit on DF, because word types that appear in very few documents may easily lead a classifier to overfit.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Drop words found in more than 80% of the documents (max_df is a ratio)
# or in fewer than 3 documents (min_df is an absolute count).
vectorizer = CountVectorizer(max_df=0.8, min_df=3)
# Learn the vocabulary and build the document-term count matrix in one step.
# NOTE(review): this is fitted on every document, including the ones used
# as test data further down, so the vocabulary is not learned from the
# training portion only.
xs = vectorizer.fit_transform(docs)

### Features correspond to the word vocabulary in the given documents

In [None]:
# Recover the vocabulary so that position i names the word behind column i
# of the document-term matrix.  `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer `get_feature_names_out()`
# and fall back to the old method only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    # The newer method returns an array; convert it to a list to keep the
    # type the old method produced.
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()
# Vocabulary size, i.e. the number of columns in the matrix.
len(features)

### Each document is represented by a vector

In [None]:
# Print the first row of the document-term matrix (the first document).
print(xs[0])

In [None]:
# Look up the word behind feature index 9485.
# NOTE(review): the index is hard-coded, presumably copied from the output
# of the previous cell -- confirm it is still valid if the vocabulary changes.
features[9485]

### The target variable y is just a list of correct classes

In [None]:
# Use the category strings as target labels (an alias of `cats`, not a copy).
ys = cats

# Training of Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with the library's default settings.
lr = LogisticRegression()
# Train on all documents; no data is held out at this point.
lr.fit(xs, ys)

# Prediction of Class

In [None]:
# Predict a class for every document the model was just trained on.
lr.predict(xs)

In [None]:
# Compare the predictions with the correct labels (same data as training).
lr.predict(xs) == ys

In [None]:
import numpy as np
# Count how many predictions match the correct labels.
np.sum(lr.predict(xs) == ys)

In [None]:
# Accuracy on the training data: correct predictions / number of documents.
print(np.sum(ys == lr.predict(xs)) / len(ys))

# Separate test data from training data

In [None]:
# The first 450 shuffled documents are for training; the rest are held out.
xs_train, xs_test = xs[:450], xs[450:]
ys_train, ys_test = ys[:450], ys[450:]

In [None]:
# Fresh classifier, so nothing carries over from the model trained on all data.
lr = LogisticRegression()
# Train on the training portion only.
lr.fit(xs_train, ys_train)

In [None]:
# Predicted classes for the held-out documents.
print(lr.predict(xs_test))

In [None]:
# Correct classes for the held-out documents, to compare with the predictions.
print(ys_test)

In [None]:
# Compare the held-out predictions with the correct labels.
lr.predict(xs_test) == ys_test

In [None]:
# Accuracy on the held-out documents with word-count features.
print(np.sum(ys_test == lr.predict(xs_test)) / len(ys_test))

# TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Same document-frequency limits as the count vectorizer, with TF-IDF weights.
vectorizer = TfidfVectorizer(max_df=0.8, min_df=3)
# Rebinds `xs`, replacing the count matrix with the TF-IDF matrix.
# NOTE(review): this is fitted on every document, including the ones later
# sliced off as test data, so the vocabulary and IDF weights are not learned
# from the training portion only.
xs = vectorizer.fit_transform(docs)

In [None]:
# Split the TF-IDF matrix at 450; ys_train and ys_test are left unchanged.
xs_train, xs_test = xs[:450], xs[450:]

In [None]:
# Fresh classifier for the TF-IDF features.
lr = LogisticRegression()
# Train on the TF-IDF training rows with the existing training labels.
lr.fit(xs_train, ys_train)

In [None]:
# Accuracy on the held-out documents with TF-IDF features.
print(np.sum(ys_test == lr.predict(xs_test)) / len(ys_test))