# Newsgroups_classification_simple_TF

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from pprint import pprint

import tensorflow as tf
import tensorflow.contrib.slim as slim

import warnings
warnings.filterwarnings('always')

## 1. Data Loading and processing
TfidfVectorizer만 사용

In [2]:
# The four newsgroups that serve as the classes of this experiment.
categories = ['rec.sport.baseball', 'soc.religion.christian', 'comp.windows.x', 'sci.space']

# Options shared by the train and test fetch: the same category subset, with
# headers, footers and quoted replies removed from every post.
fetch_options = dict(remove=('headers', 'footers', 'quotes'), categories=categories)
newsgroups_train = fetch_20newsgroups(subset='train', **fetch_options)
newsgroups_test = fetch_20newsgroups(subset='test', **fetch_options)

# Raw documents and their class ids for each split.
X_train, Y_train = newsgroups_train.data, newsgroups_train.target
X_test, Y_test = newsgroups_test.data, newsgroups_test.target

### 1.1 Declare the TF-IDF vectorizer

In [3]:
# Only the TF-IDF vectorizer is used in this notebook.
# Terms that appear in fewer than MIN_DOC_FREQ training documents are dropped
# from the vocabulary.
MIN_DOC_FREQ = 40
tfidf_vectorizer = TfidfVectorizer(min_df=MIN_DOC_FREQ)

### 1.2 Fitting vectorizer to the training set

In [4]:
# Learn the vocabulary and IDF weights from the raw training documents only.
# The documents are read from newsgroups_train.data rather than X_train:
# X_train is rebound to a dense feature matrix in a later cell, so fitting on
# it would break when this cell is re-run after that point.
tfidf_vectorizer = tfidf_vectorizer.fit(newsgroups_train.data)

### 1.3 Transform X_train and X_test using the TF-IDF vectorizer

In [5]:
# Project both splits onto the vocabulary learned from the training set.
# The raw text is taken from the fetched bunches rather than X_train / X_test,
# because those names are rebound to dense matrices in a later cell; this keeps
# the cell re-runnable at any point.
X_train_tfidf = tfidf_vectorizer.transform(newsgroups_train.data)
X_test_tfidf = tfidf_vectorizer.transform(newsgroups_test.data)

### 1.4 Convert sparse matrix into dense matrix

In [6]:
# Convert the sparse TF-IDF matrices to dense arrays so they can be fed to the
# TensorFlow placeholders.
# NOTE: this rebinds X_train / X_test, which held the raw documents until now.
X_train, X_test = (sparse.toarray() for sparse in (X_train_tfidf, X_test_tfidf))

In [7]:
# Number of examples in each split, taken from the label vectors.
num_train, num_test = len(Y_train), len(Y_test)

print('Number of tranining points', num_train)
print('Number of test points', num_test)

Number of tranining points 2382
Number of test points 1584


In [8]:
# Feature dimension: the number of columns of the dense training matrix.
dim_X = X_train.shape[-1]
print('Dimension of X : %d' % dim_X)

Dimension of X : 891


In [9]:
# The set of classes defines the size of the model's output layer, so it is
# derived from the training labels; a class that is missing from the test split
# must still get an output unit.
labels = np.unique(Y_train)
print('Labels : ',labels)

Labels :  [0 1 2 3]


## 2. Fitting classifiers with TF-IDF vectorizer and Tensorflow

### 2.1 Placeholder
- Shape of the placeholder for inputs : [batch_size, dim_X]
- Shape of the placeholder for outputs : [batch_size]

In [10]:
# Feature batch: one dim_X-long float row per example; batch size is left open.
X = tf.placeholder(dtype=tf.float32, shape=[None, dim_X], name='Inputs')
# Integer class id of every example in the batch.
Y = tf.placeholder(dtype=tf.int32, shape=[None], name='Labels')

### 2.2 Build the model
- TF-Slim을 이용하여 아주 간단하게 모델을 선언
- https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/slim

In [11]:
def fully_connected(inputs, num_labels, hidden_sizes=[100, 100], scope='FCN'):
    """
    [fully_connected] Build a feed-forward network with TF-Slim: one ReLU layer
    per entry of `hidden_sizes`, followed by a linear output layer.

    [Args]
      - inputs: tensor fed to the first layer (this notebook passes a
        [batch_size, dim_X] placeholder)
      - num_labels: number of units of the final linear layer (one logit per class)
      - hidden_sizes: a list with the number of hidden units of each hidden
        layer, in order; it is only read, never modified
      - scope: name of the variable scope wrapping every layer (default "FCN");
        use a different name for each network built in the same graph

    [Returns]
      The output of the last layer without activation (the logits).
    """
    # Create all layers under `scope` so that their variables are namespaced
    # (e.g. "FCN/fc0/weights"). default_name covers the scope=None case.
    with tf.variable_scope(scope, default_name='FCN'):
        # Defaults shared by every slim.fully_connected call below.
        # Note: weights_regularizer only registers the L2 terms in the graph's
        # regularization-loss collection; they influence training only if the
        # caller adds them to the loss being minimized.
        with slim.arg_scope([slim.fully_connected],
                            activation_fn=tf.nn.relu,
                            weights_initializer=tf.contrib.layers.xavier_initializer(),
                            biases_initializer=tf.constant_initializer(0.0),
                            weights_regularizer=slim.l2_regularizer(0.05)):
            net = inputs
            for i, num_hidden in enumerate(hidden_sizes):
                net = slim.fully_connected(inputs=net, num_outputs=num_hidden, scope='fc' + str(i))
            # Output layer: linear (activation_fn=None overrides the ReLU default).
            net = slim.fully_connected(inputs=net, num_outputs=num_labels, activation_fn=None, scope='logits')

    return net

In [12]:
# Instantiate the network on the input placeholder, with one output unit per class.
logits = fully_connected(
    inputs=X,
    num_labels=len(labels),
    hidden_sizes=[100, 100],
    scope='FCN',
)

### 2.3 Cost function and optimizer

In [13]:
# Cross-entropy of every example against its integer class id, averaged over the batch.
per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y, logits=logits)
cost = tf.reduce_mean(per_example_loss)

# NOTE(review): the L2 weights_regularizer declared in fully_connected is not
# added to `cost`, so the optimizer minimizes plain cross-entropy. Confirm
# whether the regularization losses were meant to be included.
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train_op = optimizer.minimize(loss=cost)

### 2.4 Predicting operator

In [14]:
# Predicted class id: index of the largest logit in each row.
predict = tf.argmax(input=logits, axis=1)

# An example counts as correct when its true label is the top-1 logit;
# accuracy is the mean of those 0/1 hits.
correct_prediction = tf.nn.in_top_k(predictions=logits, targets=Y, k=1)
hits = tf.cast(correct_prediction, tf.float32)
accuracy = tf.reduce_mean(hits)

### 2.5 Run

In [15]:
# Training schedule: number of passes over the training set and examples per mini-batch.
NUM_EPOCHS, BATCH_SIZE = 100, 20

In [16]:
# 결과를 저장할 리스트를 작성
train_cost_list = list()
test_cost_list = list()
test_accuracy_list = list()

In [17]:
with tf.Session() as sess:
    # Variable initialization
    sess.run(tf.global_variables_initializer())

    # Start offset of every mini-batch. The end offset is computed per batch and
    # clipped to num_train, so the last (possibly shorter) batch is trained on
    # too instead of being silently dropped.
    start_idx = range(0, num_train, BATCH_SIZE)

    NUM_BATCHES = len(start_idx)

    for epoch in range(0, NUM_EPOCHS):
        # Running sum of per-example costs for this epoch
        train_cost = 0.0

        # Training phase
        for start in start_idx:
            end = min(start + BATCH_SIZE, num_train)

            # Construct the input batch
            batch_xs = X_train[start:end]
            batch_ys = Y_train[start:end]

            # Run one optimization step and fetch the batch cost
            tmp_cost, _ = sess.run([cost, train_op], feed_dict={X: batch_xs, Y: batch_ys})
            # `cost` is a batch mean; weight it by the batch size so that the
            # epoch figure is a true per-example average even with a short last batch.
            train_cost += tmp_cost * (end - start)

        train_cost = train_cost / num_train
        train_cost_list.append(train_cost)
        print("[{} epoch] training cost {: 0.4f}".format((epoch + 1), train_cost))

        # Check test performance every 10 epochs
        if (epoch + 1) % 10 == 0:
            test_cost, test_accuracy = sess.run([cost, accuracy], feed_dict={X: X_test, Y: Y_test})
            test_cost_list.append(test_cost)
            test_accuracy_list.append(test_accuracy)
            print("\t[{} epoch] test accuracy {:0.4f}".format((epoch + 1), test_accuracy))

    # Test phase: final predictions and accuracy on the whole test set
    Y_test_hat, test_accuracy = sess.run([predict, accuracy], feed_dict={X: X_test, Y: Y_test})
    print("\n")
    print("Test accuracy : {:0.4f}".format(test_accuracy))

[1 epoch] training cost  0.5378
[2 epoch] training cost  0.1494
[3 epoch] training cost  0.0748
[4 epoch] training cost  0.0561
[5 epoch] training cost  0.0741
[6 epoch] training cost  0.0613
[7 epoch] training cost  0.0719
[8 epoch] training cost  0.0497
[9 epoch] training cost  0.0483
[10 epoch] training cost  0.0519
	[10 epoch] test accuracy 0.8068
[11 epoch] training cost  0.0518
[12 epoch] training cost  0.0439
[13 epoch] training cost  0.0342
[14 epoch] training cost  0.0309
[15 epoch] training cost  0.0308
[16 epoch] training cost  0.0308
[17 epoch] training cost  0.0309
[18 epoch] training cost  0.0307
[19 epoch] training cost  0.0307
[20 epoch] training cost  0.0306
	[20 epoch] test accuracy 0.8081
[21 epoch] training cost  0.0306
[22 epoch] training cost  0.0306
[23 epoch] training cost  0.0306
[24 epoch] training cost  0.0306
[25 epoch] training cost  0.0306
[26 epoch] training cost  0.0305
[27 epoch] training cost  0.0305
[28 epoch] training cost  0.0305
[29 epoch] training

In [18]:
# Predicted class ids for the test set, fetched from the `predict` op in the final session run.
Y_test_hat

array([1, 3, 2, ..., 3, 2, 3])

In [19]:
# True class ids of the test set, shown for comparison with the predictions.
Y_test

array([2, 3, 2, ..., 3, 2, 3])

In [20]:
# Confusion matrix of the final predictions: rows are the true classes, columns
# the predicted classes. Uses the `metrics` module imported in the first cell
# instead of a late import in this cell.
cm = metrics.confusion_matrix(Y_test, Y_test_hat)

print(cm)

[[325  23  34  13]
 [ 19 328  28  22]
 [ 21  44 292  37]
 [ 13  30  20 335]]
