# 인구조사 데이터

수입이 50000 달러가 넘는지 여부에 따라 이진 분류

In [4]:
import tempfile
# import urllib # python 2.x
import urllib.request
import pandas as pd

COLUMNS = ["age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"]

df_train = pd.read_csv('adult.data', names=COLUMNS, skipinitialspace=True)
df_test = pd.read_csv('adult.test', names=COLUMNS, skipinitialspace=True, skiprows=1)

In [5]:
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


수입이 50,000달러가 넘는다면 1, 아니면 0의 값을 가지는 label column 생성

In [6]:
LABEL_COLUMN = "label"
df_train[LABEL_COLUMN] = (df_train['income_bracket'].apply(lambda x: ">50K" in x)).astype(int)
df_test[LABEL_COLUMN] = (df_test['income_bracket'].apply(lambda x: ">50K" in x)).astype(int)

데이터 타입에 따라 컬럼 분류(categorical 인지 continuous 인지)

In [7]:
CATEGORICAL_COLUMNS = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "gender", "native_country"]
CONTINUOUS_COLUMNS = ["age", "education_num", "capital_gain", "capital_loss", "hours_per_week"]

### 데이터를 tensor로 변환하기
feature 와 label 로 data를 분류

In [30]:
import tensorflow as tf

def input_fn(df):
    continuous_cols = {k: tf.constant(df[k].values) for k in CONTINUOUS_COLUMNS}
    categorical_cols = {k: tf.SparseTensor(indices=[[i, 0] for i in range(df[k].size)],
                                           values=df[k].values,
                                           dense_shape=[df[k].size, 1])
                        for k in CATEGORICAL_COLUMNS}
    
    feature_cols = dict(continuous_cols.items() | categorical_cols.items())
    
    label = tf.constant(df[LABEL_COLUMN].values)
    
    return feature_cols, label

def train_input_fn():
    return input_fn(df_train)

def eval_input_fn():
    return input_fn(df_test)

기본 Categorical 특성 열

In [29]:
# 특성값의 범위? 를 알고 있다면 sparse_column_with_keys
gender = tf.contrib.layers.sparse_column_with_keys(column_name="gender", keys=["female", "male"])
# 특성값의 범위를 모른다면 sparse_column_with_hash_bucket
education = tf.contrib.layers.sparse_column_with_hash_bucket("education", hash_bucket_size=1000)
race = tf.contrib.layers.sparse_column_with_keys(column_name="race", keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other", "White"])
marital_status = tf.contrib.layers.sparse_column_with_hash_bucket("marital_status", hash_bucket_size=100)
relationship = tf.contrib.layers.sparse_column_with_hash_bucket("relationship", hash_bucket_size=100)
workclass = tf.contrib.layers.sparse_column_with_hash_bucket("workclass", hash_bucket_size=100)
occupation = tf.contrib.layers.sparse_column_with_hash_bucket("occupation", hash_bucket_size=1000)
native_country = tf.contrib.layers.sparse_column_with_hash_bucket("native_country", hash_bucket_size=1000)

































기초 Continuous 특성 열

In [26]:
age = tf.contrib.layers.real_valued_column("age")
education_num = tf.contrib.layers.real_valued_column("education_num")
capital_gain = tf.contrib.layers.real_valued_column("capital_gain")
capital_loss = tf.contrib.layers.real_valued_column("capital_loss")
hours_per_week = tf.contrib.layers.real_valued_column("hours_per_week")

버킷화를 통해 Continuous 특성들을 범주화 하기

In [33]:
age_buckets = tf.contrib.layers.bucketized_column(age, boundaries=[18, 25, 30, 35, 40,
45, 50, 55, 60, 65])

In [34]:
age_buckets

_BucketizedColumn(source_column=_RealValuedColumn(column_name='age', dimension=1, default_value=None, dtype=tf.float32, normalizer=None), boundaries=(18, 25, 30, 35, 40, 45, 50, 55, 60, 65))

다수의 열을 CrossedColumn으로 교차하기

In [14]:
education_x_occupation = tf.contrib.layers.crossed_column([education, occupation], hash_bucket_size=int(1e4))
age_buckets_x_race_x_occupation = tf.contrib.layers.crossed_column([age_buckets, race, occupation], hash_bucket_size=int(1e6))









### 회귀 모델 정의

In [37]:
model_dir = tempfile.mkdtemp()
m = tf.contrib.learn.LinearClassifier(feature_columns=[gender, native_country, education, occupation, workclass, marital_status],model_dir=model_dir)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.


INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_environment': 'local', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000000000D0A44E0>, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_is_chief': True, '_task_type': None, '_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_summary_steps': 100}


INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_environment': 'local', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x000000000D0A44E0>, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_is_chief': True, '_task_type': None, '_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_summary_steps': 100}


### 모델을 훈련, 평가

In [38]:
m.fit(input_fn=train_input_fn, steps=200)

Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


Instructions for updating:
Estimator is decoupled from Scikit Learn interface by moving into
separate class SKCompat. Arguments x, y and batch_size are only
available in the SKCompat class, Estimator will only accept input_fn.
Example conversion:
  est = Estimator(...) -> est = SKCompat(Estimator(...))


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Create CheckpointSaverHook.


INFO:tensorflow:Saving checkpoints for 1 into C:\Users\ADMINI~1\AppData\Local\Temp\tmp1h8d09s1\model.ckpt.


INFO:tensorflow:Saving checkpoints for 1 into C:\Users\ADMINI~1\AppData\Local\Temp\tmp1h8d09s1\model.ckpt.


INFO:tensorflow:step = 1, loss = 0.69319


INFO:tensorflow:step = 1, loss = 0.69319


INFO:tensorflow:global_step/sec: 37.0349


INFO:tensorflow:global_step/sec: 37.0349


INFO:tensorflow:step = 101, loss = 0.391665


INFO:tensorflow:step = 101, loss = 0.391665


INFO:tensorflow:Saving checkpoints for 200 into C:\Users\ADMINI~1\AppData\Local\Temp\tmp1h8d09s1\model.ckpt.


INFO:tensorflow:Saving checkpoints for 200 into C:\Users\ADMINI~1\AppData\Local\Temp\tmp1h8d09s1\model.ckpt.


INFO:tensorflow:Loss for final step: 0.380432.


INFO:tensorflow:Loss for final step: 0.380432.


<tensorflow.contrib.learn.python.learn.estimators.linear.LinearClassifier at 0xd0a4470>

In [40]:
results = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print ("%s: %s" % (key, results[key]))

INFO:tensorflow:Starting evaluation at 2017-02-24-06:29:53


INFO:tensorflow:Starting evaluation at 2017-02-24-06:29:53


INFO:tensorflow:Evaluation [1/1]


INFO:tensorflow:Evaluation [1/1]


INFO:tensorflow:Finished evaluation at 2017-02-24-06:29:54


INFO:tensorflow:Finished evaluation at 2017-02-24-06:29:54


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.828942, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.828942, auc = 0.865804, global_step = 200, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.242469, loss = 0.374666, precision/positive_threshold_0.500000_mean = 0.699962, recall/positive_threshold_0.500000_mean = 0.482839


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.828942, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.828942, auc = 0.865804, global_step = 200, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.242469, loss = 0.374666, precision/positive_threshold_0.500000_mean = 0.699962, recall/positive_threshold_0.500000_mean = 0.482839






accuracy: 0.828942
accuracy/baseline_label_mean: 0.236226
accuracy/threshold_0.500000_mean: 0.828942
auc: 0.865804
global_step: 200
labels/actual_label_mean: 0.236226
labels/prediction_mean: 0.242469
loss: 0.374666
precision/positive_threshold_0.500000_mean: 0.699962
recall/positive_threshold_0.500000_mean: 0.482839


### 과적화를 피하기 위한 정규화 추가

In [41]:
m = tf.contrib.learn.LinearClassifier(feature_columns=[gender, native_country, education, occupation, workclass, marital_status,],
                                      optimizer=tf.train.FtrlOptimizer(
        learning_rate=0.1,
        l1_regularization_strength=1.0,
        l2_regularization_strength=1.0),
                                      model_dir=model_dir)

INFO:tensorflow:Using default config.


INFO:tensorflow:Using default config.


INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_environment': 'local', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000000130E7DA0>, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_is_chief': True, '_task_type': None, '_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_summary_steps': 100}


INFO:tensorflow:Using config: {'_keep_checkpoint_max': 5, '_num_ps_replicas': 0, '_environment': 'local', '_evaluation_master': '', '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000000130E7DA0>, '_tf_random_seed': None, '_keep_checkpoint_every_n_hours': 10000, '_save_checkpoints_steps': None, '_task_id': 0, '_save_checkpoints_secs': 600, '_is_chief': True, '_task_type': None, '_master': '', '_tf_config': gpu_options {
  per_process_gpu_memory_fraction: 1
}
, '_save_summary_steps': 100}


In [42]:
results_modify = m.evaluate(input_fn=eval_input_fn, steps=1)
for key in sorted(results):
    print ("%s: %s" % (key, results_modify[key]))

INFO:tensorflow:Starting evaluation at 2017-02-24-06:45:46


INFO:tensorflow:Starting evaluation at 2017-02-24-06:45:46


INFO:tensorflow:Evaluation [1/1]


INFO:tensorflow:Evaluation [1/1]


INFO:tensorflow:Finished evaluation at 2017-02-24-06:45:48


INFO:tensorflow:Finished evaluation at 2017-02-24-06:45:48


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.828942, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.828942, auc = 0.865804, global_step = 200, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.242469, loss = 0.374666, precision/positive_threshold_0.500000_mean = 0.699962, recall/positive_threshold_0.500000_mean = 0.482839


INFO:tensorflow:Saving dict for global step 200: accuracy = 0.828942, accuracy/baseline_label_mean = 0.236226, accuracy/threshold_0.500000_mean = 0.828942, auc = 0.865804, global_step = 200, labels/actual_label_mean = 0.236226, labels/prediction_mean = 0.242469, loss = 0.374666, precision/positive_threshold_0.500000_mean = 0.699962, recall/positive_threshold_0.500000_mean = 0.482839






accuracy: 0.828942
accuracy/baseline_label_mean: 0.236226
accuracy/threshold_0.500000_mean: 0.828942
auc: 0.865804
global_step: 200
labels/actual_label_mean: 0.236226
labels/prediction_mean: 0.242469
loss: 0.374666
precision/positive_threshold_0.500000_mean: 0.699962
recall/positive_threshold_0.500000_mean: 0.482839
