In [12]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [13]:
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [14]:
le = preprocessing.LabelEncoder()

In [15]:
train = pd.read_csv('../datasets/adult.data', header=None)
train.columns = columns
X_train = train.drop(['income'], axis=1)
X_train[categorical_features] = X_train[categorical_features].astype('category')
y_train = train['income'].astype('category')
le.fit(y_train)
y_train = le.transform(y_train)

In [16]:
test = pd.read_csv('../datasets/adult.test', header=None, skiprows=1)
test.columns = columns
X_test = test.drop(['income'], axis=1)
X_test[categorical_features] = X_test[categorical_features].astype('category')
y_test = test['income'].astype('category')
le.fit(y_test)
y_test = le.transform(y_test)

In [23]:
train_data = lgb.Dataset(X_train, label=y_train,
                         categorical_feature=categorical_features)
eval_data  = lgb.Dataset(X_test, label=y_test, reference= train_data,
                         categorical_feature=categorical_features)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'verbose': 2,
}

model = lgb.train(
    params,
    train_data,
    valid_sets=eval_data,
    num_boost_round=100,
    verbose_eval=10,
)

In [32]:
y_pred = model.predict(X_train, num_iteration=model.best_iteration)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred) / len(y_train)
print(accuracy)

0.888025552040785


In [33]:
y_pred = model.predict(X_test, num_iteration=model.best_iteration)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred) / len(y_test)
print(accuracy)

# 混合行列
cm = confusion_matrix(y_test, y_pred)
print(cm)

# F値
print(f1_score(y_test, y_pred))

0.8726122474049506
[[11732   703]
 [ 1371  2475]]
0.704726651480638
