In [11]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce

In [2]:
columns = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education-num",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
    "native-country",
    "income",
]

categorical_features = [
    "workclass",
    "education",
    "marital-status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "native-country",
]

In [3]:
le_y = preprocessing.LabelEncoder()
le_x = {feature:preprocessing.LabelEncoder() for feature in categorical_features}

In [4]:
train = pd.read_csv('../datasets/adult.data', header=None)
train.columns = columns

# data
X_train = train.drop(['income'], axis=1)
X_train[categorical_features] = X_train[categorical_features].astype('category')
for feature in categorical_features:
    le_x[feature].fit(X_train[feature])
    X_train[feature] = le_x[feature].transform(X_train[feature])

# label
y_train = train['income'].astype('category')
le_y.fit(y_train)
y_train = le_y.transform(y_train)

In [5]:
test = pd.read_csv('../datasets/adult.test', header=None, skiprows=1)
test.columns = columns

# data
X_test = test.drop(['income'], axis=1)
X_test[categorical_features] = X_test[categorical_features].astype('category')
for feature in categorical_features:
    le_x[feature].fit(X_test[feature])
    X_test[feature] = le_x[feature].transform(X_test[feature])

# label
y_test = test['income'].astype('category')
le_y.fit(y_test)
y_test = le_y.transform(y_test)

In [6]:
# Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred) / len(y_train)
print(accuracy)

y_pred = model.predict(X_test)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred) / len(y_test)
print(accuracy)

# 混合行列
cm = confusion_matrix(y_test, y_pred)
print(cm)

# F値
print(f1_score(y_test, y_pred))



0.7929731887841283
0.7950371598796143
[[11834   601]
 [ 2736  1110]]
0.3994961310059384


In [8]:
# Linear SVC
model = LinearSVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred) / len(y_train)
print(accuracy)

y_pred = model.predict(X_test)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred) / len(y_test)
print(accuracy)

# 混合行列
cm = confusion_matrix(y_test, y_pred)
print(cm)

# F値
print(f1_score(y_test, y_pred))

0.2408095574460244
0.2362876973158897
[[    1 12434]
 [    0  3846]]
0.38219218920798964




In [9]:
# SVC
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred) / len(y_train)
print(accuracy)

y_pred = model.predict(X_test)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred) / len(y_test)
print(accuracy)

# 混合行列
cm = confusion_matrix(y_test, y_pred)
print(cm)

# F値
print(f1_score(y_test, y_pred))



0.9987101133257578
0.7639579878385848
[[12427     8]
 [ 3835    11]]
0.005692108667529106


In [7]:
# RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_train)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_train == y_pred) / len(y_train)
print(accuracy)

y_pred = model.predict(X_test)
y_pred[y_pred<=0.5] = 0
y_pred[y_pred>0.5] = 1

# 精度 (Accuracy) を計算する
accuracy = sum(y_test == y_pred) / len(y_test)
print(accuracy)

# 混合行列
cm = confusion_matrix(y_test, y_pred)
print(cm)

# F値
print(f1_score(y_test, y_pred))



0.9874082491323977
0.8502548983477674
[[11644   791]
 [ 1647  2199]]
0.6433586892919837


Collecting category_encoders
  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
Installing collected packages: category-encoders
Successfully installed category-encoders-2.1.0
