In [20]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, recall_score

from sklearn.ensemble import RandomForestClassifier, VotingClassifier 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [21]:
X, y = fetch_california_housing(return_X_y=True)

X_medinc_mean = X[:,0].mean()
filter1 = X[:,0] < X_medinc_mean
cat_y = filter1.astype(int)
# print(cat_y)

X = X[:, 1:]  # exclude Medinc
X_train, X_test, y_train, y_test = train_test_split(X, cat_y, stratify=cat_y, random_state=10)

In [22]:
m = RandomForestClassifier(max_depth=3).fit(X_train, y_train)

print('train score ', m.score(X_train, y_train))
print('test score ', m.score(X_test, y_test))
# print(m.get_params())

train score  0.8104005167958657
test score  0.8073643410852713


In [23]:
est = [ ('log_reg', LogisticRegression(solver = 'liblinear', max_iter=1000)),
       ('rand_f', RandomForestClassifier(max_depth=3))
]
m = VotingClassifier(est, weights=[2, 1]).fit(X_train, y_train)  # 투표?
print('train score ', m.score(X_train, y_train))
print('test score ', m.score(X_test, y_test))

train score  0.8224160206718346
test score  0.8217054263565892


In [24]:
kf = KFold(n_splits=4)
for train_index, validation_index in kf.split(X_train):
    train_data = X_train[train_index]
    validation_data = X_train[validation_index]
    train_target = y_train[train_index]
    validation_target = y_train[validation_index]
    print(train_index)
    print(validation_index)

[ 3870  3871  3872 ... 15477 15478 15479]
[   0    1    2 ... 3867 3868 3869]
[    0     1     2 ... 15477 15478 15479]
[3870 3871 3872 ... 7737 7738 7739]
[    0     1     2 ... 15477 15478 15479]
[ 7740  7741  7742 ... 11607 11608 11609]
[    0     1     2 ... 11607 11608 11609]
[11610 11611 11612 ... 15477 15478 15479]
