In [2]:
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.datasets import  make_classification
# 加载数据
print('Load data...')

iris = load_iris()
data=iris.data
target = iris.target
X_train,X_test,y_train,y_test =train_test_split(data,target,test_size=0.2)

print(X_train)
print(y_train)
# df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
# df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
# y_train = df_train[0].values
# y_test = df_test[0].values
# X_train = df_train.drop(0, axis=1).values
# X_test = df_test.drop(0, axis=1).values

print('Start training...')
# 创建模型，训练模型
gbm = lgb.LGBMRegressor(objective='regression',num_leaves=31,learning_rate=0.05,n_estimators=20)
gbm.fit(X_train, y_train,eval_set=[(X_test, y_test)],eval_metric='l1',early_stopping_rounds=5)

print('Start predicting...')
# 测试机预测
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration_)
# 模型评估
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)

# feature importances
print('Feature importances:', list(gbm.feature_importances_))

# 网格搜索，参数优化
estimator = lgb.LGBMRegressor(num_leaves=31)

param_grid = {
    'learning_rate': [0.01, 0.1, 1],
    'n_estimators': [20, 40]
}

gbm = GridSearchCV(estimator, param_grid)

gbm.fit(X_train, y_train)

print('Best parameters found by grid search are:', gbm.best_params_)

Load data...
[[4.7 3.2 1.3 0.2]
 [6.7 3.1 5.6 2.4]
 [5.1 3.8 1.5 0.3]
 [6.9 3.1 5.1 2.3]
 [5.5 2.6 4.4 1.2]
 [5.  3.2 1.2 0.2]
 [5.6 2.5 3.9 1.1]
 [4.8 3.4 1.9 0.2]
 [5.1 3.8 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [6.8 3.  5.5 2.1]
 [6.8 2.8 4.8 1.4]
 [5.7 2.9 4.2 1.3]
 [5.  3.3 1.4 0.2]
 [6.4 2.8 5.6 2.1]
 [6.7 3.  5.2 2.3]
 [6.  2.7 5.1 1.6]
 [6.9 3.1 4.9 1.5]
 [6.  2.9 4.5 1.5]
 [7.7 3.8 6.7 2.2]
 [6.7 3.3 5.7 2.5]
 [6.  3.  4.8 1.8]
 [6.2 2.2 4.5 1.5]
 [5.  3.  1.6 0.2]
 [4.9 3.6 1.4 0.1]
 [4.5 2.3 1.3 0.3]
 [5.7 3.  4.2 1.2]
 [6.  2.2 4.  1. ]
 [6.3 2.5 4.9 1.5]
 [5.4 3.9 1.7 0.4]
 [5.6 2.9 3.6 1.3]
 [6.5 3.  5.2 2. ]
 [6.1 2.9 4.7 1.4]
 [5.5 4.2 1.4 0.2]
 [6.3 2.3 4.4 1.3]
 [6.9 3.2 5.7 2.3]
 [5.5 2.4 3.8 1.1]
 [5.  3.5 1.6 0.6]
 [6.  3.4 4.5 1.6]
 [7.2 3.6 6.1 2.5]
 [6.1 3.  4.6 1.4]
 [4.6 3.2 1.4 0.2]
 [4.4 2.9 1.4 0.2]
 [5.  2.  3.5 1. ]
 [6.3 3.4 5.6 2.4]
 [5.1 3.5 1.4 0.2]
 [5.8 2.6 4.  1.2]
 [4.9 2.4 3.3 1. ]
 [6.4 3.2 5.3 2.3]
 [5.1 3.4 1.5 0.2]
 [5.6 3.  4.1 1.3]
 [6.8 3.2 5.9 2.3]



In [12]:
train_data = lgb.Dataset(X_train, label=y_train)

params = {
    'boosting_type': 'gbdt',
    'objective': 'softmax',
    'num_class':3,
    'metric': 'multi_error',
    'num_leaves': 31,
    'learning_rate': 0.05,
    #'feature_fraction': 0.9,
    #'bagging_fraction': 0.8,
    #'bagging_freq': 5,
    'verbose': 0
}
print('Starting training...')
# train

gbm = lgb.train(params,
                train_data,
               valid_sets=train_data)
                #early_stopping_rounds=5)
model_json = gbm.dump_model()


Starting training...
[1]	training's multi_error: 0.0333333
[2]	training's multi_error: 0.0333333
[3]	training's multi_error: 0.0333333
[4]	training's multi_error: 0.0333333
[5]	training's multi_error: 0.0333333
[6]	training's multi_error: 0.0333333
[7]	training's multi_error: 0.0333333
[8]	training's multi_error: 0.0416667
[9]	training's multi_error: 0.0416667
[10]	training's multi_error: 0.0333333
[11]	training's multi_error: 0.0416667
[12]	training's multi_error: 0.0333333
[13]	training's multi_error: 0.0416667
[14]	training's multi_error: 0.0333333
[15]	training's multi_error: 0.0333333
[16]	training's multi_error: 0.0333333
[17]	training's multi_error: 0.0333333
[18]	training's multi_error: 0.025
[19]	training's multi_error: 0.025
[20]	training's multi_error: 0.025
[21]	training's multi_error: 0.025
[22]	training's multi_error: 0.025
[23]	training's multi_error: 0.025
[24]	training's multi_error: 0.025
[25]	training's multi_error: 0.025
[26]	training's multi_error: 0.0166667
[27]	t

In [6]:
y_pred = gbm.predict(X_train)

print(y_pred)

for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0
print(y_pred)

[[0.99355986 0.00381378 0.00262637]
 [0.00713784 0.00485514 0.98800702]
 [0.9949986  0.00262501 0.00237639]
 [0.00695189 0.03078034 0.96226778]
 [0.00577342 0.96483849 0.02938809]
 [0.99355986 0.00381378 0.00262637]
 [0.00564067 0.98792352 0.00643581]
 [0.87909924 0.10649682 0.01440394]
 [0.9949986  0.00262501 0.00237639]
 [0.9949986  0.00262501 0.00237639]
 [0.00384904 0.00487123 0.99127974]
 [0.00558221 0.96400081 0.03041698]
 [0.00536437 0.98408301 0.01055262]
 [0.9950154  0.00295182 0.00203278]
 [0.00384756 0.00525289 0.99089955]
 [0.00384904 0.00487123 0.99127974]
 [0.01583106 0.51135795 0.472811  ]
 [0.02154397 0.89162393 0.0868321 ]
 [0.01099358 0.92906238 0.05994403]
 [0.00713784 0.00485514 0.98800702]
 [0.00713784 0.00485514 0.98800702]
 [0.01413222 0.30283133 0.68303645]
 [0.01107253 0.92708289 0.06184458]
 [0.9897727  0.00553121 0.00469609]
 [0.99568218 0.00228368 0.00203414]
 [0.98696324 0.00903116 0.0040056 ]
 [0.00804738 0.97612207 0.01583055]
 [0.0055816  0.98949392 0.00

In [9]:
print(y_train)
from sklearn.preprocessing import  OneHotEncoder

before_one_hot =  y_train.reshape([-1,1])
print(before_one_hot)
enc = OneHotEncoder()
enc.fit(before_one_hot)

one_hoted_y  = enc.transform(before_one_hot).toarray()
print(one_hoted_y.shape)


[0 2 0 2 1 0 1 0 0 0 2 1 1 0 2 2 1 1 1 2 2 2 1 0 0 0 1 1 1 0 1 2 1 0 1 2 1
 0 1 2 1 0 0 1 2 0 1 1 2 0 1 2 0 1 1 1 1 2 1 1 2 0 0 0 0 2 2 2 2 0 1 0 2 1
 2 0 0 1 0 2 2 0 2 2 2 2 1 1 1 2 2 0 1 0 2 0 0 0 0 0 0 1 0 2 2 0 1 2 2 1 1
 2 0 2 1 0 2 0 1 2]
[[0]
 [2]
 [0]
 [2]
 [1]
 [0]
 [1]
 [0]
 [0]
 [0]
 [2]
 [1]
 [1]
 [0]
 [2]
 [2]
 [1]
 [1]
 [1]
 [2]
 [2]
 [2]
 [1]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [1]
 [2]
 [1]
 [0]
 [0]
 [1]
 [2]
 [0]
 [1]
 [1]
 [2]
 [0]
 [1]
 [2]
 [0]
 [1]
 [1]
 [1]
 [1]
 [2]
 [1]
 [1]
 [2]
 [0]
 [0]
 [0]
 [0]
 [2]
 [2]
 [2]
 [2]
 [0]
 [1]
 [0]
 [2]
 [1]
 [2]
 [0]
 [0]
 [1]
 [0]
 [2]
 [2]
 [0]
 [2]
 [2]
 [2]
 [2]
 [1]
 [1]
 [1]
 [2]
 [2]
 [0]
 [1]
 [0]
 [2]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [2]
 [2]
 [0]
 [1]
 [2]
 [2]
 [1]
 [1]
 [2]
 [0]
 [2]
 [1]
 [0]
 [2]
 [0]
 [1]
 [2]]
(120, 3)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score

#classification_report(one_hoted_y, y_pred)
precision_score(one_hoted_y, y_pred,average='micro')

0.9916666666666667