In [21]:
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

In [18]:
train = pd.read_csv('train.csv')
bulk = pd.read_csv('machine failure.csv')
bulk = bulk.drop(columns=['UDI'])
train_df = pd.concat([train, bulk], axis=0)
test_df = pd.read_csv("test.csv")

In [12]:
train_df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,0.0,L50096,L,300.6,309.6,1596,36.1,140,0,0,0,0,0,0
1,1.0,M20343,M,302.6,312.1,1759,29.1,200,0,0,0,0,0,0
2,2.0,L49454,L,299.3,308.5,1805,26.5,25,0,0,0,0,0,0
3,3.0,L53355,L,301.0,310.9,1524,44.3,197,0,0,0,0,0,0
4,4.0,M24050,M,298.0,309.0,1641,35.4,34,0,0,0,0,0,0


In [13]:
cat_columns =  ['Type', 'Product ID']
train_df[cat_columns] = train_df[cat_columns].astype('category')

In [19]:
test_df[cat_columns] = test_df[cat_columns].astype('category')
test_df.head()

Unnamed: 0,id,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],TWF,HDF,PWF,OSF,RNF
0,136429,L50896,L,302.3,311.5,1499,38.0,60,0,0,0,0,0
1,136430,L53866,L,301.7,311.0,1713,28.8,17,0,0,0,0,0
2,136431,L50498,L,301.3,310.4,1525,37.7,96,0,0,0,0,0
3,136432,M21232,M,300.1,309.6,1479,47.6,5,0,0,0,0,0
4,136433,M19751,M,303.4,312.3,1515,41.3,114,0,0,0,0,0


In [20]:
X = train_df.drop(columns=['id', 'Machine failure'])
y = train_df['Machine failure']

In [22]:
# 層化サンプリングと交差検証を行うためのクラス
cv = StratifiedKFold(shuffle=True)
# cat_features引数には、カテゴリカルな特徴量の列番号または列名を指定
# metric_period=200は、200イテレーションごとにメトリクス（AUC）を出力することを意味する
# eval_metric='AUC'は、モデルの評価指標としてAUCを使用することを指定
cat_model = CatBoostClassifier(cat_features=cat_columns, metric_period=200, eval_metric='AUC')
# cross_val_scoreは、指定したモデルとデータセットを交差検証するための関数。
# cat_modelは評価するモデル。
# n_jobs=-1は、利用可能なすべてのCPUコアを使用して並列処理を行う
# verbose=1は、進行状況を表示するための設定
# scoring='roc_auc'は、AUCをスコアとして使用
scores = cross_val_score(cat_model, X,y,cv=cv,n_jobs=-1,verbose=1, scoring='roc_auc')
np.mean(scores)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  6.0min finished


0.9782141991817263

In [23]:
cat_model.fit(X,y)
y_prob = cat_model.predict_proba(test_df.drop(columns=['id']))
sub = test_df[['id']]
sub['Machine failure'] = y_prob[:,1]
sub

Learning rate set to 0.086628
0:	total: 556ms	remaining: 9m 15s
200:	total: 19.3s	remaining: 1m 16s
Learning rate set to 0.078755
0:	total: 665ms	remaining: 11m 4s
200:	total: 30.4s	remaining: 2m
400:	total: 55.6s	remaining: 1m 23s
600:	total: 1m 20s	remaining: 53.2s
800:	total: 1m 50s	remaining: 27.4s
999:	total: 2m 20s	remaining: 0us
Learning rate set to 0.078755
0:	total: 455ms	remaining: 7m 34s
200:	total: 30.7s	remaining: 2m 2s
400:	total: 59.1s	remaining: 1m 28s
600:	total: 1m 27s	remaining: 58.4s
800:	total: 1m 53s	remaining: 28.1s
999:	total: 2m 21s	remaining: 0us
400:	total: 33.7s	remaining: 50.4s
600:	total: 49.5s	remaining: 32.9s
800:	total: 1m 5s	remaining: 16.3s
999:	total: 1m 24s	remaining: 0us


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['Machine failure'] = y_prob[:,1]


Unnamed: 0,id,Machine failure
0,136429,0.000807
1,136430,0.001573
2,136431,0.000429
3,136432,0.002103
4,136433,0.000789
...,...,...
90949,227378,0.001093
90950,227379,0.001321
90951,227380,0.000483
90952,227381,0.003643


Learning rate set to 0.078755
0:	total: 677ms	remaining: 11m 16s
200:	total: 30.7s	remaining: 2m 2s
400:	total: 56.6s	remaining: 1m 24s
600:	total: 1m 20s	remaining: 53.5s
800:	total: 1m 51s	remaining: 27.6s
999:	total: 2m 22s	remaining: 0us
Learning rate set to 0.078755
0:	total: 134ms	remaining: 2m 13s
200:	total: 28.9s	remaining: 1m 54s
400:	total: 56.4s	remaining: 1m 24s
600:	total: 1m 23s	remaining: 55.6s
800:	total: 1m 49s	remaining: 27.2s
999:	total: 2m 16s	remaining: 0us
Learning rate set to 0.078755
0:	total: 148ms	remaining: 2m 28s
200:	total: 16.4s	remaining: 1m 5s
400:	total: 30.5s	remaining: 45.6s
600:	total: 42.8s	remaining: 28.4s
800:	total: 54.4s	remaining: 13.5s
999:	total: 1m 6s	remaining: 0us


In [24]:
sub.to_csv("submissions_cat_2.csv", index=False)