In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import train_test_split

import pandas as pd
import joblib

In [2]:
# Read the labelled training table from the CSV next to the notebook.
data_save = pd.read_csv('./OCVS_train1.csv')
data_column = data_save.columns

# Column layout used below: the first column is skipped, the last column is
# the target, and every column in between is a model input.
# NOTE(review): the skipped first column is presumably an identifier such as
# the URL -- confirm against the CSV header.
feature_names = data_column[1:-1]
target_name = data_column[-1]

x_data = data_save[feature_names]
y_data = data_save[target_name]

In [3]:
# Base learners for the stacking ensemble, as (name, estimator) pairs.
# The forest consumes the features as-is; the linear SVM is wrapped in a
# pipeline so its inputs are standardized first.
random_forest = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=100,
    min_samples_leaf=1,
    max_features='log2',
    max_leaf_nodes=350,
    bootstrap=False,
    random_state=13,
)
scaled_linear_svc = make_pipeline(StandardScaler(), LinearSVC(random_state=13))

estimators = [
    ('rf', random_forest),
    ('svr', scaled_linear_svc),
]

In [4]:
# Meta-learner that is fitted on top of the base learners' predictions.
# `multi_class` is intentionally not passed: 'auto' is already the default, and
# passing it explicitly is deprecated since scikit-learn 1.5 (it emits a
# FutureWarning and is scheduled for removal), so omitting it keeps the same
# behavior without tying the notebook to older releases.
final_estimator = LogisticRegression(
    C=1.0,
    fit_intercept=True,
    solver='newton-cg',
    max_iter=170,
    tol=0.0004,
    random_state=13,
)

# Stack the base learners from `estimators` under the logistic-regression
# meta-learner.
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [5]:
# Serialize the classifier object to disk; as the cell's last expression,
# joblib.dump's return value (the list of written file names) is displayed.
# NOTE(review): no fit() call precedes this cell in the notebook, so the
# pickle written here holds an untrained estimator -- confirm that is intended.
model_path = './model.pkl'
joblib.dump(clf, model_path)

['./model.pkl']

In [6]:
# Hold out 30% of the rows for evaluation. The split is seeded (13, matching
# the random_state used by the estimators) so the reported accuracy is
# reproducible across kernel restarts instead of changing on every run.
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.3, random_state=13
)

In [7]:
# Train the stacking classifier on the training split.
clf.fit(X_train, y_train)

# Persist the fitted model. This overwrites the ./model.pkl written earlier in
# the notebook, which was dumped before any fit() call had run.
joblib.dump(clf, './model.pkl')

# Report hold-out accuracy as a percentage rounded to two decimals
# (the Korean label in the message means "accuracy").
accuracy_pct = round(clf.score(X_test, y_test) * 100, 2)
print('정확도 : ', accuracy_pct, '%')

정확도 :  93.51 %


In [8]:
# Load the unlabelled rows that need a prediction.
prediction_file = pd.read_csv('./OCVS_test1.csv')

# Select, by name, the same feature columns that were taken from the training
# table (everything between its first and last column).
feature_names = data_column[1:-1]
prediction_data = prediction_file[feature_names]

# Predict one label per row and show the resulting array.
label_data = clf.predict(prediction_data)
print(label_data)

[0 0 1 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0]


In [9]:
# Attach the predictions to the loaded test table as a new 'label' column.
prediction_file['label'] = label_data

# Keep only the URL and its predicted label, then write them to CSV without
# the DataFrame index.
output_columns = ['url', 'label']
result_dataset = prediction_file[output_columns]
result_dataset.to_csv('OCVS_result1.csv', index=False)

print('Finished')

Finished
