In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR, LinearSVR
from sklearn.metrics import accuracy_score
from time import gmtime, strftime

import optuna

In [None]:
# Read the competition train/test tables from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Base keyword arguments for the LinearSVR estimators built in this notebook.
# Only the logging verbosity is fixed here; 'C' is added later from the tuning
# step. The 'kernel' and 'gamma' settings belong to the kernelised SVR variant
# and are intentionally left unset.
param = dict(verbose=3)

In [None]:
# Row and column counts of the training table (the column count includes
# every column of train_df, not only the features).
num_data = train_df.shape[0]
num_feature = train_df.shape[1]
print(num_data, num_feature)

In [None]:
def objective(trial):
    """Optuna objective: cross-validated loss of a LinearSVR on a balanced sample.

    The larger of the two target classes is randomly down-sampled to the size
    of the smaller one, a ``LinearSVR`` is built with the ``C`` suggested by
    ``trial``, and its 5-fold cross-validated score is converted into a loss.

    Reads the module-level ``train_df_0``, ``train_df_1`` and ``param``;
    ``param`` is copied, not modified.

    Parameters
    ----------
    trial : optuna trial
        Supplies ``C``, drawn log-uniformly from [1, 100].

    Returns
    -------
    float
        ``1.0 - mean_cv_score`` (lower is better).

    Notes
    -----
    ``cross_val_score`` is called without ``scoring``, so each fold is scored
    with the estimator's own ``score`` method. ``LinearSVR`` is a regressor,
    so that score is R^2 rather than classification accuracy.
    A fresh random down-sample is drawn on every call, so different trials are
    evaluated on different subsets of the larger class.
    """
    # Keep the smaller class whole and down-sample the larger one to match it.
    if len(train_df_0) > len(train_df_1):
        minority, majority = train_df_1, train_df_0
    else:
        minority, majority = train_df_0, train_df_1
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df_balanced = pd.concat([minority, majority.sample(n=len(minority))])
    X = train_df_balanced.drop(['ID_code', 'target'], axis=1)
    y = train_df_balanced.target

    # Work on a copy so the shared base configuration is not altered per trial.
    # Only C is searched; kernel/gamma apply to the kernelised SVR, not LinearSVR.
    trial_param = dict(param)
    trial_param['C'] = trial.suggest_loguniform('C', 1e+0, 1e+2)
    print(trial_param)

    clf = LinearSVR(**trial_param)
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=7485)
    score = cross_val_score(clf, X=X.values, y=y.values, cv=kf, n_jobs=-1, verbose=3)
    mean_score = score.mean()
    print(mean_score)
    return 1.0 - mean_score

In [None]:
# Display the first rows of the training table.
train_df.head()

In [None]:
# Relative frequency of each target class in the training table.
train_df['target'].value_counts(normalize=True)

In [None]:
# Display the first rows of the test table.
test_df.head()

In [None]:
# Show the dtype pandas assigned to each training column.
train_df.dtypes

In [None]:
# The ten training columns with the most missing values, largest first.
train_df.isna().sum().sort_values(ascending=False).head(10)

In [None]:
# Compare the columns of train (without the label) against test.
# Nothing is dropped from either frame here; `dif` only records the columns
# that exist in train but are missing from test.
train_feature_cols = train_df.drop('target', axis=1).columns

same_columns = train_feature_cols.tolist() == test_df.columns.tolist()
print('\nTrain and Test Datasets have the same columns?:', same_columns)

train_only_cols = set(train_feature_cols).difference(set(test_df.columns))
print("\nVariables not in test but in train : ", train_only_cols)
dif = list(train_only_cols)

In [None]:
# Prepare data
# Split the training rows by class so the larger class can be down-sampled later.
train_df_0 = train_df.query('target == 0')
train_df_1 = train_df.query('target == 1')
# Test features: everything except the identifier. The save step at the end of
# the notebook writes a 'target' column into test_df, so drop it when present;
# otherwise re-running this cell in the same kernel would leave the prediction
# column in the feature matrix.
X_test = test_df.drop(columns=['ID_code']).drop(columns=['target'], errors='ignore')
print(len(train_df), len(test_df))

In [None]:
# Parameter Tuning
# Run 100 Optuna trials of `objective`; the study is created with Optuna's
# default settings, and `objective` returns a loss (1 - mean CV score).
study = optuna.create_study()
study.optimize(objective, n_trials=100)

print('Number of finished trials: {}'.format(len(study.trials)))

print('Best trial:')
trial = study.best_trial
best_params = study.best_params

# Build the summary of the best trial line by line, then emit it in one go.
report_lines = ['  Value: {}'.format(trial.value), '  Params: ']
for key, value in trial.params.items():
    report_lines.append('    {}: {}'.format(key, value))
print('\n'.join(report_lines))

In [None]:
def min_max(x, axis=None):
    """Rescale ``x`` linearly to the range [0, 1].

    Parameters
    ----------
    x : array-like
        Values to rescale; converted to a float NumPy array.
    axis : int or None, optional
        Axis along which the minimum and maximum are taken. ``None`` (the
        default) uses the global minimum and maximum.

    Returns
    -------
    numpy.ndarray
        ``(x - min) / (max - min)`` with the same shape as ``x``. Where the
        range is zero (all values along ``axis`` are equal) the result is 0
        instead of NaN.
    """
    values = np.asarray(x, dtype=float)
    lowest = values.min(axis=axis, keepdims=True)
    span = values.max(axis=axis, keepdims=True) - lowest
    # A constant input has zero span; divide by 1 there so the output is 0, not NaN.
    safe_span = np.where(span == 0, 1.0, span)
    return (values - lowest) / safe_span

In [None]:
# Building model using BEST parameters, then predict test data
# The estimator is refit on several independent class-balanced resamples; each
# run's predictions are min-max scaled and the runs are averaged.
print("svm_model ...")
param.update(best_params)
print(param)
clf = LinearSVR(**param)

N_BAGS = 10  # number of balanced resamples averaged into the final prediction
prediction = np.zeros(len(X_test))
for i in range(N_BAGS):
    print(i)
    # Keep the smaller class whole and down-sample the larger one to match it.
    if len(train_df_0) > len(train_df_1):
        minority, majority = train_df_1, train_df_0
    else:
        minority, majority = train_df_0, train_df_1
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_df_balanced = pd.concat([minority, majority.sample(n=len(minority))])
    X = train_df_balanced.drop(['ID_code', 'target'], axis=1)
    y = train_df_balanced.target
    clf.fit(X.values, y.values)
    result = clf.predict(X_test.values)
    # Scale each run to [0, 1] before averaging so every run contributes equally.
    prediction += min_max(result) / N_BAGS
print("...Done")

In [None]:
# Save
# Attach the averaged predictions to the test table and write the submission.
test_df['target'] = prediction
# UTC timestamp without spaces or colons: colons are not allowed in Windows
# file names and spaces are awkward on the command line.
submission_string = 'svm_' + strftime("%Y-%m-%d_%H-%M-%S", gmtime()) + '.csv'
test_df.loc[:, ['ID_code', 'target']].to_csv(submission_string, index=False)