In [9]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.metrics import confusion_matrix
import matplotlib.patches as mpatches
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import os
import time
import catboost as cb
from sklearn.preprocessing import OneHotEncoder
from bayes_opt import BayesianOptimization#pip install bayesian-optimization
from sklearn.model_selection import KFold
import copy
from sklearn.preprocessing import LabelEncoder

In [10]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Each column is handled according to its dtype:

    * signed / unsigned NumPy integers are cast to the narrowest integer type
      of the same signedness whose range contains the column's min and max
      (bounds are inclusive, so a column spanning -128..127 fits ``int8``);
    * NumPy floats are cast to ``float16``, ``float32`` or ``float64`` using
      the same range test. Only the range is checked, so the narrower float
      types can lose precision;
    * ``object`` columns are converted to ``category``;
    * every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Memory usage before and after the conversion is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. Its columns are reassigned in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    narrow_float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Bool,
        # datetime, category and extension dtypes either have no meaningful
        # numeric range or cannot be cast with a bare NumPy type.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Comparisons are False for NaN (empty column), which leaves
                # the column as it is.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in narrow_float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
# Load the raw train / test tables. The memory-optimised variant (wrapping
# each read in reduce_mem_usage) is currently disabled.
train_path, test_path = 'train.csv', 'test.csv'
df = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [None]:
# Print the normalised value distribution (NaN included) of every column of
# df, ordered by value rather than by frequency.
separator = "-------------------------------"
for cn in df.columns:
    distribution = df[cn].value_counts(dropna=False, normalize=True)
    print(distribution.sort_index())
    print(separator)

In [None]:
#nan NY值處理 trainset testset
#df['insfg'] = df['insfg'].map( {'Y': 1, 'N': 0} ).astype(int)#分期交易註記
#df['ovrlt'] = df['ovrlt'].map( {'Y': 1, 'N': 0} ).astype(int)#超額註記碼
#df['ecfg'] = df['ecfg'].map( {'Y': 1, 'N': 0} ).astype(int)#網路交易註記
#df = df.drop(['flbmk','flg_3dsmk'],axis=1)
#df_test = df_test.drop('flbmk',axis=1)
#df_test= df_test.drop('flg_3dsmk',axis=1)

In [12]:
# Category conversion.
# Column glossary (from the original notes): contp = transaction category,
# csmcu = currency of the purchase location, etymd = transaction type,
# stocn = country, scity = city, hcefg = payment type, mchno = merchant id (?),
# ovrlt = over-limit flag, insfg = installment flag, ecfg = online-transaction flag.
#
# Every column is label-encoded except the target, the transaction key, the
# time/date fields and the amount. The encoder is fitted on the string form
# of the train and test values together so both frames share one code table.
non_categorical = ['fraud_ind', 'txkey', 'loctm', 'locdt', 'conam']
cat_cols = list(df.columns)
for excluded in non_categorical:
    cat_cols.remove(excluded)

for col in cat_cols:
    if col not in df.columns:
        continue
    train_labels = list(df[col].astype(str).values)
    test_labels = list(df_test[col].astype(str).values)
    le = LabelEncoder()
    le.fit(train_labels + test_labels)
    df[col] = le.transform(train_labels)
    df_test[col] = le.transform(test_labels)

In [None]:
# Print the columns that are almost unique per row, i.e. in which fewer than
# 10 entries repeat an earlier value.
max_repeats = 10
for col in df.columns:
    repeated = df[col].duplicated().sum()
    if repeated < max_repeats:
        print(col)

In [None]:
#df['fraud_ind'] = df['fraud_ind'].astype('category')
#df_test['fraud_ind'] = df_test['fraud_ind'].astype('category')

In [None]:
# Switch the label-encoded columns to pandas' category dtype in both frames.
for col in cat_cols:
    if col not in df.columns:
        continue
    for frame in (df, df_test):
        frame[col] = frame[col].astype('category')

In [None]:
# Columns in which a single value (NaN counted as a value) accounts for more
# than 90% of the rows, computed separately for the train and test frames.
dominance_cutoff = 0.9
big_top_value_cols = [
    col
    for col in df.columns
    if df[col].value_counts(dropna=False, normalize=True).iloc[0] > dominance_cutoff
]
big_top_value_cols_test = [
    col
    for col in df_test.columns
    if df_test[col].value_counts(dropna=False, normalize=True).iloc[0] > dominance_cutoff
]
print(big_top_value_cols)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
# Columns of interest: contp csmcu etymd stocn (country) scity hcefg ovrlt insfg ecfg
df.info()
print("---------------------------")
df_test.info()

In [None]:
# Number of missing values in each column of df.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
#for i,col in enumerate(df[df.columns]):
#    df[col].fillna(-999, inplace=True)

In [None]:
# Summary statistics of df via DataFrame.describe(); being the last
# expression of the cell, the resulting table is what the cell displays.
df.describe()

In [None]:
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(df.info()) produced.
df.info()

In [13]:
# Ratio of each row's `mcc` value to the mean / standard deviation of `mcc`
# within its (bacno, cano) group, added to both the train and the test frame.
by_group_col = 'mcc'
groupby_col = ['bacno', 'cano']
for frame in (df, df_test):
    # Each frame is divided by its OWN group statistics. Previously the
    # df_test columns were computed entirely from df and then matched to the
    # test rows by index label, which assigned train-row values to test rows.
    grouped = frame.groupby(groupby_col)[by_group_col]
    frame['mcc_to_mean_cred'] = frame[by_group_col] / grouped.transform('mean')
    # A zero group std yields +/-inf and a single-row group yields NaN here.
    frame['mcc_to_std_cred'] = frame[by_group_col] / grouped.transform('std')

In [14]:
# Persist the current training frame (without the index column) to disk.
output_path = 'df.csv'
df.to_csv(output_path, index=None)

In [None]:
# Ratio of each row's `conam` to the mean / standard deviation of `conam`
# within several groupings. The dict key becomes the column-name suffix
# (conam_to_mean_<suffix>, conam_to_std_<suffix>).
by_group_col = 'conam'
conam_groupings = {
    'cred': ['bacno', 'cano'],
    'store': ['acqic', 'stocn', 'scity', 'mcc', 'mchno'],
    'staging': ['insfg', 'iterm'],
    'trade': ['contp', 'hcefg', 'etymd'],
}
# The features are created on both frames so that df_test ends up with the
# same feature columns as df; before, only df received them.
for frame in (df, df_test):
    for suffix, groupby_col in conam_groupings.items():
        grouped = frame.groupby(groupby_col)[by_group_col]
        frame['conam_to_mean_' + suffix] = frame[by_group_col] / grouped.transform('mean')
        # A zero group std yields +/-inf and a single-row group yields NaN here.
        frame['conam_to_std_' + suffix] = frame[by_group_col] / grouped.transform('std')

In [None]:
# For each conam ratio feature: compute its correlation with the target and
# scatter the target (y) against the feature (x), one series per class.
i = 'fraud_ind'
ratio_features = (
    'conam_to_mean_store',
    'conam_to_mean_staging',
    'conam_to_mean_cred',
    'conam_to_mean_trade',
)
for col_name in ratio_features:
    cor = np.corrcoef(df[col_name], df[i])[0, 1]
    plot_title = i + " corr= " + str(round(cor, 3))
    for fraud_value in (0, 1):
        subset = df.loc[df['fraud_ind'] == fraud_value]
        subset.set_index(col_name)[i].fillna(-1).plot(
            style='.',
            title=plot_title,
            figsize=(15, 3),
            label="isFraud=" + str(fraud_value),
        )
    plt.legend()
    plt.show()

In [None]:
# Divides 'TransactionDT' by 86400 (60*60*24) and subtracts 9/24, storing the
# result in a new 'time' column -- presumably seconds converted to days with a
# 9-hour shift; TODO confirm.
# NOTE(review): neither a `train` frame nor a 'TransactionDT' column is
# defined anywhere in the visible notebook (the data lives in `df` / `df_test`),
# so this cell looks like a leftover and would raise NameError on a fresh
# kernel -- confirm whether it should be removed or ported to `df`.
train['time'] = train['TransactionDT'] / (60*60*24) - 9/24

In [None]:
# Distribution of every training-set feature, fraud vs. non-fraud overlaid.
from matplotlib import gridspec

# The target itself is skipped: split by class it is a single constant value
# per series, which carries no information.
feature_cols = [col for col in df.columns if col != 'fraud_ind']

# One grid row (and 4 inches of height) per plotted column. The grid used to
# be fixed at 30 rows, which raises IndexError once df has more columns.
plt.figure(figsize=(16, len(feature_cols) * 4))
gs = gridspec.GridSpec(len(feature_cols), 1)
for i, col in enumerate(feature_cols):
    ax = plt.subplot(gs[i])
    sns.distplot(df[col][df["fraud_ind"] == 1], label='Is Fraud', bins=100)
    sns.distplot(df[col][df["fraud_ind"] == 0], label='Not Fraud', bins=100)
    plt.legend();
    ax.set_xlabel('')
    ax.set_title('histogram of feature: ' + str(col))

In [None]:
# Share of each class in the training set, followed by the raw counts (the
# last expression is what the cell displays).
class_counts = df['fraud_ind'].value_counts()
total_rows = len(df)
print('No Frauds', round(class_counts[0] / total_rows * 100, 2), '% of the dataset')
print('Frauds', round(class_counts[1] / total_rows * 100, 2), '% of the dataset')
df['fraud_ind'].value_counts()

In [18]:
# Columns removed after inspecting the distribution plots; dropped from both
# the train and the test frame.
droplist = ['txkey', 'locdt']
df = df.drop(columns=droplist)
df_test = df_test.drop(columns=droplist)

In [22]:
def clean_inf_nan(df):
    """Return a new frame in which every +inf / -inf of ``df`` is NaN.

    ``df`` itself is not modified.
    """
    infinities = [np.inf, -np.inf]
    cleaned = df.replace(to_replace=infinities, value=np.nan)
    return cleaned
# Clean both frames: df_test receives the same ratio features as df (values
# divided by a group std), so it can hold the same +/-inf entries. Before,
# only df was cleaned, leaving train and test inconsistent.
df = clean_inf_nan(df)
df_test = clean_inf_nan(df_test)

In [None]:
# Print the five most frequent values (NaN included) of every column, as
# fractions of the row count.
separator = "-------------------------------"
for cn in df.columns:
    shares = df[cn].value_counts(dropna=False, normalize=True)
    print(shares.head())
    print(separator)

In [None]:
# Option 2: raw data, no scaler.
# 80/20 hold-out split of features / target; all four parts are kept as plain
# NumPy arrays.
X = df.drop(columns=['fraud_ind'])
Y = df['fraud_ind']
X_train, X_test, Y_train, Y_test = (
    part.values
    for part in train_test_split(X, Y, test_size=0.2, random_state=10)
)

In [None]:
# Keep the number of legitimate and fraudulent rows of the hold-out target for
# normalisation later on, then start a wall-clock timer.
Y_test_transfer = pd.Series(Y_test)
holdout_counts = Y_test_transfer.value_counts()
Y_test_nofraud = holdout_counts[0]
Y_test_fraud = holdout_counts[1]
print(Y_test_nofraud)
# Note: this import rebinds the name `time` to the function time.time.
from time import time
t0 = time()

In [None]:
# Use a random forest to inspect the features.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=50, max_depth=100, random_state=0, n_jobs=-1)
rfc.fit(X_train, Y_train)
pred = rfc.predict(X_test)

# PlotConfusionMatrix is not defined in this part of the notebook; it has to
# exist in the kernel before this cell runs.
PlotConfusionMatrix(Y_test, pred, Y_test_nofraud, Y_test_fraud)

# Feature names = every column of df except the target.
x_feature = list(df.columns)
x_feature.remove('fraud_ind')
names = df[x_feature].columns
# (Disabled) print each (name, importance) pair:
#   for feature in zip(names, rfc.feature_importances_):
#       print(feature)

# Visualise the importances assigned by the random forest, most important
# first, together with their cumulative sum.
plt.style.use('fivethirtyeight')  # one of the themes listed in plt.style.available
importances = rfc.feature_importances_
feat_names = names
indices = np.argsort(importances)[::-1]  # indices sorted by descending importance
positions = range(len(indices))
ranked_importances = importances[indices]

fig = plt.figure(figsize=(12, 6))
plt.title("Feature importances by RandomTreeClassifier")
plt.bar(positions, ranked_importances, color='lightblue', align="center")
plt.step(positions, np.cumsum(ranked_importances), where='mid', label='Cumulative')
plt.xticks(positions, feat_names[indices], rotation='vertical', fontsize=14)
plt.xlim([-1, len(indices)])
plt.show()

In [None]:
# Print every (feature name, importance) pair of the fitted forest `rfc`,
# then redraw the ranked importance chart.
x_feature = list(df.columns)
x_feature.remove('fraud_ind')
names = df[x_feature].columns
for feature in zip(names, rfc.feature_importances_):
    print(feature)

# Visualise the importances assigned by the random forest, most important
# first, together with their cumulative sum.
plt.style.use('fivethirtyeight')  # one of the themes listed in plt.style.available
importances = rfc.feature_importances_
feat_names = names
indices = np.argsort(importances)[::-1]  # indices sorted by descending importance
positions = range(len(indices))
ranked_importances = importances[indices]

fig = plt.figure(figsize=(12, 6))
plt.title("Feature importances by RandomTreeClassifier")
plt.bar(positions, ranked_importances, color='lightblue', align="center")
plt.step(positions, np.cumsum(ranked_importances), where='mid', label='Cumulative')
plt.xticks(positions, feat_names[indices], rotation='vertical', fontsize=14)
plt.xlim([-1, len(indices)])
plt.show()

In [None]:
# Show the column labels currently present in df.
print(df.columns)

In [None]:
# Experiment: the same random forest with 1000 trees.
from sklearn.ensemble import RandomForestClassifier

forest_settings = dict(n_estimators=1000, max_depth=100, random_state=0, n_jobs=-1)
rfc = RandomForestClassifier(**forest_settings)
rfc.fit(X_train, Y_train)
pred = rfc.predict(X_test)

# PlotConfusionMatrix is not defined in this part of the notebook; it has to
# exist in the kernel before this cell runs.
PlotConfusionMatrix(Y_test, pred, Y_test_nofraud, Y_test_fraud)

In [None]:
# Random-forest variant: keep every column of df except the target and the
# features judged superfluous.
# A membership test is used instead of repeated list.remove() calls because
# remove() raises ValueError for a name that is not in the list -- and 'txkey'
# is no longer a column of df once the droplist cell has run.
excluded_features = {
    'fraud_ind',
    'insfg',
    'iterm',
    'contp',
    'hcefg',
    'ovrlt',
    'mchno',
    'scity',
    'stscd',
    'txkey',
    'stocn',
}
x_feature = [col for col in df.columns if col not in excluded_features]


In [23]:
# Full-data setup for the LightGBM runs: all of df is used for training and
# df_test (the same object, not a copy) is the prediction set.
target_col = 'fraud_ind'
X_train = df.drop(columns=[target_col])
Y_train = df[target_col].copy()
X_test = df_test

In [None]:
import gc
import sklearn.metrics as metrics
def LGB_bayesian(
    #learning_rate,
    num_leaves,
    bagging_fraction,
    feature_fraction,
    min_child_weight,
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda
     ):
    """Objective for the Bayesian search: 5-fold out-of-fold AUC of LightGBM.

    Trains one LightGBM binary classifier per fold on the module-level
    ``X_train`` / ``Y_train`` (both are indexed with ``.iloc``, so they must be
    pandas objects) and scores the out-of-fold predictions.

    Parameters
    ----------
    num_leaves, min_data_in_leaf, max_depth : float
        Proposed by the optimizer as floats; truncated to int because LightGBM
        expects integers for them.
    bagging_fraction, feature_fraction, min_child_weight, reg_alpha, reg_lambda : float
        Passed to LightGBM unchanged.

    Returns
    -------
    float
        ROC AUC of the out-of-fold predictions over ALL folds. (The previous
        version filled the out-of-fold vector but returned only the AUC of the
        last fold.)
    """
    # LightGBM expects the next three parameters to be integers. Converting
    # once up front instead of on every fold.
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    param = {
          'num_leaves': num_leaves,
          'min_data_in_leaf': min_data_in_leaf,
          'min_child_weight': min_child_weight,
          'bagging_fraction' : bagging_fraction,
          'feature_fraction' : feature_fraction,
          #'learning_rate' : learning_rate,
          'max_depth': max_depth,
          'reg_alpha': reg_alpha,
          'reg_lambda': reg_lambda,
          'objective': 'binary',
          'save_binary': True,
          'seed': 1337,
          'feature_fraction_seed': 1337,
          'bagging_seed': 1337,
          'drop_seed': 1337,
          'data_random_seed': 1337,
          'boosting_type': 'gbdt',
          'verbose': 1,
          'is_unbalance': True,
          'boost_from_average': True,
          'metric':'auc'}

    folds = KFold(n_splits=5, shuffle=True, random_state=15)
    oof = np.zeros(len(X_train))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, Y_train)):
        print("fold n°{}".format(fold_))
        train_df, y_train_df = X_train.iloc[trn_idx], Y_train.iloc[trn_idx]
        valid_df, y_valid_df = X_train.iloc[val_idx], Y_train.iloc[val_idx]

        trn_data = lgb.Dataset(train_df, label=y_train_df)
        val_data = lgb.Dataset(valid_df, label=y_valid_df)

        # NOTE(review): only 50 boosting rounds are run while the early
        # stopping patience is 200 rounds -- confirm this is intended.
        clf = lgb.train(param,
                        trn_data,
                        num_boost_round=50,
                        valid_sets = [trn_data, val_data],
                        verbose_eval=0,
                        early_stopping_rounds = 200)
        # Keyword fixed: it was misspelled `um_iteration`, so the best
        # iteration was never passed as `num_iteration`.
        oof[val_idx] = clf.predict(valid_df, num_iteration=clf.best_iteration)

    return metrics.roc_auc_score(Y_train, oof)

In [None]:
# Search space handed to the Bayesian optimizer: one (lower, upper) interval
# per tuned LightGBM parameter. learning_rate is deliberately not tuned.
bounds_LGB = dict(
    num_leaves=(31, 600),
    min_data_in_leaf=(20, 200),
    bagging_fraction=(0.1, 0.9),
    feature_fraction=(0.1, 0.9),
    min_child_weight=(0.00001, 0.01),
    reg_alpha=(1, 2),
    reg_lambda=(1, 2),
    max_depth=(-1, 60),
)

In [None]:
# Optimizer that maximises LGB_bayesian over the bounds_LGB search space.
LGB_BO = BayesianOptimization(
    f=LGB_bayesian,
    pbounds=bounds_LGB,
    random_state=42,
)

In [None]:
# Show the parameter names of the optimizer's search space.
print(LGB_BO.space.keys)

n_iter: How many steps of Bayesian optimization you want to perform. The more steps, the more likely you are to find a good maximum.
init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.

In [None]:
import warnings

# init_points = number of random exploration steps,
# n_iter = number of Bayesian optimization steps (i.e. sampled points).
init_points = 10
n_iter = 15

print('-' * 130)

maximize_kwargs = dict(
    init_points=init_points,
    n_iter=n_iter,
    acq='ucb',
    xi=0.0,
    alpha=1e-6,
)
# All warnings raised during the search are silenced.
with warnings.catch_warnings():
    warnings.filterwarnings('ignore')
    LGB_BO.maximize(**maximize_kwargs)

In [None]:
# Best score found by the optimizer, followed by the parameters that
# produced it.
best_result = LGB_BO.max
print(best_result['target'])
print(best_result['params'])

In [24]:
# LightGBM settings for the final cross-validated training run (CPU).
# GPU options (gpu_platform_id / gpu_device_id) are intentionally not set.
# NOTE(review): no bagging_freq is set; per the LightGBM parameter docs,
# bagging_fraction only takes effect when bagging_freq is non-zero -- confirm
# whether bagging was meant to be active.
params = dict(
    num_leaves=499,
    min_child_weight=0.009009297771374483,
    feature_fraction=0.8931730501715401,
    bagging_fraction=0.8249551970384116,
    min_data_in_leaf=120,
    objective='binary',
    max_depth=47,
    learning_rate=0.1,
    boosting_type="gbdt",
    bagging_seed=11,
    metric='auc',
    verbosity=-1,
    reg_alpha=1.1690935357787136,
    reg_lambda=1.119698800271026,
    is_unbalance=True,
    random_state=1337,
    device='cpu',
)

In [25]:
from sklearn.metrics import f1_score, roc_auc_score

# 4-fold (unshuffled) cross-validated LightGBM training.
#   oof                 - out-of-fold probability for every training row
#   predictions         - test-set probability averaged over the folds
#   feature_importances - one importance column per fold
splits = 4
folds = KFold(n_splits=splits)
oof = np.zeros(len(X_train))
predictions = np.zeros(len(X_test))
feature_importances = pd.DataFrame({'feature': X_train.columns})

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train.values, Y_train.values)):
    print("Fold {}".format(fold_))
    train_df, valid_df = X_train.iloc[trn_idx], X_train.iloc[val_idx]
    y_train_df, y_valid_df = Y_train.iloc[trn_idx], Y_train.iloc[val_idx]

    trn_data = lgb.Dataset(train_df, label=y_train_df)
    val_data = lgb.Dataset(valid_df, label=y_valid_df)

    clf = lgb.train(
        params,
        trn_data,
        num_boost_round=1000,
        valid_sets=[trn_data, val_data],
        verbose_eval=200,
        early_stopping_rounds=200,
    )
    pred = clf.predict(valid_df)
    oof[val_idx] = pred

    importance_column = 'fold_{}'.format(fold_ + 1)
    feature_importances[importance_column] = clf.feature_importance()

    # First "auc" line: AUC of the predicted probabilities.
    print("  auc = ", roc_auc_score(y_valid_df, pred))
    # Remaining metrics use hard 0/1 labels cut at the threshold; note the
    # second "auc" line is therefore computed on labels, not probabilities.
    threshold = 0.5
    y_pre = (pred > threshold).astype(int).tolist()
    print("  auc = ", roc_auc_score(y_valid_df, y_pre))
    print("  f1 = ", f1_score(y_valid_df, y_pre))
    print("  confusion_matrix = \r\n", confusion_matrix(y_valid_df, y_pre))

    predictions += clf.predict(X_test) / splits

Fold 0
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.999922	valid_1's auc: 0.988229
[400]	training's auc: 0.999996	valid_1's auc: 0.988523
[600]	training's auc: 0.999999	valid_1's auc: 0.988798
[800]	training's auc: 0.999999	valid_1's auc: 0.988824
Early stopping, best iteration is:
[662]	training's auc: 0.999999	valid_1's auc: 0.988855
  auc =  0.9888548883141521
  auc =  0.90094060301936
  f1 =  0.8009127210496291
  confusion_matrix = 
 [[374141   1072]
 [  1022   4212]]
Fold 1
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.99992	valid_1's auc: 0.989797
[400]	training's auc: 0.999995	valid_1's auc: 0.990346
[600]	training's auc: 0.999998	valid_1's auc: 0.990349
Early stopping, best iteration is:
[595]	training's auc: 0.999998	valid_1's auc: 0.990371
  auc =  0.9903712324529074
  auc =  0.9016802325629625
  f1 =  0.7970713366973384
  confusion_matrix = 
 [[374368   1083]
 [   968   4028]]
Fold 2
Training un

In [None]:
# Average the per-fold importances and plot the 30 most important features.
# Only the 'fold_*' columns are averaged: the frame also holds the string
# column 'feature' (a row-wise mean over it raises TypeError on recent pandas)
# and, after a first run of this cell, the 'average' column itself, which
# must not be averaged back into the result.
fold_columns = [col for col in feature_importances.columns if str(col).startswith('fold_')]
feature_importances['average'] = feature_importances[fold_columns].mean(axis=1)

top_features = feature_importances.sort_values(by='average', ascending=False).head(30)
plt.figure(figsize=(16, 16))
sns.barplot(data=top_features, x='average', y='feature')
plt.title('TOP feature importance over cv folds average');

In [None]:
# Save the results: turn the averaged test probabilities into 0/1 labels at
# the threshold and write them into the sample submission layout.
threshold = 0.5
predictions = [1 if item > threshold else 0 for item in predictions]
sampleSubmission = pd.read_csv('submission_test_sample.csv')
sampleSubmission['fraud_ind'] = predictions
submission_path = 'submission_test.csv'
sampleSubmission.to_csv(submission_path, index=None)

In [None]:
# Re-score the most recent fold's validation predictions (`pred`,
# `y_valid_df`) with hard 0/1 labels cut at the threshold. The "auc" printed
# here is computed on those labels, not on the probabilities.
threshold = 0.5
y_pre = [1 if item > threshold else 0 for item in pred]
print("  auc = ", roc_auc_score(y_valid_df, y_pre))
print("  f1 = ", f1_score(y_valid_df, y_pre))
print("  confusion_matrix = \r\n", confusion_matrix(y_valid_df, y_pre))

In [None]:
# Select the reduced feature set on the test frame and predict (fill in the
# answers). For a tree model restricted to the important features, the
# unwanted columns have to be dropped first.
# A membership test replaces the repeated list.remove() calls: remove() raises
# ValueError for a name that is not in the list, and 'txkey' is no longer a
# column of df once the droplist cell has run.
excluded_features = {
    'fraud_ind',
    'insfg',
    'iterm',
    'contp',
    'hcefg',
    'ovrlt',
    'mchno',
    'scity',
    'stscd',
    'txkey',
    'stocn',
}
x_feature = [col for col in df.columns if col not in excluded_features]
df_test_final = df_test[x_feature]

# NOTE(review): `gridcv` is not defined anywhere in the visible notebook --
# presumably a fitted GridSearchCV from a removed cell; confirm it exists (and
# was trained on exactly these columns) before running this cell.
result = gridcv.predict(df_test_final)