### 1.カーネルの目的：

先週は、自分のモデルを改善するために全ての特徴量に取り組みました。ほぼすべての特徴量を扱ったので、Kaggle の仲間たちと共有することにしました。

注記：
すべての特徴量が有用とは限らないかもしれませんが、この作業は他の Kaggler にとって、データ操作の助けや、新たな興味深い特徴量を作成する際のヒントになるかもしれないと思っています。

### 2.Importing Libraries

In [28]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist, pdist


## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc

In [29]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def PCA_change(df, cols, n_components, prefix='PCA_', rand_seed=4):
    pca = PCA(n_components=n_components, random_state=rand_seed)

    principalComponents = pca.fit_transform(df[cols])

    principalDf = pd.DataFrame(principalComponents)

    df.drop(cols, axis=1, inplace=True)

    principalDf.rename(columns=lambda x: str(prefix)+str(x), inplace=True)

    df = pd.concat([df, principalDf], axis=1)
    
    return df

In [30]:
df_trans = pd.read_csv('./data/train_transaction.csv', index_col='TransactionID')
df_test_trans = pd.read_csv('./data/test_transaction.csv', index_col='TransactionID')

In [31]:
df_trans = reduce_mem_usage(df_trans)
df_test_trans = reduce_mem_usage(df_test_trans)

Mem. usage decreased to 544.60 Mb (69.3% reduction)
Mem. usage decreased to 474.52 Mb (68.8% reduction)


### 3.Concatenating train and test

In [32]:
df_test_trans['isFraud'] = 'test'
df = pd.concat([df_trans, df_test_trans], axis=0, sort=False )
df = df.reset_index()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`



In [33]:
df = reduce_mem_usage(df)

Mem. usage decreased to 1031.75 Mb (0.4% reduction)


In [35]:
for c1, c2 in df_trans.dtypes.reset_index().values:
    if(c2 == "O"):
        df[c1], uniques = pd.factorize(df[c1])
        print(df[c1])

0          0
1          0
2          0
3          0
4          1
          ..
1097226    2
1097227    2
1097228    0
1097229    0
1097230    2
Name: ProductCD, Length: 1097231, dtype: int64
0          0
1          1
2          2
3          1
4          1
          ..
1097226    1
1097227    1
1097228    2
1097229    1
1097230    2
Name: card4, Length: 1097231, dtype: int64
0          0
1          0
2          1
3          1
4          0
          ..
1097226    1
1097227    1
1097228    1
1097229    1
1097230    0
Name: card6, Length: 1097231, dtype: int64
0         -1
1          0
2          1
3          2
4          0
          ..
1097226    0
1097227    5
1097228    5
1097229    5
1097230    5
Name: P_emaildomain, Length: 1097231, dtype: int64
0         -1
1         -1
2         -1
3         -1
4         -1
          ..
1097226    0
1097227    1
1097228   -1
1097229   -1
1097230    1
Name: R_emaildomain, Length: 1097231, dtype: int64
0          0
1         -1
2          0
3         -

In [36]:
print('Shape before PCA')
df.shape

Shape before PCA


(1097231, 394)

### 4.V Features

In [37]:
mas_v = df.columns[55:]

for col in mas_v:
    df[col].fillna((df[col].min() - 2), inplace=True)
    df[col] = (minmax_scale(df[col], feature_range=(0,1)))
    
df = PCA_change(df, mas_v, prefix='PCA_V_', n_components=35)

columns = ['PCA_V_0', 'PCA_V_1', 'PCA_V_2', 'PCA_V_3', 'PCA_V_4', 'PCA_V_5', 
           'PCA_V_6', 'PCA_V_7', 'PCA_V_8', 'PCA_V_9', 'PCA_V_10', 'PCA_V_11', 
           'PCA_V_12', 'PCA_V_13', 'PCA_V_14', 'PCA_V_15', 'PCA_V_16', 
           'PCA_V_17', 'PCA_V_18', 'PCA_V_19', 'PCA_V_20', 'PCA_V_21', 
           'PCA_V_22', 'PCA_V_23', 'PCA_V_24', 'PCA_V_25', 'PCA_V_26', 
           'PCA_V_27', 'PCA_V_28', 'PCA_V_29', 'PCA_V_30', 'PCA_V_31', 
           'PCA_V_32', 'PCA_V_33', 'PCA_V_34']

km = KMeans(n_clusters=6)
km = km.fit(df[columns])
df['clusters_V'] = km.predict(df[columns])
gc.collect()


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




A

22

In [38]:
df[columns] = reduce_mem_usage(df[columns])

Mem. usage decreased to 73.25 Mb (50.0% reduction)


### C Featreus

In [39]:
c_feat = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7',
              'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14']

for col in c_feat:
    df[col] = df[col].fillna((df[col].min() - 1))
    df[col] = (minmax_scale(df[col], feature_range=(0,1)))
  
df = PCA_change(df, c_feat, prefix='PCA_C_', n_components=3)

c_features = ['PCA_C_0', 'PCA_C_1', 'PCA_C_2']

km = KMeans(n_clusters=4)
km = km.fit(df[c_features])
df['clusters_C'] = km.predict(df[c_features])

In [40]:
df = reduce_mem_usage(df)
df.shape

Mem. usage decreased to 163.24 Mb (43.9% reduction)


(1097231, 81)

In [41]:
m_feat = ['M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9']

for col in m_feat:
    df[col].fillna('None', inplace=True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.



