In [1]:
import pandas as pd
import numpy as np

# Y に欠損値が有ったら取り除く

In [2]:
#データ例
x1 = np.arange(0,6)
y = np.arange(0,6)
df_x = pd.DataFrame(x1,columns=["x1"])
df_y = pd.DataFrame(y,columns=["y"])
df_y["y"][2,4] = None
df_x["x1"][3] = None
df = pd.concat([df_x,df_y], axis=1)
df.head()

Unnamed: 0,x1,y
0,0.0,0.0
1,1.0,1.0
2,2.0,
3,,3.0
4,4.0,


In [3]:
#yに欠損値がある行を確認
df[df["y"].isnull()]

Unnamed: 0,x1,y
2,2.0,
4,4.0,


In [4]:
#指定した行（y）に欠損値がある行を削除
df = df.dropna(subset=['y'])
df

Unnamed: 0,x1,y
0,0.0,0.0
1,1.0,1.0
3,,3.0
5,5.0,5.0


# オブジェクトカラムへの処理

In [5]:
x1 = np.array(["a","b","a","a","c"])
x2 = np.arange(1,6).astype("str")
x1 = pd.DataFrame(x1,columns=["x1"])
x2 = pd.DataFrame(x2,columns=["x2"])
df_x = pd.concat([x1,x2],axis=1)
y = np.arange(1,6)
df_y = pd.DataFrame(y,columns=["y"])
df = pd.concat([df_x,df_y], axis=1)
df

Unnamed: 0,x1,x2,y
0,a,1,1
1,b,2,2
2,a,3,3
3,a,4,4
4,c,5,5


In [6]:
#値が(str)オブジェクトのカラム名を取得
df.select_dtypes('object').columns

Index(['x1', 'x2'], dtype='object')

In [7]:
#何個あるか
df["x1"].value_counts()

a    3
b    1
c    1
Name: x1, dtype: int64

In [8]:
#ユニークな値の抽出
np.unique(df["x1"])

array(['a', 'b', 'c'], dtype=object)

In [9]:
#各オブジェクトカラム内にユニークな値が何種類あるか
df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

x1    3
x2    5
dtype: int64

## 各オブジェクトカラム内の要素がtrainとtestで同じかどうかチェック

In [9]:
#各オブジェクトカラム内の要素がtrainとtestで同じかどうかチェック
def check_object(train,test):
    columns = train.select_dtypes('object').columns
    flag = True
    for col in columns:
        trn_uni = np.unique(train[col].dropna())
        tst_uni = np.unique(test[col].dropna())
        if len(trn_uni) == len(tst_uni):
            for i,j in zip(trn_uni,tst_uni):
                if i != j:
                    print("{}に違うユニーク要素があります".format(col))
                    flag = False
                    break
        else:
            print("{}のユニーク要素数が違います\n train要素数:{}\n test要素数:{}".format(col,len(trn_uni),len(tst_uni)))
            flag = False
    if flag:
        print("ALL OK")

In [10]:
#各オブジェクトカラム内の要素がtrainとtestで同じ場合
df1 = df.copy()
check_object(df,df1)

ALL OK


In [11]:
#各オブジェクトカラム内の要素がtrainとtestで違った場合　（ユニーク要素数は同じ）
df1 = df.copy()
df1["x1"][1] = "f"
check_object(df,df1)

x1に違うユニーク要素があります


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [12]:
#各オブジェクトカラム内の要素がtrainとtestで違った場合　（ユニーク要素数が違う）
df1 = df.copy()
df1["x1"][0] = "f"
df2 = df.copy()
check_object(df,df1)

x1のユニーク要素数が違います
 train要素数:3
 test要素数:4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


#  

## マッピング

In [8]:
#普通に
map_dic = {"a":1, "b":2, "c": 3}
df1 = df.copy()
df1["x1"] = df1["x1"].map(map_dic)
df1

Unnamed: 0,x1,x2,y
0,1,1,1
1,2,2,2
2,1,3,3
3,1,4,4
4,3,5,5


In [14]:
#要素数がnum以下のオブジェクトカラムをマッピングする
def mapping(train_df, test_df, num):
    train = train_df.copy()
    test = test_df.copy()
    columns = num >= train_df.select_dtypes('object').apply(pd.Series.nunique, axis = 0)
    columns = columns[columns == True]
    columns =columns.index
    
    np.array(columns)
    for col in columns:
        #yの値が高い順
        map_name = train_df[[col , "y"]].groupby([col] , as_index=False ).mean().sort_values(by="y" , ascending=False )[col].values
        
        #マッピング例 dataset["housing"] = dataset["housing"].map( { "no" : 1 , "yes" :0} ).astype(int)
        dic = {}
        for i in range(len(map_name)):
            dic[map_name[i]] = len(map_name) - i
        
        train[col] = train[col].map(dic).astype(int)
        test[col] = test[col].map(dic).astype(int)
    return train, test

In [15]:
df1 = df.copy()
train,test = mapping(df,df1,3)
train

Unnamed: 0,x1,x2,y
0,2,1,1
1,1,2,2
2,2,3,3
3,2,4,4
4,3,5,5


#  

## オブジェクトカラムの数を数える

In [16]:
def count_object(train,test):
    train_obj = train.select_dtypes('object').columns
    test_obj = test.select_dtypes("object").columns
    print("train object", len(train_obj))
    print("test object", len(test_obj))

In [17]:
df1 = df.copy()
train,test = mapping(df,df1,6)
count_object(df,df1)
print("-"*20)
count_object(train,test)

train object 2
test object 2
--------------------
train object 0
test object 0


#  

# メモリ節約 (数値を最適な型に変更)

In [18]:
# reference ・・・https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                #if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                #    df[col] = df[col].astype(np.float16)
                #el
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        #else:
            #df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB --> {:.2f} MB (Decreased by {:.1f}%)'.format(
        start_mem, end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [19]:
from sklearn.datasets import load_boston
boston = load_boston()
df_data = pd.DataFrame(boston.data,columns=boston.feature_names)
print(df_data.info())
print("-"*25)
df_data = reduce_mem_usage(df_data)
print(df_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB
None
-------------------------
Memory usage of dataframe is 0.05 MB --> 0.03 MB (Decreased by 49.9%)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float32
ZN         506 non-null float32
INDUS      506 non-null float32
CHAS       506 non-null float32
NOX        506 non-null float32
RM         506 non-null float32
AGE        506 non-null float32
DIS    