In [1]:
# General
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# prep
from sklearn.preprocessing import LabelEncoder

# tqdm
from tqdm import tqdm

# 親のフォルダのパスを追加
import sys; sys.path.insert(0, '..')

from util.data_loader import DataLoader
import importlib

In [2]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column is left as is. Integer columns are tried
    against int8, int16, int32 and int64 in that order; float columns against
    float16 and float32, falling back to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, after downcasting.

    Notes
    -----
    Float downcasting is chosen from the value *range* only. float16 keeps
    roughly three significant decimal digits, so values may be rounded.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate target dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max is
                # representable and must not force the next wider type.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. an empty column whose min/max are NaN): keep dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or min/max are NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Build the project's data loader (imported from util.data_loader).
loader = DataLoader()
# Load the data
# NOTE(review): the message says "song_infos" but the result is stored in
# `pokemon_infos` — looks like a leftover from another notebook; confirm and
# update the text.
print('Importing song_infos...')
# Load the DataFrame
pokemon_infos = loader.load()

# Print the column names, non-null counts, dtypes and memory usage.
pokemon_infos.info()

Importing song_infos...
<class 'pandas.core.frame.DataFrame'>
Index: 1010 entries, 0 to 1024
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   name             1010 non-null   object
 1   height           1010 non-null   int64 
 2   weight           1010 non-null   int64 
 3   types            1010 non-null   object
 4   hp               1010 non-null   int64 
 5   attack           1010 non-null   int64 
 6   defense          1010 non-null   int64 
 7   special-attack   1010 non-null   int64 
 8   special-defense  1010 non-null   int64 
 9   speed            1010 non-null   int64 
dtypes: int64(8), object(2)
memory usage: 86.8+ KB


In [5]:
# Preview the loaded frame as the cell's rich output.
pokemon_infos

Unnamed: 0,name,height,weight,types,hp,attack,defense,special-attack,special-defense,speed
0,bulbasaur,7,69,"[grass, poison]",45,49,49,65,65,45
1,ivysaur,10,130,"[grass, poison]",60,62,63,80,80,60
2,venusaur,20,1000,"[grass, poison]",80,82,83,100,100,80
3,charmander,6,85,[fire],39,52,43,60,50,65
4,charmeleon,11,190,[fire],58,64,58,80,65,80
...,...,...,...,...,...,...,...,...,...,...
1018,hydrapple,18,930,"[grass, dragon]",106,80,110,120,80,44
1019,gouging,35,5900,"[fire, dragon]",105,115,121,65,93,91
1020,raging,52,4800,"[electric, dragon]",125,73,91,137,89,75
1023,terapagos,2,65,[normal],90,65,85,65,85,60


In [6]:
# Global pandas display option: render floats with two decimal places.
pd.options.display.float_format = '{:.2f}'.format

def custom_info(data):
    """Print a heading, then render the ``describe()`` summary of ``data``.

    ``display`` is the notebook-provided rich-output function; nothing is
    returned.
    """
    # Heading for the basic summary statistics shown below.
    print("基本統計情報")
    summary = data.describe()
    display(summary)

# Show the describe() summary for the loaded frame.
custom_info(pokemon_infos)

基本統計情報


Unnamed: 0,height,weight,hp,attack,defense,special-attack,special-defense,speed
count,1010.0,1010.0,1010.0,1010.0,1010.0,1010.0,1010.0,1010.0
mean,12.1,664.9,69.99,77.22,72.27,69.66,69.92,66.81
std,12.56,1213.93,26.63,29.71,29.32,29.42,26.57,28.61
min,1.0,1.0,1.0,5.0,5.0,10.0,20.0,5.0
25%,5.0,85.0,50.0,55.0,50.0,46.0,50.0,45.0
50%,10.0,280.0,67.0,75.0,69.5,65.0,65.0,65.0
75%,15.0,695.5,85.0,100.0,90.0,90.0,85.0,86.75
max,200.0,9999.0,255.0,181.0,230.0,173.0,230.0,200.0


In [7]:
# Work on the frame without the "types" column (presumably dropped because its
# list values cannot be hashed by nunique() — confirm), and treat the
# placeholder string 'なし' as a missing value.
drop_types = pokemon_infos.drop(columns="types").replace('なし', np.nan)

# Per-column overview: non-null count, distinct values (NaN included), dtype,
# number of missing values and their share of all rows in percent.
missing_counts = drop_types.isnull().sum()
column_overview = [
    drop_types.count().rename("件数"),
    drop_types.nunique(dropna=False).rename("ユニーク数"),
    drop_types.dtypes.rename("型"),
    missing_counts.rename("欠損の数"),
    (missing_counts * 100 / pokemon_infos.shape[0]).rename("欠損の割合（%）").round(2),
]
pd.concat(column_overview, axis=1)

Unnamed: 0,件数,ユニーク数,型,欠損の数,欠損の割合（%）
name,1010,1010,object,0,0.0
height,1010,54,int64,0,0.0
weight,1010,476,int64,0,0.0
hp,1010,105,int64,0,0.0
attack,1010,117,int64,0,0.0
defense,1010,107,int64,0,0.0
special-attack,1010,110,int64,0,0.0
special-defense,1010,102,int64,0,0.0
speed,1010,119,int64,0,0.0
