## 2.2 DataFrameの分割

In [1]:
import pandas as pd

# Load the sample CSV, giving every non-datetime column an explicit dtype
# instead of relying on pandas' type inference.
df = pd.read_csv(
    'sample_1_5.csv',
    dtype={
        'col_int8': 'int8',
        'col_uint8': 'uint8',
        'col_int16': 'int16',
        'col_uint16': 'uint16',
        'col_int32': 'int32',
        'col_uint32': 'uint32',
        'col_int64': 'int64',
        'col_uint64': 'uint64',
        'col_float16': 'float16',
        'col_float32': 'float32',
        'col_float64': 'float64',
        'col_bool': 'bool',
        'col_string': 'string',
        'col_ordered': 'category',
        'col_unordered': 'category',
    },
    # Select the datetime column by name rather than by its position (13),
    # so the load does not silently parse the wrong column if the CSV's
    # column order ever changes.
    parse_dates=['col_datetime']
)

# Size labels listed from smallest to largest; this list order becomes the
# comparison order of the ordered categorical.
ordered_categories = ['EXTRA-SMALL', 'SMALL', 'MIDDLE', 'LARGE', 'EXTRA-LARGE']

df['col_ordered'] = df['col_ordered'].cat.set_categories(
    ordered_categories,
    ordered=True)

# Animal labels; no ordering is implied, but the list fixes the category
# order used when the column is displayed or grouped.
unordered_categories = [
    'Mouse',
    'Cat',
    'Dog',
    'Hamster',
    'Rabbit',
    'Ferret']

df['col_unordered'] = df['col_unordered'].cat.set_categories(
    unordered_categories,
    ordered=False)

# Show (rows, columns) as a quick sanity check of the load.
df.shape

(200000, 16)

In [2]:
# Print each column's name, non-null count, and dtype to confirm that the
# dtypes requested at load time were applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 16 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   col_int8       200000 non-null  int8          
 1   col_uint8      200000 non-null  uint8         
 2   col_int16      200000 non-null  int16         
 3   col_uint16     200000 non-null  uint16        
 4   col_int32      200000 non-null  int32         
 5   col_uint32     200000 non-null  uint32        
 6   col_int64      200000 non-null  int64         
 7   col_uint64     200000 non-null  uint64        
 8   col_float16    200000 non-null  float16       
 9   col_float32    200000 non-null  float32       
 10  col_float64    200000 non-null  float64       
 11  col_bool       200000 non-null  bool          
 12  col_string     200000 non-null  string        
 13  col_datetime   200000 non-null  datetime64[ns]
 14  col_ordered    200000 non-null  category      
 15  col_unordered  200000 non-null  category      

In [3]:
# Row-wise split, part 1: take the first 150,000 rows by position
# (positions 0 through 149999; the slice end is exclusive).
df1 = df.iloc[:150000]

df1.shape

(150000, 16)

In [4]:
# Row-wise split, part 2: take every row from position 150000 to the end,
# i.e. the rows not included in the [:150000] slice.
df2 = df.iloc[150000:]

df2.shape

(50000, 16)

In [5]:
# Column-wise split, part 1: keep all rows and the first 12 columns by
# position (col_int8 through col_bool).
df3 = df.iloc[:, :12]

df3.shape

(200000, 12)

In [6]:
# Column-wise split, part 2: keep all rows and the columns from position 12
# onward (col_string, col_datetime, col_ordered, col_unordered).
df4 = df.iloc[:, 12:]

df4.shape

(200000, 4)

In [7]:
# Group the rows by the categorical column 'col_unordered'.
# observed=False is stated explicitly: it is the behaviour this notebook was
# run with (every declared category gets a group), and omitting it on a
# categorical key emits a FutureWarning on pandas >= 2.1 because the default
# changes to True in a later release.
gb = df.groupby('col_unordered', observed=False)

# Show the type of the grouping object returned by groupby.
type(gb)

pandas.core.groupby.generic.DataFrameGroupBy

In [8]:
# Number of rows in each group, listed in category order.
gb.size()

col_unordered
Mouse      33383
Cat        33273
Dog        33384
Hamster    33287
Rabbit     33602
Ferret     33071
dtype: int64

In [9]:
# Pull out the rows belonging to a single group ('Mouse') as a DataFrame.
# The rows keep their original index labels from df.
df_mouse = gb.get_group('Mouse')

df_mouse.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
4,12,212,13470,57662,812726532,1340894438,-1791361854263043924,2901203935713848353,0.620117,0.325553,0.450991,False,v0TZ8qlw,1700-01-05 16:37:16,EXTRA-SMALL,Mouse
17,-118,72,-14622,57604,1582005113,513547424,1401451633856761704,15366429918558710399,0.737305,0.61086,0.922958,True,BvHzVsau,1700-01-18 13:00:45,MIDDLE,Mouse
23,18,98,-30154,12532,67134227,514303156,1365196833970852386,15568901983481002003,0.760742,0.494002,0.607546,True,JRXRXE,1700-01-24 08:47:02,EXTRA-LARGE,Mouse
27,-44,205,25230,54757,570723259,2591380613,6262849499533420187,1727506916833625085,0.67041,0.066153,0.933833,True,cECY49,1700-01-28 20:33:08,LARGE,Mouse
41,-51,180,4625,26306,1367833466,2546003471,-6956886051460496162,13868885902914802716,0.833984,0.28215,0.223907,True,SfbgF,1700-02-11 21:55:07,MIDDLE,Mouse


In [10]:
# Pull out the rows belonging to the 'Cat' group.
df_cat = gb.get_group('Cat')
# The same rows can also be selected with
# df.loc[df.loc[:, 'col_unordered'] == 'Cat'] or
# df.query('col_unordered == "Cat"').

df_cat.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
5,-61,225,-18121,9830,-1446151776,3013488260,-2701692826071598425,2241484607082862008,0.073425,0.971069,0.018226,True,srbjJLP,1700-01-06 13:51:19,LARGE,Cat
8,-23,54,-7424,65385,-1324881698,953993731,6128203876391204943,14000964572561896882,0.140381,0.604176,0.108115,True,mXjeG6,1700-01-09 21:10:42,EXTRA-SMALL,Cat
16,-98,22,-16646,58094,-658607969,655607873,3404345805479656430,5096669087158830124,0.07605,0.832736,0.199611,True,TIic,1700-01-17 04:50:04,SMALL,Cat
31,-111,187,-16968,42828,-1960399259,822734557,-4765912514916795068,18406417208120733228,0.828125,0.309161,0.219022,True,yR6pM,1700-02-01 04:58:36,LARGE,Cat
34,66,152,12013,4952,-820819230,3512962893,-8713382684225017206,2055208308306467596,0.740234,0.258134,0.437826,False,15C4mC2,1700-02-04 09:05:33,EXTRA-LARGE,Cat


In [11]:
# Iterating over the groupby object yields (group key, sub-DataFrame) pairs;
# report the key and the shape of each sub-DataFrame on one line.
for group, df_group in gb:
    print(f'{group} {df_group.shape}')

Mouse (33383, 16)
Cat (33273, 16)
Dog (33384, 16)
Hamster (33287, 16)
Rabbit (33602, 16)
Ferret (33071, 16)


In [12]:
from sklearn.model_selection import train_test_split

# Randomly split the rows into a training set (80%) and a test set (20%).
# random_state=0 fixes the shuffle so the split is reproducible.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)
# With 200000 rows, test_size=0.2 yields the same split sizes as passing the
# absolute count: train_test_split(df, test_size=40000).

df_train.shape, df_test.shape

((160000, 16), (40000, 16))

In [13]:
# Inspect the training rows: they keep their original index labels from df,
# so after shuffling the index is no longer sequential.
df_train.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
127478,-72,27,3853,57491,-1112717343,3009994274,-2993066590894730154,4856804777963727627,0.265625,0.767206,0.196115,True,tWM,2049-01-09 18:47:58,MIDDLE,Cat
155552,94,36,5166,28348,1370340258,2042364264,-3777001664252634560,6705984686542068610,0.45459,0.142779,0.72772,False,462l7q8,2125-11-21 10:48:47,MIDDLE,Mouse
75475,10,29,3365,29491,-1965935571,947581500,7570720735943845926,15179047376142702404,0.654297,0.65585,0.840668,False,0se2ZxuD,1906-08-25 14:16:24,EXTRA-SMALL,Hamster
186114,29,93,5840,27722,575550766,2230778676,3868248375348797193,10046415197673935811,0.540527,0.407988,0.555073,True,hTh3fTs,2209-07-26 05:46:58,SMALL,Rabbit
93717,-65,165,-2511,994,61908793,2286780953,-5675846870268755869,6211416484931316168,0.745117,0.731461,0.311473,False,kcOWM,1956-08-04 01:10:50,EXTRA-SMALL,Ferret


In [14]:
# Replace the shuffled index labels with a fresh 0, 1, 2, ... index.
# drop=True discards the old labels instead of keeping them as a new column.
df_train = df_train.reset_index(drop=True)

df_train.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
0,-72,27,3853,57491,-1112717343,3009994274,-2993066590894730154,4856804777963727627,0.265625,0.767206,0.196115,True,tWM,2049-01-09 18:47:58,MIDDLE,Cat
1,94,36,5166,28348,1370340258,2042364264,-3777001664252634560,6705984686542068610,0.45459,0.142779,0.72772,False,462l7q8,2125-11-21 10:48:47,MIDDLE,Mouse
2,10,29,3365,29491,-1965935571,947581500,7570720735943845926,15179047376142702404,0.654297,0.65585,0.840668,False,0se2ZxuD,1906-08-25 14:16:24,EXTRA-SMALL,Hamster
3,29,93,5840,27722,575550766,2230778676,3868248375348797193,10046415197673935811,0.540527,0.407988,0.555073,True,hTh3fTs,2209-07-26 05:46:58,SMALL,Rabbit
4,-65,165,-2511,994,61908793,2286780953,-5675846870268755869,6211416484931316168,0.745117,0.731461,0.311473,False,kcOWM,1956-08-04 01:10:50,EXTRA-SMALL,Ferret


In [15]:
# Replace the shuffled index labels with a fresh 0, 1, 2, ... index.
# drop=True discards the old labels instead of keeping them as a new column.
# Reassignment is used instead of inplace=True so this cell follows the same
# pattern as the df_train cell and avoids mutating the object in place.
df_test = df_test.reset_index(drop=True)

df_test.head()

Unnamed: 0,col_int8,col_uint8,col_int16,col_uint16,col_int32,col_uint32,col_int64,col_uint64,col_float16,col_float32,col_float64,col_bool,col_string,col_datetime,col_ordered,col_unordered
0,-18,60,10529,21271,197061330,1639714456,1257327177626768709,1723013210922501670,0.886719,0.991829,0.792836,True,RCr4k5I,1849-02-07 00:11:20,MIDDLE,Ferret
1,-69,89,7214,29271,1818787157,1302698397,1204508283466257026,10474355017457038888,0.731445,0.611232,0.162229,False,9XRF,2024-11-04 00:13:00,LARGE,Cat
2,-49,101,-25236,63527,-201377119,492804185,-3153268850932708637,4950203708192017949,0.710449,0.595511,0.062612,True,n8TbGiO,1857-02-28 15:29:48,MIDDLE,Ferret
3,15,204,32555,62806,1288807388,1557849753,2720881038481201732,3210356497066780303,0.191162,0.477569,0.13378,False,BBx57b,2015-10-01 16:29:55,MIDDLE,Mouse
4,38,35,-15893,60334,1084023158,1797398521,1908318620199937126,2261059718305035407,0.777344,0.565411,0.517109,True,eL8HBU,1896-04-07 20:00:40,MIDDLE,Dog
