# [pandas前処理編] 第2章 解説
Copyright © 2022 田中裕行 All rights reserved.

## ■(1) pandasの基礎

In [None]:
# ライブラリのインポート
import pandas as pd

# Convert a plain Python list into a Series (one-dimensional array)
list1 = [12, 34, 56]
se = pd.Series(data=list1)
print(se)

0    12
1    34
2    56
dtype: int64


In [None]:
# Convert a nested list into a DataFrame (two-dimensional array);
# each inner list becomes one row.
list2 = [
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
]
df = pd.DataFrame(data=list2)
print(df)

   0  1  2
0  1  2  3
1  4  5  6
2  7  8  9


## ■(2) データの読込

In [None]:
# サンプルデータを利用するためにSeabornをインポート
import seaborn as sns
# Look the dataset names up once and reuse the result instead of
# calling get_dataset_names() twice for the same list.
dataset_names = sns.get_dataset_names()
print(dataset_names)       # names of the available sample datasets
print(len(dataset_names))  # number of available sample datasets

['anagrams', 'anscombe', 'attention', 'brain_networks', 'car_crashes', 'diamonds', 'dots', 'dowjones', 'exercise', 'flights', 'fmri', 'geyser', 'glue', 'healthexp', 'iris', 'mpg', 'penguins', 'planets', 'seaice', 'taxis', 'tips', 'titanic']
22


In [None]:
# Load the iris data from Seaborn's sample datasets
iris = sns.load_dataset('iris')
print(iris.head(n=5))  # head returns the first 5 rows

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [None]:
# Load the Titanic data from Seaborn's sample datasets
titanic = sns.load_dataset('titanic')
print(titanic.head(n=5))  # head returns the first 5 rows

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


## ■(3) データの表示

In [None]:
print(titanic.tail(n=5))  # tail returns the last 5 rows

     survived  pclass     sex   age  sibsp  parch   fare embarked   class  \
886         0       2    male  27.0      0      0  13.00        S  Second   
887         1       1  female  19.0      0      0  30.00        S   First   
888         0       3  female   NaN      1      2  23.45        S   Third   
889         1       1    male  26.0      0      0  30.00        C   First   
890         0       3    male  32.0      0      0   7.75        Q   Third   

       who  adult_male deck  embark_town alive  alone  
886    man        True  NaN  Southampton    no   True  
887  woman       False    B  Southampton   yes   True  
888  woman       False  NaN  Southampton    no  False  
889    man        True    C    Cherbourg   yes   True  
890    man        True  NaN   Queenstown    no   True  


In [None]:
print(titanic) # printing the whole frame shows only the first and last 5 rows

     survived  pclass     sex   age  sibsp  parch     fare embarked   class  \
0           0       3    male  22.0      1      0   7.2500        S   Third   
1           1       1  female  38.0      1      0  71.2833        C   First   
2           1       3  female  26.0      0      0   7.9250        S   Third   
3           1       1  female  35.0      1      0  53.1000        S   First   
4           0       3    male  35.0      0      0   8.0500        S   Third   
..        ...     ...     ...   ...    ...    ...      ...      ...     ...   
886         0       2    male  27.0      0      0  13.0000        S  Second   
887         1       1  female  19.0      0      0  30.0000        S   First   
888         0       3  female   NaN      1      2  23.4500        S   Third   
889         1       1    male  26.0      0      0  30.0000        C   First   
890         0       3    male  32.0      0      0   7.7500        Q   Third   

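       who  adult_male deck  embark_town alive  alone  
0      man        True  NaN  Southampton    no  False  
1    woman       False    C    Cherbourg   yes  False  
2    woman       False  NaN  Southampton   yes   True  
3    woman       False    C  Southampton   yes  False  
4      man        True  NaN  Southampton    no   True  
..     ...         ...  ...          ...   ...    ...  
886    man        True  NaN  Southampton    no   True  
887  woman       False    B  Southampton   yes   True  
888  woman       False  NaN  Southampton    no  False  
889    man        True    C    Cherbourg   yes   True  
890    man        True  NaN   Queenstown    no   True  

[891 rows x 15 columns]
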

## ■(4) データの要約

In [None]:
# shape gives the (number of rows, number of columns) tuple
titanic.shape

(891, 15)

In [None]:
# describe shows summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## ■(5) データの追加

In [None]:
# Add a row
df1 = pd.DataFrame([[1,2,3],[4,5,6]])
print(df1)

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat yields the same frame; the added row keeps its own index
# label (0), exactly as append did.
df1 = pd.concat([df1, pd.DataFrame([[7,8,9]])])
print(df1)

   0  1  2
0  1  2  3
1  4  5  6
   0  1  2
0  1  2  3
1  4  5  6
0  7  8  9


In [None]:
# Add a column
df2 = pd.DataFrame(dict(A=[1, 2, 3], B=[4, 5, 6]))
print(df2)

# assign returns a new frame with column C added on the right
df2 = df2.assign(**{'C': [7, 8, 9]})
print(df2)

   A  B
0  1  4
1  2  5
2  3  6
   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9


## ■(6) データの結合

In [None]:
# Stack two frames vertically (concat along the row axis)
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
df2 = pd.DataFrame([[11, 22, 33], [44, 55, 66]])
# axis='index' is the same as axis=0 (the default): rows are appended
dfv = pd.concat([df1, df2], axis='index')
print(dfv)

    0   1   2
0   1   2   3
1   4   5   6
0  11  22  33
1  44  55  66


In [None]:
# Join two frames side by side (concat along the column axis)
df1 = pd.DataFrame([[1, 2, 3], [4, 5, 6]])
df2 = pd.DataFrame([[11, 22, 33], [44, 55, 66]])
# axis='columns' is the same as axis=1: columns are appended
dfh = pd.concat([df1, df2], axis='columns')
print(dfh)

   0  1  2   0   1   2
0  1  2  3  11  22  33
1  4  5  6  44  55  66


## ■(7) データの分割

In [None]:
# Inspect the data before splitting
print('元データの行数と列数', titanic.shape)
n = len(titanic) // 2  # half of the row count (floor division)
print('半分の値', n)

# Split the rows into two parts by position
ti_train = titanic.iloc[:n]  # first half, kept as training data
ti_test = titanic.iloc[n:]   # second half, kept as validation data

# Inspect the data after splitting
print('学習用データの行数と列数', ti_train.shape)
print('検証用データの行数と列数', ti_test.shape)

元データの行数と列数 (891, 15)
半分の値 445
学習用データの行数と列数 (445, 15)
検証用データの行数と列数 (446, 15)


## ■(8) データの集計

In [None]:
# Fruit sales data, written as one (fruit, price) record per row
df = pd.DataFrame(
    [
        ('apple', 230),
        ('banana', 360),
        ('apple', 280),
        ('orange', 320),
        ('banana', 240),
        ('apple', 430),
    ],
    columns=['fruit', 'price'],
)
df

Unnamed: 0,fruit,price
0,apple,230
1,banana,360
2,apple,280
3,orange,320
4,banana,240
5,apple,430


In [None]:
# Total price per fruit type
df.groupby('fruit').sum()

Unnamed: 0_level_0,price
fruit,Unnamed: 1_level_1
apple,940
banana,600
orange,320


In [None]:
 # Mean price per fruit type
df.groupby('fruit').mean()

Unnamed: 0_level_0,price
fruit,Unnamed: 1_level_1
apple,313.333333
banana,300.0
orange,320.0


## ■(9) データの加工1(欠損値の置換)

In [None]:
# Replace missing values with the mean
df = titanic['age'].copy()  # work on a copy of the age column
print('欠損値の個数：', df.isna().sum())
age_mean = df.mean()  # mean of the non-missing values
print('年齢の平均値：', age_mean)
df = df.fillna(age_mean)  # fillna replaces every NaN with the mean
print('欠損値の個数：', df.isna().sum())

欠損値の個数： 177
年齢の平均値： 29.69911764705882
欠損値の個数： 0


In [None]:
# Replace missing values with the median
df = titanic['age'].copy()  # work on a copy of the age column
print('欠損値の個数：', df.isna().sum())
age_median = df.median()  # median of the non-missing values
print('年齢の中央値：', age_median)
df = df.fillna(age_median)  # fillna replaces every NaN with the median
print('欠損値の個数：', df.isna().sum())

欠損値の個数： 177
年齢の中央値： 28.0
欠損値の個数： 0


In [None]:
# Replace missing values with the mode
df = titanic['age'].copy()  # work on a copy of the age column
print('欠損値の個数：', df.isna().sum())
# mode() returns a Series of the most frequent values; take the first one
age_mode = df.mode().iloc[0]
print('年齢の最頻値：', age_mode)
df = df.fillna(age_mode)  # fillna replaces every NaN with the mode
print('欠損値の個数：', df.isna().sum())

欠損値の個数： 177
年齢の最頻値： 24.0
欠損値の個数： 0


## ■(10) データの加工2(非数値をダミー変数に変換)

In [None]:
# Pick the string-valued column that will be converted to numbers
df = titanic['sex'].copy()  # work on a copy of the sex column
print(df.head(n=5))         # show the copied data

0      male
1    female
2    female
3    female
4      male
Name: sex, dtype: object


In [None]:
# Convert the categorical column to dummy (one-hot) variables.
# dtype=int keeps the columns as 0/1; since pandas 2.0 the default dtype
# is bool, which would print True/False instead.
dummies = pd.get_dummies(df, dtype=int)  # encode once and reuse the result
print(dummies.head())            # check the dummy variables
df = dummies                     # replace the data with its dummy-variable form
print(df.head())                 # show the converted data

   female  male
0       0     1
1       1     0
2       1     0
3       1     0
4       0     1
   female  male
0       0     1
1       1     0
2       1     0
3       1     0
4       0     1


## ■(11) データの保存と読込

In [None]:
# Save the pandas DataFrame to Google Drive as a CSV file
from google.colab import drive
drive.mount('/content/drive')
# index=False: do not write the row index. Without it the index is stored
# as an unnamed first column and comes back as "Unnamed: 0" on read_csv.
titanic.to_csv('/content/drive/My Drive/titanic.csv', index=False)

Mounted at /content/drive


In [None]:
# Read the CSV file from Google Drive into a pandas DataFrame
from google.colab import drive 
# Mounting again when the drive is already mounted only prints a notice
drive.mount('/content/drive')
# NOTE: a CSV that was written together with its row index is read back
# with that index as an extra "Unnamed: 0" column.
df = pd.read_csv('/content/drive/My Drive/titanic.csv')
df.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
