# 準備

In [1]:
# Notebook-wide imports: pandas loads the CSV, TFDV computes statistics / schemas /
# anomalies, and IPython's display renders DataFrames from the middle of a cell.
import pandas as pd
import tensorflow_data_validation as tfdv
from IPython.display import display
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# which can also hide relevant deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Load the Titanic CSV; the path is relative, so it resolves against the kernel's
# current working directory.
df = pd.read_csv('data/titanic.csv')

ちなみに、`df.info()`で情報を出すと次のようになります。

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# TFDV の基本操作

## 統計量の生成・可視化

まず`tfdv.generate_statistics_from_dataframe`を使用して各特徴量の統計量を作成していきます。戻り値はProtocol Buffersのメッセージになっています。

In [4]:
# Compute per-feature statistics for the whole DataFrame.
stats = tfdv.generate_statistics_from_dataframe(df)
# Print the raw text form of the result to show what it contains; the dump is long.
print(stats)

datasets {
  num_examples: 891
  features {
    num_stats {
      common_stats {
        num_non_missing: 891
        min_num_values: 1
        max_num_values: 1
        avg_num_values: 1.0
        num_values_histogram {
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 1.0
            sample_count: 89.1
          }
          buckets {
            low_value: 1.0
            high_value: 

`visualize_statistics`を使ってデータの統計量を可視化します。

In [5]:
# Render the statistics computed above as TFDV's visual report.
tfdv.visualize_statistics(stats)

ちなみに、CSVあるいはTFRecordのファイルを直接読み込んで統計量を作り、可視化することもできます。

In [6]:
# Statistics can also be computed straight from a file on disk. For TFRecord input,
# swap generate_statistics_from_csv for generate_statistics_from_tfrecord.
csv_stats = tfdv.generate_statistics_from_csv('data/titanic.csv', delimiter=',')
tfdv.visualize_statistics(csv_stats)





Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`


## スキーマの生成・可視化

tfdvではデータからスキーマを自動生成することができ、かつそのスキーマを可視化することができます。

In [6]:
# Recompute the statistics for the full DataFrame and infer a schema from them.
stats = tfdv.generate_statistics_from_dataframe(df)
schema = tfdv.infer_schema(stats)
# Bare name as the last expression so the inferred schema is shown as the cell output.
schema

feature {
  name: "PassengerId"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Survived"
  type: INT
  bool_domain {
  }
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Pclass"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Name"
  type: BYTES
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Sex"
  type: BYTES
  domain: "Sex"
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }
}
feature {
  name: "Age"
  value_count {
    min: 1
    max: 1
  }
  type: FLOAT
  presence {
    min_count: 1
  }
}
feature {
  name: "SibSp"
  type: INT
  presence {
    min_fraction: 1.0
    min_count: 1
  }
  shape {
    dim {
      size: 1
    }
  }

In [7]:
# Render the inferred schema as tables: one row per feature, then the value lists
# of the string domains.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'PassengerId',INT,required,,-
'Survived',INT,required,,-
'Pclass',INT,required,,-
'Name',BYTES,required,,-
'Sex',STRING,required,,'Sex'
'Age',FLOAT,optional,single,-
'SibSp',INT,required,,-
'Parch',INT,required,,-
'Ticket',BYTES,required,,-
'Fare',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Sex',"'female', 'male'"
'Embarked',"'C', 'Q', 'S'"


# 異常の検知

## 準備

In [8]:
# Hold out everything after the first 600 rows as the validation split. Only the
# validation frame gets a fresh 0-based index; the training slice keeps the original one.
split_at = 600
train = df.iloc[:split_at]
valid = df.iloc[split_at:].reset_index(drop=True)

# Report each split's shape, followed by a preview of its first rows.
for split_name, split_frame in (('train', train), ('valid', valid)):
    print(f'{split_name}: {split_frame.shape}')
    display(split_frame.head())

train: (600, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


valid: (291, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24.0,2,1,243847,27.0,,S
1,602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S
2,603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S
3,604,0,3,"Torber, Mr. Ernst William",male,44.0,0,0,364511,8.05,,S
4,605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35.0,0,0,111426,26.55,,C


## スキーマ異常の検知

In [9]:
# Infer the reference schema from the training split only; the anomaly checks that
# follow validate other data against this schema.
schema = tfdv.infer_schema(tfdv.generate_statistics_from_dataframe(train))

### カラムが欠けていた場合

In [10]:
# Simulate data that lost a column: remove 'Pclass' from the validation split and
# validate the result against the reference schema.
# DataFrame.drop returns a new frame (inplace defaults to False), so `valid` stays
# intact without an explicit .copy() beforehand.
lack_col_valid = valid.drop(columns='Pclass')
lack_col_stats = tfdv.generate_statistics_from_dataframe(lack_col_valid)
anomaly = tfdv.validate_statistics(lack_col_stats, schema)
tfdv.display_anomalies(anomaly)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Pclass',Column dropped,Column is completely missing


### 未知の値が入っていた場合

In [11]:
# Simulate an unseen category: overwrite Sex in the row labelled 0 with 'dog', then
# validate the tampered split against the reference schema.
new_val_in_sex_valid = valid.copy()  # work on a copy so `valid` itself is not modified
new_val_in_sex_valid.loc[0, 'Sex'] = 'dog'

tampered_stats = tfdv.generate_statistics_from_dataframe(new_val_in_sex_valid)
anomaly = tfdv.validate_statistics(tampered_stats, schema)
tfdv.display_anomalies(anomaly)

Unnamed: 0_level_0,Anomaly short description,Anomaly long description
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1
'Sex',Unexpected string values,Examples contain values missing from the schema: dog (<1%).


### 異常がない場合

In [12]:
# Baseline case: validate statistics of the training split against `schema`.
# NOTE(review): in linear execution order `schema` was inferred from this same
# `train` frame, so this check is expected to pass by construction — confirm that
# this is the intended demonstration.
anomaly = tfdv.validate_statistics(tfdv.generate_statistics_from_dataframe(train),
                                   schema,
                                  )
tfdv.display_anomalies(anomaly)

In [13]:
# Show the schema currently bound to `schema` (in linear execution order, the one
# inferred from the training split) for reference next to the anomaly results.
tfdv.display_schema(schema)

Unnamed: 0_level_0,Type,Presence,Valency,Domain
Feature name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
'PassengerId',INT,required,,-
'Survived',INT,required,,-
'Pclass',INT,required,,-
'Name',BYTES,required,,-
'Sex',STRING,required,,'Sex'
'Age',FLOAT,optional,single,-
'SibSp',INT,required,,-
'Parch',INT,required,,-
'Ticket',BYTES,required,,-
'Fare',FLOAT,required,,-


Unnamed: 0_level_0,Values
Domain,Unnamed: 1_level_1
'Sex',"'female', 'male'"
'Embarked',"'C', 'Q', 'S'"


## 分布の異常検知

In [17]:
# Build a deliberately shifted validation split by doubling every Age value; assign
# returns a new frame, so `valid` itself is left unchanged.
error_distribution_valid = valid.assign(Age=valid['Age'] * 2)

# Compare the training statistics (left) with the shifted validation statistics
# (right) in a single visual report.
train_stats = tfdv.generate_statistics_from_dataframe(train)
shifted_valid_stats = tfdv.generate_statistics_from_dataframe(error_distribution_valid)
tfdv.visualize_statistics(
    lhs_statistics=train_stats,
    rhs_statistics=shifted_valid_stats,
    lhs_name='TRAIN_DATASET',
    rhs_name='VALID_DATASET',
)