In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats as stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score
from sklearn.model_selection import KFold
from warnings import filterwarnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and runtime warnings raised by the libraries imported above.
filterwarnings('ignore')

In [None]:
# 1-1. Load data
# Read the raw CSV into `df`, the DataFrame used by the quality-index cells below.
# NOTE(review): this is an absolute, user-specific Windows path, so the cell only
# runs on a machine where this exact file exists — consider a configurable data
# directory instead.
file_path = r"C:\Users\tempuser\OneDrive - postech.ac.kr\2025\7. KAMP 경진대회\2025_KAMP_anomaly_detection\data\2. 소성가공 품질보증 AI 데이터셋.csv"
df = pd.read_csv(file_path)
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

In [None]:
# Completeness quality index check
# For every column, report the percentage of missing values, whether that
# percentage exceeds `ratio`, and a per-column completeness index. The overall
# index printed at the end is the mean of the per-column indices.
ratio = 30  # missing-rate threshold, in percent

# Running total of the per-column completeness indices. Deliberately not named
# `sum`: a top-level `sum` would shadow the builtin for every later cell.
completeness_total = 0
n_rows = len(df)

for column in df.columns:
    column_values = df[column]

    # Count this column's missing values once and reuse the result below.
    cmpt_len = column_values.isnull().sum()
    missing_pct = cmpt_len / n_rows * 100
    completeness_pct = (1 - cmpt_len / n_rows) * 100

    print('column name :', column)
    print('[step 1] 변수별 결측 비율')
    print(f"{round(missing_pct, 2)}%")

    print('[step 2] 변수별 결측 비율 30% 초과 여부')
    print(missing_pct > ratio)

    print('[step 3] 전체 데이터셋 결측치 개수')
    # The value printed here is the missing count of the current column.
    print(cmpt_len)

    print(f"결측치 = {cmpt_len}개")

    print(f"완전성 지수 : {round(completeness_pct, 2)}%")
    print('='*30)
    completeness_total += completeness_pct

# Guard the average against a frame with no columns, mirroring the zero guard
# used by the uniqueness check.
n_columns = len(df.columns)
if n_columns > 0:
    overall_completeness = round(completeness_total / n_columns, 2)
else:
    overall_completeness = 0.0
print(f"전체 완전성 지수 : {overall_completeness}%")

In [None]:
# Uniqueness quality index check
# Count how often each distinct row occurs. dropna=False keeps rows that contain
# missing values; with the default (dropna=True) such rows are left out of the
# counts and would silently be excluded from the index.
check_unique = df.value_counts(dropna=False).reset_index(name = 'count')
duplicates = check_unique[check_unique['count'] > 1]

# Number of distinct rows, and number of distinct rows that occur more than once
# (len() of an empty selection is already 0, so no special case is needed).
total_unique_rows = len(check_unique)
duplicate_rows = len(duplicates)

# Share of distinct rows that are not duplicated; 0.0 when there are no rows.
if total_unique_rows > 0:
    perc_check_unique = round((total_unique_rows - duplicate_rows) / total_unique_rows * 100, 2)
else:
    perc_check_unique = 0.0

print(f"1개(고유 행) : {total_unique_rows}")
print(f"2개 이상(중복 행) : {duplicate_rows}")
print(f"유일성 지수 : {perc_check_unique}%")

In [None]:
# Consistency quality index
# Print each column name together with the dtype pandas assigned to it.
for col, data_type in df.dtypes.items():
    print(f'{col}: {data_type}')

In [None]:
# Validity quality index check
# A column is invalid when any of its values is neither missing nor an instance
# of the Python type paired with the column dtype (int for int64, float for
# float64). Columns of any other dtype are not inspected, yet they still count
# toward the total, i.e. they are treated as valid.
expected_types = (('int64', int), ('float64', float))

total_valid_cols = len(df.columns)
invalid_cols = 0

for column in df.columns:
    dtype = df[column].dtype

    # Pick the Python type to test against; None means "dtype not checked".
    expected = next((py_type for name, py_type in expected_types if dtype == name), None)
    if expected is None:
        continue

    valid_check = df[column].apply(lambda x: isinstance(x, expected) or pd.isna(x))

    unique_valids = valid_check.drop_duplicates().tolist()
    print(f"{column} : 데이터 타입 {dtype}, 유효성 {unique_valids}")

    # A single False among the per-value results marks the whole column invalid.
    if False in unique_valids:
        invalid_cols += 1

# Guard against a frame with no columns, mirroring the zero guard used by the
# uniqueness check.
if total_valid_cols > 0:
    validity_score = round((total_valid_cols - invalid_cols) / total_valid_cols * 100, 2)
else:
    validity_score = 0.0
print(f"유효성 지수 : {validity_score}%")