In [1]:
import pandas as pd

# 데이터 불러오기
insurance_df = pd.read_csv('C:\\R\\insurance.csv')

# 중복된 데이터의 수 확인
duplicate_count = insurance_df.duplicated().sum()

# 중복된 데이터 제거
insurance_df.drop_duplicates(inplace=True)

duplicate_count, insurance_df.shape

(1, (1337, 7))

In [2]:
# 널값의 수 확인
null_count = insurance_df.isnull().sum()

# 널값이 있는 행 제거
insurance_df.dropna(inplace=True)

null_count, insurance_df.shape

(age         0
 sex         0
 bmi         0
 children    0
 smoker      0
 region      0
 charges     0
 dtype: int64,
 (1337, 7))

In [3]:
insurance_df.rename(columns={'sex': 'gender', 'charges': 'chrges'}, inplace=True)
insurance_df.columns

Index(['age', 'gender', 'bmi', 'children', 'smoker', 'region', 'chrges'], dtype='object')

In [4]:
insurance_df['gender'] = insurance_df['gender'].replace({'female': 1, 'male': 0})

# 변경된 값을 확인하기 위해 'gender' 열의 고유값 확인
insurance_df['gender'].unique()

array([1, 0], dtype=int64)

In [5]:
#기초 통계량을 통해 이상값 찾기
insurance_df.describe()

Unnamed: 0,age,gender,bmi,children,chrges
count,1337.0,1337.0,1337.0,1337.0,1337.0
mean,39.222139,0.495138,30.663452,1.095737,13279.121487
std,14.044333,0.500163,6.100468,1.205571,12110.359656
min,18.0,0.0,15.96,0.0,1121.8739
25%,27.0,0.0,26.29,0.0,4746.344
50%,39.0,0.0,30.4,1.0,9386.1613
75%,51.0,1.0,34.7,2.0,16657.71745
max,64.0,1.0,53.13,5.0,63770.42801


In [7]:
def detect_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]

# 각 열에서 이상치가 발견되는 행의 수 확인
outliers_counts = {column: len(detect_outliers(insurance_df, column)) for column in insurance_df.columns if insurance_df[column].dtype in ['float64', 'int64']}

outliers_counts

{'age': 0, 'gender': 0, 'bmi': 9, 'children': 0, 'chrges': 139}

In [9]:
# 'bmi'와 'chrges' 열의 이상치 제거
bmi_outliers = detect_outliers(insurance_df, 'bmi').index
chrges_outliers = detect_outliers(insurance_df, 'chrges').index

# 이상치 인덱스 병합
all_outliers = bmi_outliers.union(chrges_outliers)

# 이상치 제거
insurance_df.drop(all_outliers, inplace=True)

insurance_df.shape

(1130, 7)

In [10]:
# Min-Max 정규화 함수 정의
def min_max_scaling(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column] = (df[column] - min_val) / (max_val - min_val)
    return df

# 숫자 데이터에 대해 min-max 방법으로 정규화
for column in insurance_df.columns:
    if insurance_df[column].dtype in ['float64', 'int64']:
        insurance_df = min_max_scaling(insurance_df, column)

# 변경된 데이터의 처음 몇 줄 확인
insurance_df.head()


Unnamed: 0,age,gender,bmi,children,smoker,region,chrges
0,0.021739,1.0,0.405158,0.0,yes,southwest,0.641899
1,0.0,0.0,0.604343,0.2,no,southeast,0.024583
2,0.217391,0.0,0.578215,0.6,no,southeast,0.135505
3,0.326087,0.0,0.228877,0.0,no,northwest,0.849561
4,0.304348,0.0,0.438412,0.0,no,northwest,0.11178


In [11]:
# 'chrges' 열과의 상관계수 계산
correlations = insurance_df.corr()['chrges']

# 'chrges'와의 상관계수가 낮은 열(절대값 기준)을 삭제
columns_to_drop = correlations[correlations.abs() < 0.1].index
insurance_df.drop(columns=columns_to_drop, inplace=True)

insurance_df.head()

ValueError: could not convert string to float: 'yes'