In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# 데이터 불러오기
df = pd.read_csv("train.csv")  # 파일 경로에 맞게 수정

# 1. 로그 변환
df['Annual Income'] = np.log1p(df['Annual Income'])

# 2. 결측치 처리
mean_fill = ['Health Score', 'Credit Score', 'Age', 'Vehicle Age']
median_fill = ['Previous Claims']

for col in mean_fill:
    df[col] = df[col].fillna(df[col].mean())

for col in median_fill:
    df[col] = df[col].fillna(df[col].median())

# 3. Label Encoding (범주형)
label_cols = [
    'Marital Status', 'Education Level', 'Policy Type', 'Gender',
    'Smoking Status', 'Number of Dependents', 'Property Type',
    'Exercise Frequency', 'Location'
]

for col in label_cols:
    df[col] = df[col].astype(str)
    df[col] = LabelEncoder().fit_transform(df[col])

# 4. 고유값 제거
df.drop(columns=['id'], inplace=True)

# 5. 날짜형 처리
df['Policy Start Date'] = pd.to_datetime(df['Policy Start Date'], errors='coerce')
df['Policy_Year'] = df['Policy Start Date'].dt.year
df['Policy_Month'] = df['Policy Start Date'].dt.month
df.drop(columns=['Policy Start Date'], inplace=True)

# 6. 다중 클래스 범주형 -> 빈도 인코딩
for col in ['Occupation', 'Customer Feedback']:
    freq_map = df[col].value_counts().to_dict()
    df[col] = df[col].map(freq_map)

# 전처리 완료
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 21 columns):
 #   Column                Non-Null Count    Dtype  
---  ------                --------------    -----  
 0   Age                   1200000 non-null  float64
 1   Gender                1200000 non-null  int64  
 2   Annual Income         1155051 non-null  float64
 3   Marital Status        1200000 non-null  int64  
 4   Number of Dependents  1200000 non-null  int64  
 5   Education Level       1200000 non-null  int64  
 6   Occupation            841925 non-null   float64
 7   Health Score          1200000 non-null  float64
 8   Location              1200000 non-null  int64  
 9   Policy Type           1200000 non-null  int64  
 10  Previous Claims       1200000 non-null  float64
 11  Vehicle Age           1200000 non-null  float64
 12  Credit Score          1200000 non-null  float64
 13  Insurance Duration    1199999 non-null  float64
 14  Customer Feedback     1122176 non-

In [6]:
import pandas as pd


# 1. Annual Income 결측치 제거
df = df.dropna(subset=['Annual Income'])


# 2. Occupation 결측치를 'Unknown'으로 대체
df['Occupation'] = df['Occupation'].fillna('Unknown')


# 3. Customer Feedback 결측치를 'Unknown'으로 대체
df['Customer Feedback'] = df['Customer Feedback'].fillna('Unknown')


#4.
df.drop('Insurance Duration', axis=1, inplace=True)  # 보험 청구 여부 제거
# 결과 확인
print(df.isnull().sum())


# 저장
df.to_csv("cleaned_train.csv", index=False)

Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
Policy_Year             0
Policy_Month            0
dtype: int64


In [7]:
df.isnull().sum()
print(df.count)

<bound method DataFrame.count of           Age  Gender  Annual Income  Marital Status  Number of Dependents  \
0        19.0       0       9.215328               1                     1   
1        39.0       0      10.363409               0                     3   
2        23.0       1      10.150465               0                     3   
3        21.0       1      11.862568               1                     2   
4        21.0       1      10.587897               2                     1   
...       ...     ...            ...             ...                   ...   
1199993  38.0       1       7.382746               1                     1   
1199994  34.0       1      10.062924               2                     4   
1199995  36.0       0      10.215264               1                     0   
1199996  54.0       1      10.485340               0                     5   
1199997  19.0       1      10.856785               0                     0   

         Education Level Occup

In [None]:
df.dorp