# 데이터 구조 및 전처리 코드

## 1. 데이터 구조 파악
- train.csv: 22,200행 × 49열 (id + 46개의 입력 변수 + shares + y)
- test.csv: 9,515행 × 47열 (id + 46개의 입력 변수)
- target(y): 0과 1이 거의 균형

## 2. 결측치 비율 확인
- 모든 입력 변수에 약 10% 내외의 결측치 존재 (`id`는 결측치 없음)

## 3. 변수 유형
특성변수
- 수치형 변수: 대부분 float64
- 범주형 변수: `data_channel`, `weekday` (object)

목표변수
- 수치형 변수: `shares` (int64)
- 범주형 변수: `y` (int64, 0과 1로 이루어진 이진 범주형 변수)

## 4. 전처리 방법
- 수치형 변수: 평균 대체 후 StandardScaler로 정규화
- 범주형 변수: 최빈값 대체 후 OneHotEncoder 인코딩
- 전처리 후 feature 수: 57개 (train: (22200, 57), test: (9515, 57))

In [None]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem at /content/gdrive.
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import os
# Project folder on the mounted Drive; train.csv and test.csv are read from here.
root_dir="/content/gdrive/MyDrive/CSE/Colab Notebooks/2025 spring/project"
# Change this to the path where you saved the data.
# Last expression of the cell: the notebook displays whether the folder exists.
os.path.exists(root_dir)

True

In [None]:
import pandas as pd

# Resolve both CSV locations under the shared project folder.
data_path_train, data_path_test = (
    os.path.join(root_dir, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Load the training set and the test set.
train_df = pd.read_csv(data_path_train)
test_df = pd.read_csv(data_path_test)

# Show the (rows, columns) shape of each frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")

# Per-column dtype / non-null summary, then the class proportions of target `y`.
train_df.info()
print(train_df['y'].value_counts(normalize=True))

(22200, 49)
(9515, 47)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22200 entries, 0 to 22199
Data columns (total 49 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            22200 non-null  int64  
 1   n_tokens_title                20000 non-null  float64
 2   n_tokens_content              20054 non-null  float64
 3   n_unique_tokens               19942 non-null  float64
 4   n_non_stop_words              20017 non-null  float64
 5   n_non_stop_unique_tokens      19944 non-null  float64
 6   num_hrefs                     19967 non-null  float64
 7   num_self_hrefs                19920 non-null  float64
 8   num_imgs                      19986 non-null  float64
 9   num_videos                    19906 non-null  float64
 10  average_token_length          19987 non-null  float64
 11  num_keywords                  19955 non-null  float64
 12  kw_min_min                    19984 n

In [None]:
# Fraction of missing values per training column, largest first (top 46 rows).
(
    train_df.isna()
    .mean()
    .sort_values(ascending=False)
    .iloc[:46]
)

Unnamed: 0,0
num_videos,0.103333
avg_negative_polarity,0.102883
num_self_hrefs,0.102703
min_negative_polarity,0.102432
title_subjectivity,0.102342
abs_title_subjectivity,0.102072
n_unique_tokens,0.101712
n_non_stop_unique_tokens,0.101622
global_subjectivity,0.101532
global_rate_negative_words,0.101306


In [None]:
# Fraction of missing values per test column, largest first (top 10 rows).
(
    test_df.isna()
    .mean()
    .sort_values(ascending=False)
    .iloc[:10]
)

In [None]:
# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Split features from the identifier and target columns. The training frame
# drops `id`, `shares` and `y`; the test frame only has `id` to drop.
X_train = train_df.drop(columns=['id', 'shares', 'y'])
y_train = train_df['y']
X_test = test_df.drop(columns=['id'])

# Numeric inputs are the float64/int64 columns that remain after the drop;
# the two categorical inputs are listed explicitly.
numerical_cols = X_train.select_dtypes(include=['float64', 'int64']).columns.tolist()
categorical_cols = ['data_channel', 'weekday']

# Numeric branch: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# Fit on the training features only, then reuse the fitted statistics on test.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Last expression of the cell: the notebook displays both output shapes.
(X_train_processed.shape, X_test_processed.shape)