#Import

In [35]:
import pandas as pd
import numpy as np

from google.colab import files
import os

from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

train = pd.read_csv("./train.csv")
train = train.drop(columns=["id", "shares"])
test = pd.read_csv("./test.csv")
test = test.drop(columns=["id"])

#Missing value

In [36]:
#train filled 3 variables
df = pd.read_csv('train.csv')
# 대표값 계산
median_avg_token_length = df.loc[
    (df['average_token_length'].notnull()) & (df['average_token_length'] != 0),
    'average_token_length'
].median()

mean_unique_tokens = df.loc[
    (df['n_non_stop_unique_tokens'].notnull()) & (df['n_non_stop_unique_tokens'] != 0),
    'n_non_stop_unique_tokens'
].mean()

### Step 1. 세 변수 중 하나라도 0이면 나머지도 0
zero_mask = (
    (df['n_non_stop_words'] == 0) |
    (df['n_non_stop_unique_tokens'] == 0) |
    (df['average_token_length'] == 0)
)
df.loc[zero_mask, ['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']] = 0

### Step 2. n_non_stop_words 와 average_token_length 관계

# 2-1. average_token_length만 결측이고, n_non_stop_words가 0이 아닌 경우
mask_atl_null = df['average_token_length'].isnull() & (df['n_non_stop_words'] != 0)
df.loc[mask_atl_null, 'average_token_length'] = median_avg_token_length

# 2-2. n_non_stop_words만 결측이고, average_token_length가 0이 아닌 경우
mask_nsw_null = df['n_non_stop_words'].isnull() & (df['average_token_length'] != 0)
df.loc[mask_nsw_null, 'n_non_stop_words'] = 1

# 2-3. 둘 다 결측인 경우
both_null = df['n_non_stop_words'].isnull() & df['average_token_length'].isnull()
df.loc[both_null, 'n_non_stop_words'] = 1
df.loc[both_null, 'average_token_length'] = median_avg_token_length

### Step 3. n_non_stop_words 와 n_non_stop_unique_tokens 관계

# 3-1. n_non_stop_words == 0이면 나머지도 0
mask_nsw_zero = df['n_non_stop_words'] == 0
df.loc[mask_nsw_zero, ['n_non_stop_unique_tokens', 'average_token_length']] = 0

# 3-2. n_non_stop_unique_tokens만 결측이고, n_non_stop_words가 0이 아닌 경우
mask_unt_null = df['n_non_stop_unique_tokens'].isnull() & (df['n_non_stop_words'] != 0)
df.loc[mask_unt_null, 'n_non_stop_unique_tokens'] = mean_unique_tokens

### 결과 확인
print(df[['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']].isnull().sum())

df.to_csv('train_filled_3_variables.csv', index=False)

print(df[['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']].isnull().sum())

n_non_stop_words            0
n_non_stop_unique_tokens    0
average_token_length        0
dtype: int64
n_non_stop_words            0
n_non_stop_unique_tokens    0
average_token_length        0
dtype: int64


In [37]:
#test filled 3 variables

df = pd.read_csv("test.csv")

# 대표값 계산
median_avg_token_length = df.loc[
    (df['average_token_length'].notnull()) & (df['average_token_length'] != 0),
    'average_token_length'
].median()

mean_unique_tokens = df.loc[
    (df['n_non_stop_unique_tokens'].notnull()) & (df['n_non_stop_unique_tokens'] != 0),
    'n_non_stop_unique_tokens'
].mean()

### Step 1. 세 변수 중 하나라도 0이면 나머지도 0
zero_mask = (
    (df['n_non_stop_words'] == 0) |
    (df['n_non_stop_unique_tokens'] == 0) |
    (df['average_token_length'] == 0)
)
df.loc[zero_mask, ['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']] = 0

### Step 2. n_non_stop_words 와 average_token_length 관계

# 2-1. average_token_length만 결측이고, n_non_stop_words가 0이 아닌 경우
mask_atl_null = df['average_token_length'].isnull() & (df['n_non_stop_words'] != 0)
df.loc[mask_atl_null, 'average_token_length'] = median_avg_token_length

# 2-2. n_non_stop_words만 결측이고, average_token_length가 0이 아닌 경우
mask_nsw_null = df['n_non_stop_words'].isnull() & (df['average_token_length'] != 0)
df.loc[mask_nsw_null, 'n_non_stop_words'] = 1

# 2-3. 둘 다 결측인 경우
both_null = df['n_non_stop_words'].isnull() & df['average_token_length'].isnull()
df.loc[both_null, 'n_non_stop_words'] = 1
df.loc[both_null, 'average_token_length'] = median_avg_token_length

### Step 3. n_non_stop_words 와 n_non_stop_unique_tokens 관계

# 3-1. n_non_stop_words == 0이면 나머지도 0
mask_nsw_zero = df['n_non_stop_words'] == 0
df.loc[mask_nsw_zero, ['n_non_stop_unique_tokens', 'average_token_length']] = 0

# 3-2. n_non_stop_unique_tokens만 결측이고, n_non_stop_words가 0이 아닌 경우
mask_unt_null = df['n_non_stop_unique_tokens'].isnull() & (df['n_non_stop_words'] != 0)
df.loc[mask_unt_null, 'n_non_stop_unique_tokens'] = mean_unique_tokens

### 결과 확인
print(df[['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']].isnull().sum())

df.to_csv('test_filled_3_variables.csv', index=False)

print(df[['n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']].isnull().sum())

n_non_stop_words            0
n_non_stop_unique_tokens    0
average_token_length        0
dtype: int64
n_non_stop_words            0
n_non_stop_unique_tokens    0
average_token_length        0
dtype: int64


In [38]:
train = pd.read_csv("./train_filled_3_variables.csv")
test = pd.read_csv("./test_filled_3_variables.csv")

In [39]:
strategy_df = pd.read_csv("./strategy_df.csv")

strategies = strategy_df.iloc[:, 7]
columns = strategy_df.iloc[:, 1]

strategy_map = dict(zip(columns, strategies))

#Central tendency
for col, strategy in strategy_map.items():
    if col not in train.columns:
        continue
    if strategy == "mean":
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(test[col].mean(), inplace=True)
    elif strategy == "median":
        train[col].fillna(train[col].median(), inplace=True)
        test[col].fillna(test[col].median(), inplace=True)
    elif strategy == "mode":
        train[col].fillna(train[col].mode().iloc[0], inplace=True)
        test[col].fillna(test[col].mode().iloc[0], inplace=True)
    else:
        print(f"⚠️ 알 수 없는 전략: {col} → {strategy}")

print("결측치 남은 개수:", train.isnull().sum().sum())
train.to_csv("train_clean.csv", index = False)
print("결측치 남은 개수:", test.isnull().sum().sum())
test.to_csv("test_clean.csv", index = False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

⚠️ 알 수 없는 전략: n_unique_tokens → linear regression with 'n_non_stop_unique_tokens' or median
⚠️ 알 수 없는 전략: n_non_stop_words → linear regression with 'n_unique_tokens' or median
⚠️ 알 수 없는 전략: kw_max_min → linear regression with 'kw_avg_min' or median
⚠️ 알 수 없는 전략: kw_avg_min → linear regression with 'kw_max_min' or median
⚠️ 알 수 없는 전략: kw_max_avg → polynomial regression with 'kw_avg_avg' or median
⚠️ 알 수 없는 전략: kw_avg_avg → polynomial regression with 'kw_max_avg' or median
⚠️ 알 수 없는 전략: self_reference_max_shares → linear regression with 'self_reference_avg_sharess' or median
⚠️ 알 수 없는 전략: self_reference_avg_sharess → linear regression with 'self_reference_max_shares' or median
⚠️ 알 수 없는 전략: global_sentiment_polarity → linear regression with 'rate_positive_words' or mean
⚠️ 알 수 없는 전략: global_rate_negative_words → linear regression with 'rate_negative_words' or median
⚠️ 알 수 없는 전략: rate_positive_words → linear regression with 'global_sentiment_polarity' or median
⚠️ 알 수 없는 전략: rate_negativ

In [40]:
#Linear Regression

col_avg = ['n_unique_tokens', 'n_unique_tokens',"kw_avg_min", "self_reference_avg_sharess", "global_sentiment_polarity",
           "global_rate_negative_words", "avg_positive_polarity", "avg_negative_polarity", "title_subjectivity"]
col_max = ['n_non_stop_words','n_non_stop_unique_tokens', "kw_max_min", "self_reference_max_shares", "rate_positive_words",
           "rate_negative_words", "max_positive_polarity", "min_negative_polarity", "abs_title_sentiment_polarity"]

for i in range(len(col_avg)):
  var_avg = col_avg[i]
  var_max = col_max[i]

# 1) 둘 다 결측치 없는 행으로 regression 학습용 데이터 준비
  mask_both_train = train[var_avg].notna() & train[var_max].notna()
  train_reg = train.loc[mask_both_train, [var_max, var_avg]]

# 모델1: var_max → var_avg
  model_avg = LinearRegression()
  model_avg.fit(train_reg[[var_max]], train_reg[var_avg])

# 모델2: var_avg → var_max
  model_max = LinearRegression()
  model_max.fit(train_reg[[var_avg]], train_reg[var_max])

# 평균 토큰 결측치(var_avg)만 있는 경우
  mask_avg_miss_train = train[var_avg].isna() & train[var_max].notna()
  if mask_avg_miss_train.any():
    X_pred_avg = train.loc[mask_avg_miss_train, [var_max]]
    train.loc[mask_avg_miss_train, var_avg] = model_avg.predict(X_pred_avg)

# 비중지속 토큰 결측치(var_max)만 있는 경우
  mask_max_miss_train = train[var_max].isna() & train[var_avg].notna()
  if mask_max_miss_train.any():
    X_pred_max = train.loc[mask_max_miss_train, [var_avg]]
    train.loc[mask_max_miss_train, var_max] = model_max.predict(X_pred_max)

# test에도 동일하게 적용
  mask_both_test = test[var_avg].notna() & test[var_max].notna()

# 학습은 train 데이터만 사용
  mask_avg_miss_test = test[var_avg].isna() & test[var_max].notna()
  if mask_avg_miss_test.any():
    Xt_pred_avg = test.loc[mask_avg_miss_test, [var_max]]
    test.loc[mask_avg_miss_test, var_avg] = model_avg.predict(Xt_pred_avg)

  mask_max_miss_test = test[var_max].isna() & test[var_avg].notna()
  if mask_max_miss_test.any():
    Xt_pred_max = test.loc[mask_max_miss_test, [var_avg]]
    test.loc[mask_max_miss_test, var_max] = model_max.predict(Xt_pred_max)


In [41]:
#Polynomial Regression

# 둘 다 값이 있는 행을 골라 학습용 데이터로 선별
mask_train = train["kw_avg_avg"].notna() & train["kw_max_avg"].notna()
df_reg_train = train.loc[mask_train, ["kw_avg_avg", "kw_max_avg"]]

X_train = df_reg_train[["kw_avg_avg"]]
y_train = df_reg_train["kw_max_avg"]

poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train)

model = LinearRegression()
model.fit(X_train_poly, y_train)

# 하나만 결측치인 경우에 kw_max_avg 채우기
mask_fill_train = train["kw_avg_avg"].notna() & train["kw_max_avg"].isna()
X_pred_train = poly.transform(train.loc[mask_fill_train, ["kw_avg_avg"]])
train.loc[mask_fill_train, "kw_max_avg"] = model.predict(X_pred_train)

mask_fill_test = test["kw_avg_avg"].notna() & test["kw_max_avg"].isna()
X_pred_test = poly.transform(test.loc[mask_fill_test, ["kw_avg_avg"]])
test.loc[mask_fill_test, "kw_max_avg"] = model.predict(X_pred_test)

mask_both    = train["kw_avg_avg"].notna() & train["kw_max_avg"].notna()
df_both_rev  = train.loc[mask_both, ["kw_avg_avg", "kw_max_avg"]]

# 설명변수 바꿔치기
X_rev_train  = df_both_rev[["kw_max_avg"]]
y_rev_train  = df_both_rev["kw_avg_avg"]

poly_rev     = PolynomialFeatures(degree=2, include_bias=False)
X_rev_poly   = poly_rev.fit_transform(X_rev_train)

model_rev    = LinearRegression()
model_rev.fit(X_rev_poly, y_rev_train)

mask_fill_avg_train = train["kw_max_avg"].notna() & train["kw_avg_avg"].isna()
X_pred_rev_train    = poly_rev.transform(train.loc[mask_fill_avg_train, ["kw_max_avg"]])
train.loc[mask_fill_avg_train, "kw_avg_avg"] = model_rev.predict(X_pred_rev_train)

mask_fill_avg_test  = test["kw_max_avg"].notna() & test["kw_avg_avg"].isna()
X_pred_rev_test     = poly_rev.transform(test.loc[mask_fill_avg_test, ["kw_max_avg"]])
test.loc[mask_fill_avg_test,  "kw_avg_avg"] = model_rev.predict(X_pred_rev_test)


In [42]:
strategy1 = pd.read_csv("./strategy_df(1).csv")

strategies = strategy1.iloc[:, 7]
columns = strategy1.iloc[:, 1]

strategy_map = dict(zip(columns, strategies))

for col, strategy in strategy_map.items():
    if col not in train.columns:
        continue
    if strategy == "mean":
        train[col].fillna(train[col].mean(), inplace=True)
        test[col].fillna(test[col].mean(), inplace=True)
    elif strategy == "median":
        train[col].fillna(train[col].median(), inplace=True)
        test[col].fillna(test[col].median(), inplace=True)
    elif strategy == "mode":
        train[col].fillna(train[col].mode().iloc[0], inplace=True)
        test[col].fillna(test[col].mode().iloc[0], inplace=True)
    else:
        print(f"⚠️ 알 수 없는 전략: {col} → {strategy}")

print("결측치 남은 개수:", train.isnull().sum().sum())
print("결측치 남은 개수:", test.isnull().sum().sum())


결측치 남은 개수: 0
결측치 남은 개수: 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(test[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [43]:
train.to_csv('train_clean_reg.csv', index = False)
test.to_csv('test_clean_reg.csv', index = False)

#Outlier

In [44]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor
import os
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
# ==이상치 처리==

# 1. 데이터 불러오기
df = pd.read_csv('train_clean_reg.csv')

# 기본 컬럼 분리
id_col = df[['id']]
shares_col = df[['shares']]
y_col = df[['y']]
cat_cols = df.select_dtypes(include='object').columns
df_cat = df[cat_cols]

# 수치형 변수 추출
numeric_cols = df.select_dtypes(include=np.number).columns
exclude_cols = ['id', 'shares', 'y']
target_cols = [col for col in numeric_cols if col not in exclude_cols]

# DBSCAN 적용할 변수쌍 정의
dbscan_replaced_cols = [
    'kw_max_min', 'kw_avg_min',
    'kw_avg_avg', 'kw_max_avg',
    'global_rate_negative_words', 'rate_negative_words',
    'abs_title_sentiment_polarity', 'title_subjectivity'
]
other_cols = [col for col in target_cols if col not in dbscan_replaced_cols]

# 2. 클리핑 적용 (DBSCAN 대상이 아닌 변수 전부 상하한 클리핑)
df_clipped = df.copy()
for col in other_cols:
    lower = df[col].quantile(0.05)
    upper = df[col].quantile(0.95)
    df_clipped[col] = np.clip(df[col], lower, upper)

# 3. DBSCAN 이상치 대체
df_num = df_clipped[target_cols].copy()  # 클리핑된 값에서 시작

# 변수쌍별 DBSCAN 파라미터
feature_pairs = [
    (['kw_max_min', 'kw_avg_min'], {'eps': 2000, 'min_samples': 20}),
    (['kw_avg_avg', 'kw_max_avg'], {'eps': 2500, 'min_samples': 8}),
    (['global_rate_negative_words', 'rate_negative_words'], {'eps': 0.015, 'min_samples': 40}),
    (['abs_title_sentiment_polarity', 'title_subjectivity'], {'eps': 0.07, 'min_samples': 70}),
]

for features, params in feature_pairs:
    var1, var2 = features
    df_pair = df_num[features].dropna()
    X = df_pair.values

    # DBSCAN
    dbscan = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    labels = dbscan.fit_predict(X)
    outlier_mask = labels == -1
    normal_mask = labels != -1

    print(f"{features}: 이상치 개수 = {np.sum(outlier_mask)}")

    # 가장 가까운 정상치로 대체
    X_replaced = X.copy()
    if np.sum(outlier_mask) > 0:
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(X[normal_mask])
        distances, indices = nn.kneighbors(X[outlier_mask])
        X_replaced[outlier_mask] = X[normal_mask][indices.flatten()]

    # 반영
    df_num.loc[df_pair.index, var1] = X_replaced[:, 0]
    df_num.loc[df_pair.index, var2] = X_replaced[:, 1]

# DBSCAN 처리된 변수만 df_clipped에 덮어쓰기
for features, _ in feature_pairs:
    var1, var2 = features
    df_clipped[var1] = df_num[var1]
    df_clipped[var2] = df_num[var2]

# 4. 최종 결합
df_final = pd.concat([id_col, df_clipped[target_cols], df_cat, shares_col, y_col], axis=1)

# 5. 저장
df_final.to_csv("train_outlier.csv", index=False)

['kw_max_min', 'kw_avg_min']: 이상치 개수 = 46
['kw_avg_avg', 'kw_max_avg']: 이상치 개수 = 59
['global_rate_negative_words', 'rate_negative_words']: 이상치 개수 = 205
['abs_title_sentiment_polarity', 'title_subjectivity']: 이상치 개수 = 254


In [45]:

# 1. 데이터 불러오기
df = pd.read_csv('test_clean_reg.csv')

# 기본 컬럼 분리
id_col = df[['id']]
cat_cols = df.select_dtypes(include='object').columns
df_cat = df[cat_cols]

# 수치형 변수 추출
numeric_cols = df.select_dtypes(include=np.number).columns
exclude_cols = ['id']
target_cols = [col for col in numeric_cols if col not in exclude_cols]

# DBSCAN 적용할 변수쌍 정의
dbscan_replaced_cols = [
    'kw_max_min', 'kw_avg_min',
    'kw_avg_avg', 'kw_max_avg',
    'global_rate_negative_words', 'rate_negative_words',
    'abs_title_sentiment_polarity', 'title_subjectivity'
]
other_cols = [col for col in target_cols if col not in dbscan_replaced_cols]

# 2. 클리핑 적용 (DBSCAN 대상이 아닌 변수 전부 상하한 클리핑)
df_clipped = df.copy()
for col in other_cols:
    lower = df[col].quantile(0.05)
    upper = df[col].quantile(0.95)
    df_clipped[col] = np.clip(df[col], lower, upper)

# 3. DBSCAN 이상치 대체
df_num = df_clipped[target_cols].copy()  # 클리핑된 값에서 시작

# 변수쌍별 DBSCAN 파라미터
feature_pairs = [
    (['kw_max_min', 'kw_avg_min'], {'eps': 2000, 'min_samples': 20}),
    (['kw_avg_avg', 'kw_max_avg'], {'eps': 2500, 'min_samples': 8}),
    (['global_rate_negative_words', 'rate_negative_words'], {'eps': 0.015, 'min_samples': 40}),
    (['abs_title_sentiment_polarity', 'title_subjectivity'], {'eps': 0.07, 'min_samples': 70}),
]

for features, params in feature_pairs:
    var1, var2 = features
    df_pair = df_num[features].dropna()
    X = df_pair.values

    # DBSCAN
    dbscan = DBSCAN(eps=params['eps'], min_samples=params['min_samples'])
    labels = dbscan.fit_predict(X)
    outlier_mask = labels == -1
    normal_mask = labels != -1

    print(f"{features}: 이상치 개수 = {np.sum(outlier_mask)}")

    # 가장 가까운 정상치로 대체
    X_replaced = X.copy()
    if np.sum(outlier_mask) > 0:
        nn = NearestNeighbors(n_neighbors=1)
        nn.fit(X[normal_mask])
        distances, indices = nn.kneighbors(X[outlier_mask])
        X_replaced[outlier_mask] = X[normal_mask][indices.flatten()]

    # 반영
    df_num.loc[df_pair.index, var1] = X_replaced[:, 0]
    df_num.loc[df_pair.index, var2] = X_replaced[:, 1]

# DBSCAN 처리된 변수만 df_clipped에 덮어쓰기
for features, _ in feature_pairs:
    var1, var2 = features
    df_clipped[var1] = df_num[var1]
    df_clipped[var2] = df_num[var2]

# 4. 최종 결합
df_final = pd.concat([id_col, df_clipped[target_cols], df_cat], axis=1)

# 5. 저장
df_final.to_csv("test_outlier.csv", index=False)

['kw_max_min', 'kw_avg_min']: 이상치 개수 = 50
['kw_avg_avg', 'kw_max_avg']: 이상치 개수 = 41
['global_rate_negative_words', 'rate_negative_words']: 이상치 개수 = 151
['abs_title_sentiment_polarity', 'title_subjectivity']: 이상치 개수 = 546


#Encoding

In [46]:
from sklearn.preprocessing import OneHotEncoder

train = pd.read_csv("./train_outlier.csv")
test = pd.read_csv("./test_outlier.csv")
categorical_cols = ["data_channel", "weekday"]

#One-Hot Encoder 설정
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

train_ohe = pd.DataFrame(
    ohe.fit_transform(train[categorical_cols]),
    columns=ohe.get_feature_names_out(categorical_cols),
    index=train.index)

test_ohe = pd.DataFrame(
    ohe.transform(test[categorical_cols]), #train set과 동일하게 정리
    columns=ohe.get_feature_names_out(categorical_cols),
    index=test.index)

#원본에서 범주형 컬럼 제거하고, 인코딩된 컬럼 붙이기
train  = pd.concat([train.drop(columns=categorical_cols), train_ohe ], axis=1)
test  = pd.concat([test.drop(columns=categorical_cols), test_ohe ], axis=1)

train.to_csv("./train_encoded.csv", index =False)
test.to_csv("./test_encoded.csv",  index=False)

#Variable Selection

In [47]:
from sklearn.feature_selection import VarianceThreshold
from statsmodels.stats.outliers_influence import variance_inflation_factor

def drop_outliers(df: pd.DataFrame, drop_path: str) -> pd.DataFrame:
    if os.path.exists(drop_path):
        rows_to_drop = pd.read_csv(drop_path, header=None).squeeze().tolist()
        df = df.drop(index=rows_to_drop)
        print(f"✅ Dropped {len(rows_to_drop)} rows from train")
    else:
        print("⚠️ rows_to_drop.csv not found.")
    return df

def select_features(df: pd.DataFrame) -> list:
    dummy_cols = [col for col in df.columns if col.startswith('data_channel_') or col.startswith('weekday_')]
    numeric_cols = [col for col in df.columns if col not in dummy_cols and col not in {'id', 'shares', 'y'}]

    df_numeric = df[numeric_cols]

    # 1. 분산 기준 완화
    vt = VarianceThreshold(threshold=0.0005)  # 기존 0.001에서 완화
    vt.fit(df_numeric)
    low_var = df_numeric.columns[~vt.get_support()]

    # 2. 상관계수 기준 완화
    corr = df_numeric.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    high_corr = [c for c in upper.columns if any(upper[c] > 0.98)]  # 기존 0.95에서 완화

    # 3. VIF 기준 완화
    vif_vals = [variance_inflation_factor(df_numeric.values, i) for i in range(df_numeric.shape[1])]
    vif_drop = df_numeric.columns[np.array(vif_vals) > 20]  # 기존 15에서 완화

    # 4. 사용자 정의 변수 보호
    protected_vars = {'kw_avg_min', 'global_sentiment_polarity'}
    drop_cols = set(low_var) | set(high_corr) | set(vif_drop) - protected_vars

    # 5. 최종 변수 리스트
    final_numeric = [c for c in numeric_cols if c not in drop_cols]
    final_cols = final_numeric + dummy_cols
    return final_cols


if __name__ == "__main__":
    train_raw = pd.read_csv("./train_encoded.csv")
    test = pd.read_csv("./test_encoded.csv")

    # 1. 이상치 제거
    train_cleaned = drop_outliers(train_raw, "./rows_to_drop.csv")

    # 2. y 분리
    y = train_cleaned['y']
    train_features_only = train_cleaned.drop(columns=['y'])

    # 3. 변수 선택
    selected_cols = select_features(train_cleaned)

    # 4. train: 선택된 컬럼 + y 병합
    X_train_selected = train_features_only[selected_cols]
    train_selected = pd.concat([X_train_selected, y], axis=1)
    train_selected.to_csv("./train_selected.csv", index=False)

    # 5. test: 선택된 컬럼만 사용
    test_selected = test[[col for col in selected_cols if col in test.columns]]
    test_selected.to_csv("./test_selected.csv", index=False)

✅ Dropped 299 rows from train


#Standardization

In [48]:
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("./test_selected.csv")

# 뒤쪽 13개 열 떼어내기 (범주형 변수 인코딩한 열)
df_tail = df.iloc[:, -13:]

# 앞쪽 수치형 변수만 선택
df_numeric = df.iloc[:, :-13]

# 왜도 확인
skewness = df_numeric.skew()

# 왜도 절대값 > 1인 열들에 로그 변환 적용
high_skew_cols = skewness[skewness.abs() > 1.0].index

for col in high_skew_cols:
    min_val = df_numeric[col].min()

    # 모든 값이 0보다 크면 그대로 로그 변환
    if min_val >= 0:
        df_numeric[col] = np.log1p(df_numeric[col])  # log(1 + x) 안정적
    else:
        # 음수나 0이 있는 경우: 값을 양수로 이동 후 로그 변환
        shift = 1 - min_val
        df_numeric[col] = np.log1p(df_numeric[col] + shift)


# 수치형 전체에 StandardScaler 적용
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_numeric)

# 최종 버전 구성
df_scaled = pd.DataFrame(scaled_data, columns=df_numeric.columns)
df_final = pd.concat([df_scaled, df_tail], axis=1)

# 저장 및 확인
df_final.to_csv("test_final.csv", index=False)

#Prediction

In [49]:
!pip install catboost

import pandas as pd
import numpy as np
from google.colab import files
from scipy.stats import uniform, randint
from sklearn.ensemble import StackingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

import joblib
model_path = './stacking_model.pkl'
model = joblib.load(model_path)

test = pd.read_csv("./test_final.csv")
test_raw = pd.read_csv("./test.csv")

# ID 처리
test_id = test_raw["id"]  # ✅ test.csv 기준으로 id 사용

y_prob = model.predict_proba(test)[:, 1]
y_pred = (y_prob >= 0.44).astype(int)  # ✅ Best Threshold = 0.44

# 4. 결과 저장
pd.DataFrame({
    "id": test_id,
    "y_prob": y_prob,
    "y_predict": y_pred
}).to_csv("./prediction.csv", index=False)
files.download("./prediction.csv")



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>