In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the training and test sets from the CSV files next to the notebook.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

# Last expression of the cell: renders the training frame.
train_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [27]:
# Quick overview of the training data.
# A notebook cell only renders the value of its LAST expression, so the
# intermediate tables are passed to display() explicitly; otherwise the
# results of head() and describe() are computed and thrown away.
# (display() is made available by the IPython/Jupyter kernel.)
display(train_df.head())

# info() prints its report directly, so it needs no display() call.
train_df.info()

display(train_df.describe())

# Missing values per column. isnull() is an alias of isna(), so a single
# call is enough.
train_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB


id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64

In [35]:
# Column roles in the raw data.
# NOTE(review): among the cells visible here only categorical_cols is
# referenced (by the encoder); the other names are kept for any cell that
# may rely on them.
id_col = 'id'
date_col = 'date'
categorical_cols = ['family']
numeric_cols = ['store_nbr', 'onpromotion']

In [39]:
# Convert the 'date' column of both frames to datetime.
# errors='coerce' turns values that cannot be parsed into NaT instead of
# raising.
for frame in (train_df, test_df):
    frame['date'] = pd.to_datetime(frame['date'], errors='coerce')

In [41]:
# Expand the parsed 'date' column into calendar features on both frames,
# then remove the original column.
date_parts = ('year', 'month', 'day', 'dayofweek')

for df in (train_df, test_df):
    for part in date_parts:
        # e.g. df['year'] = df['date'].dt.year
        df[part] = getattr(df['date'].dt, part)

    # Drop the date column (not needed for modelling).
    df.drop(columns='date', inplace=True)

In [57]:
# 5. 범주형 변수 인코딩

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns. drop='first' omits the first level
# of each encoded feature to avoid multicollinearity.
ohe_encoder = OneHotEncoder(drop='first')

column_transformer = ColumnTransformer(
    [('ohe', ohe_encoder, categorical_cols)],
    remainder='passthrough',  # every other column is passed through unchanged
)

In [59]:
# Separate the target from the features; 'id' is removed as well.
dropped_cols = ['sales', 'id']

X_train = train_df.drop(columns=dropped_cols)
y_train = train_df['sales']

# errors='ignore' drops only the labels that actually exist in test_df.
X_test = test_df.drop(columns=dropped_cols, errors='ignore')

In [63]:
from sklearn.pipeline import Pipeline

# Chain the column preprocessing and the linear regression into one estimator.
model_steps = [
    ('preprocessor', column_transformer),
    ('regressor', LinearRegression()),
]
pipeline = Pipeline(model_steps)

In [67]:
# Fit the preprocessing step and the regressor on the training data.
pipeline.fit(X_train, y=y_train)

The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [69]:
# Predict sales for the test rows.
# A plain linear regression is unbounded and can output negative values.
# The target is a sales amount, so negative predictions are clipped to zero
# before they are used any further.
y_pred = np.clip(pipeline.predict(X_test), 0, None)

In [73]:
# Pair every test id with its predicted sales value and write the result to
# CSV without the index column.
test_df_with_id = test_df[['id']].assign(sales=y_pred)
test_df_with_id.to_csv('test_with_predictions.csv', index=False)