In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the training data from a CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer='./train.csv')
# Bare trailing expression: rendered as this cell's output.
df

In [None]:
# Explore the data.
# A notebook cell renders only its final expression automatically, so the
# intermediate results are passed to display() explicitly; otherwise the
# head() and describe() results would be computed and silently discarded.
display(df.head())
df.info()  # prints its summary itself, no display() needed
display(df.describe())
# Missing-value count per column (final expression, rendered as cell output).
df.isnull().sum()

In [None]:
# Handle missing values: replace NaNs with the mean of their column.
# numeric_only=True restricts the means to numeric columns; without it,
# pandas >= 2.0 raises TypeError from df.mean() as soon as the frame
# contains a non-numeric column. Columns without a computed mean are left
# unchanged by fillna.
df = df.fillna(df.mean(numeric_only=True))

In [None]:

# Define the independent variables (features) and the dependent variable (target).
# NOTE(review): every column except 'sales' is used as a feature; LinearRegression
# needs numeric input — confirm train.csv has no string/date columns.
X = df.drop(columns='sales')
y = df['sales']

# Split the data: 20% held out for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the model.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the held-out split.
y_pred = model.predict(X_test)

# Evaluate.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
# print() is required here: a bare f-string that is not the cell's last
# expression is evaluated and thrown away, so the metrics were never shown.
print(f'평균 제곱 오차(MSE): {mse}')
print(f'결정 계수(R²): {r2}')

# Visualization: scatter of predicted vs. actual values on the test split.
# NOTE(review): the labels are Korean; Matplotlib's default font may lack
# Hangul glyphs — confirm a Korean-capable font is configured.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred, alpha=0.7, color='b')
# Dashed red y = x reference line across the range of the actual values;
# points on this line are perfect predictions.
actual_min, actual_max = y_test.min(), y_test.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--')
ax.set_xlabel('실제 판매량')
ax.set_ylabel('예측 판매량')
ax.set_title('실제 판매량 vs 예측 판매량')
plt.show()