### 머신러닝(회귀)

In [51]:
import pandas as pd

train = pd.read_csv("train.csv")  #학습용
test = pd.read_csv("test.csv")    #평가용 

train.info()
train.shape, test.shape
# train.head()

# 통계 요약 - 'Item_Identifier' unique 수가 다름
print(train.describe(include='object'))
print(test.describe(include='object'))

# 결측치 확인 및 처리 
print(train.isnull().sum())
print(test.isnull().sum())

# Item_Identifier 삭제 
train = train.drop("Item_Identifier", axis=1)
test = test.drop("Item_Identifier", axis=1)

# 'Item_Weight' - median()
median = train["Item_Weight"].median()
median
train["Item_Weight"] = train["Item_Weight"].fillna(median)
test["Item_Weight"] = test["Item_Weight"].fillna(median)

# Outlet_Size - 최빈값 
freq = train["Outlet_Size"].mode()[0]
freq
train["Outlet_Size"] = train["Outlet_Size"].fillna(freq)
test["Outlet_Size"] = test["Outlet_Size"].fillna(freq)

# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

cols = train.select_dtypes(include="object").columns
cols

for col in cols:
    train[col] = encoder.fit_transform(train[col])
    test[col] = encoder.fit_transform(test[col])
    
# 데이터 분할
from sklearn.model_selection import train_test_split
# X = train.drop("Item_Outlet_Sales", axis=1)  #독립변수
# y = train["Item_Outlet_Sales"]  #종속변수
target = train.pop("Item_Outlet_Sales")

X_train, X_test, y_train, y_test = train_test_split(
              train, target, test_size=0.2, random_state=37)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# 머신러닝 학습 및 평가
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

def rmse(y_test, y_pred):
    return mean_squared_error(y_test, y_pred) ** 0.5

# 선형회귀
from sklearn.linear_model import LinearRegression

model = LinearRegression()  #모델 생성
model.fit(X_train, y_train) #학습 
pred = model.predict(X_test) #예측
print(pred)

# 성능평가 - 평균 제곱 오차 
result = mean_squared_error(y_test, pred)
print('MSE:', result)

# 평균 제곱근 오차 
result = rmse(y_test, pred)
print('RMSE:', result)

# 평균 절대값 오차 
result = mean_absolute_error(y_test, pred)
print('MAE:', result)

# 평가용 예측
pred2 = model.predict(test)
print(pred2)

# 데이터프레임 만들고 파일로 저장
result = pd.DataFrame({
    "pred": pred2
})

result.to_csv("result.csv", index=False)

# csv 파일 읽기
pd.read_csv("result.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            6818 non-null   object 
 1   Item_Weight                5656 non-null   float64
 2   Item_Fat_Content           6818 non-null   object 
 3   Item_Visibility            6818 non-null   float64
 4   Item_Type                  6818 non-null   object 
 5   Item_MRP                   6818 non-null   float64
 6   Outlet_Identifier          6818 non-null   object 
 7   Outlet_Establishment_Year  6818 non-null   int64  
 8   Outlet_Size                4878 non-null   object 
 9   Outlet_Location_Type       6818 non-null   object 
 10  Outlet_Type                6818 non-null   object 
 11  Item_Outlet_Sales          6818 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 639.3+ KB
       Item_Identifier Item_Fat_Content    It

Unnamed: 0,pred
0,963.350701
1,627.943228
2,2345.700763
3,1876.148528
4,2841.763390
...,...
1700,1020.221510
1701,2927.415403
1702,4183.742879
1703,450.255313


In [61]:
# 변수 선택
print(list(train.columns[train.dtypes == object]))

cols = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
cols

target = train.pop('Item_Outlet_Sales')
print(train.shape, test.shape)

# 훈련/검증 데이터 합치기
df = pd.concat([train, test])
print(df.shape)

# 레이블 인코딩
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for col in cols:
    df[col] = le.fit_transform(df[col])

# 훈련/검증 데이터로 분리 -깊은 복사
train = df.iloc[:len(train)].copy()
test = df.iloc[len(train):].copy()

print(train.shape, test.shape)
print(train.info())
print(test.info())

# 결측치 처리
train['Item_Weight'] = train['Item_Weight'].fillna(train['Item_Weight'].min())
train['Outlet_Size'] = train['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])

test['Item_Weight'] = test['Item_Weight'].fillna(train['Item_Weight'].min())
test['Outlet_Size'] = test['Outlet_Size'].fillna(train['Outlet_Size'].mode()[0])
print(train.shape, test.shape)

train.drop('Item_Identifier', axis=1, inplace=True)
test.drop('Item_Identifier', axis=1, inplace=True)
print(train.shape, test.shape)
print(train.info())
print(test.info())

# 검증 데이터 나누기
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    train,
    target,
    test_size=0.2,
    random_state=0)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

['Item_Identifier', 'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
(6818, 11) (1705, 11)
(8523, 11)
(6818, 11) (1705, 11)
<class 'pandas.core.frame.DataFrame'>
Index: 6818 entries, 0 to 6817
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            6818 non-null   object 
 1   Item_Weight                5656 non-null   float64
 2   Item_Fat_Content           6818 non-null   int32  
 3   Item_Visibility            6818 non-null   float64
 4   Item_Type                  6818 non-null   int32  
 5   Item_MRP                   6818 non-null   float64
 6   Outlet_Identifier          6818 non-null   int32  
 7   Outlet_Establishment_Year  6818 non-null   int64  
 8   Outlet_Size                6818 non-null   int32  
 9   Outlet_Location_Type       6818 non-null   int32  
 10  Outlet_Type                6818 non-nu

In [63]:
# 머신러닝 학습 및 평가
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

def rmse(y_val, y_pred):
    return mean_squared_error(y_val, y_pred) ** 0.5

# 선형회귀
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_val)

result = mean_squared_error(y_val, y_pred)
print('MSE:', result)

result = mean_absolute_error(y_val, y_pred)
print('MAE:', result)

result = r2_score(y_val, y_pred)
print('R2:', result)

result = rmse(y_val, y_pred)
print('RMSE:', result)

MSE: 1282923.0729833895
MAE: 865.1968401416271
R2: 0.5058168396924843
RMSE: 1132.6619411737067


In [75]:
# 모델 학습 및 평가 - 랜덤포레스트
from sklearn.ensemble import RandomForestRegressor
import numpy as np

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
print('MSE:', mse)

mae = mean_absolute_error(y_val, y_pred)
print('MAE:', mae)

r2 = r2_score(y_val, y_pred)
print('R2:', r2)

# rmse = rmse(y_val, y_pred)
# print('RMSE:', rmse)

rmse = np.sqrt(mse)
print('RMSE:', rmse)

# LightGBM
"""
import lightgbm as lgb
model = lgb.LGBMRegressor(random_state=0, verbose=-1)
model.fit(X_train, y_train)
y_pred = model.predict(X_val)

result = mean_squared_error(y_val, y_pred)
print('MSE:', result)

result = mean_absolute_error(y_val, y_pred)
print('MAE:', result)

result = r2_score(y_val, y_pred)
print('R2:', result)

result = rmse(y_val, y_pred)
print('RMSE:', result)

pred2 = model.predict(test)
print(pred2)
"""

# 테스트 데이터 평가(예측)
pred2 = rf.predict(test)
print(pred2)

# 파일 생성
submit = pd.DataFrame({'pred':pred2})
submit.to_csv("result.csv", index=False)

pd.read_csv("result.csv")

MSE: 1101802.8117346708
MAE: 747.2971172067448
R2: 0.5755845326933362
RMSE: 1049.6679530854844
[1536.79956   787.135392 2192.499374 ... 4095.70199   967.493954
 2001.12848 ]


Unnamed: 0,pred
0,1536.799560
1,787.135392
2,2192.499374
3,1700.612992
4,2746.924350
...,...
1700,239.095438
1701,854.194768
1702,4095.701990
1703,967.493954
