In [13]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

In [14]:
#1. 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [15]:
train.shape, test.shape

((6818, 12), (1705, 11))

In [16]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            6818 non-null   object 
 1   Item_Weight                5656 non-null   float64
 2   Item_Fat_Content           6818 non-null   object 
 3   Item_Visibility            6818 non-null   float64
 4   Item_Type                  6818 non-null   object 
 5   Item_MRP                   6818 non-null   float64
 6   Outlet_Identifier          6818 non-null   object 
 7   Outlet_Establishment_Year  6818 non-null   int64  
 8   Outlet_Size                4878 non-null   object 
 9   Outlet_Location_Type       6818 non-null   object 
 10  Outlet_Type                6818 non-null   object 
 11  Item_Outlet_Sales          6818 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 639.3+ KB


In [17]:
x_train = train.iloc[:,:11]
y_train = train.iloc[:, -1]

In [18]:
x_test = test
x_full = pd.concat([x_train, x_test], axis=0)

In [19]:
x_full.shape

(8523, 11)

In [20]:
#2. 결측치 처리
x_full.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [21]:
x_full.describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,7060.0,8523.0,8523.0,8523.0
mean,12.857645,0.066132,140.992782,1997.831867
std,4.643456,0.051598,62.275067,8.37176
min,4.555,0.0,31.29,1985.0
25%,8.77375,0.026989,93.8265,1987.0
50%,12.6,0.053931,143.0128,1999.0
75%,16.85,0.094585,185.6437,2004.0
max,21.35,0.328391,266.8884,2009.0


In [22]:
x_full.describe(include="O")

Unnamed: 0,Item_Identifier,Item_Fat_Content,Item_Type,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type
count,8523,8523,8523,8523,6113,8523,8523
unique,1559,5,16,10,3,3,4
top,FDW13,Low Fat,Fruits and Vegetables,OUT027,Medium,Tier 3,Supermarket Type1
freq,10,5089,1232,935,2793,3350,5577


In [23]:
x_full['Item_Weight'] = x_full['Item_Weight'].fillna(x_full['Item_Weight'].median())
x_full['Outlet_Size']=x_full['Outlet_Size'].fillna(x_full['Outlet_Size'].mode()[0])

x_full.shape

(8523, 11)

In [24]:
cols = list(train.columns[train.dtypes==object])
le = LabelEncoder()

for col in cols:
    x_full[col] = le.fit_transform(x_full[col])

x_train = x_full.iloc[:6818, :]
x_test = x_full.iloc[6818:, :]

In [25]:
#3. 스케일링
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [26]:
#4. 검증 데이터 나누기
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.3, random_state=0)

In [27]:
#5. 모델 돌리기  / 랜덤 포러스트
rfr = RandomForestRegressor()
rfr_model = rfr.fit(x_train, y_train)
y_pred = rfr_model.predict(x_val)

In [28]:
#6. 평가하기
rmse = root_mean_squared_error(y_val, y_pred)
rmse

1120.1216259054004

In [29]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr_model = lr.fit(x_train, y_train)
y_pred = lr_model.predict(x_val)

rmse2 = root_mean_squared_error(y_val, y_pred)
rmse2

1160.6937004518502

In [30]:
y_test_pred = rfr_model.predict(x_test)

In [31]:
submit = pd.DataFrame({'pred': y_test_pred})
submit.to_csv('result2.csv', index=False)

In [32]:
pd.read_csv('result2.csv')

Unnamed: 0,pred
0,1180.383504
1,769.571588
2,2127.663770
3,1523.849750
4,2753.509112
...,...
1700,297.133224
1701,604.186868
1702,3505.423684
1703,1079.195220
