사용할 패키지 불러오기

In [68]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics

pandas DataFrame으로 데이터 불러오기

In [69]:
# Load the Big Mart sales training data from a CSV file in the working
# directory into a pandas DataFrame.
big_mart_data = pd.read_csv('big_mart_sales_train.csv')

In [70]:

# Preview the first five rows of the raw data.
big_mart_data.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


In [73]:
# Show the number of rows and the number of columns (features).
big_mart_data.shape

(8523, 12)

In [76]:
# Show dataset information: column names, non-null counts and dtypes.
big_mart_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 799.2+ KB


In [77]:
# Categorical Features:
# Item_Identifier           # product ID
# Item_Fat_Content          # fat content of the product (e.g. low fat)
# Item_Type                 # category the product belongs to
# Outlet_Identifier         # store ID
# Outlet_Size               # store size
# Outlet_Location_Type      # type of city where the store is located
# Outlet_Type               # type of store (grocery store or supermarket)

In [78]:
# Count the missing (null) values in each column.
big_mart_data.isnull().sum()

Item_Identifier                 0
Item_Weight                  1463
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  2410
Outlet_Location_Type            0
Outlet_Type                     0
Item_Outlet_Sales               0
dtype: int64

1. null 값 처리하기
2. null 값을 평균값(수치형 컬럼) 또는 최빈값(범주형 컬럼)으로 대체하기

In [79]:
# Compute the mean of Item_Weight (missing values are skipped by mean()).
big_mart_data['Item_Weight'].mean()

12.857645184136183

In [80]:
# Fill the missing Item_Weight values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `big_mart_data['Item_Weight']`: that in-place
# call operates on a selected column (chained assignment), which pandas 2.x
# warns about and which does not update the parent DataFrame when
# Copy-on-Write is enabled.
item_weight_mean = big_mart_data['Item_Weight'].mean()
big_mart_data['Item_Weight'] = big_mart_data['Item_Weight'].fillna(item_weight_mean)

In [82]:
# Outlet_Size is categorical (High / Medium / Small), so look at its mode
# (most frequent value) as the candidate for filling the NaN entries.
big_mart_data['Outlet_Size'].mode()

0    Medium
dtype: object

In [None]:
# Build a lookup of the most frequent Outlet_Size for each Outlet_Type.
# The pivot has one column per Outlet_Type; it is used later to fill the
# missing Outlet_Size values.
mode_of_Outlet_size = big_mart_data.pivot_table(
    values='Outlet_Size',
    columns='Outlet_Type',
    aggfunc=lambda sizes: sizes.mode()[0],
)

In [None]:
# Show the most frequent Outlet_Size found for each Outlet_Type.
print(mode_of_Outlet_size)

In [None]:
# Boolean mask: True for the rows whose Outlet_Size is missing.
miss_values = big_mart_data['Outlet_Size'].isna()

In [None]:
# Show the boolean mask of rows with a missing Outlet_Size.
print(miss_values)

In [None]:
# Fill each missing Outlet_Size with the most frequent size for that row's
# Outlet_Type. `mode_of_Outlet_size` is a one-row pivot table whose columns are
# the Outlet_Type values, so its single row is a Series usable as a lookup
# table. Series.map yields one scalar per row, whereas apply() with a
# DataFrame column lookup returned a one-element Series per row and relied on
# incidental alignment during assignment.
outlet_size_by_type = mode_of_Outlet_size.iloc[0]
big_mart_data.loc[miss_values, 'Outlet_Size'] = (
    big_mart_data.loc[miss_values, 'Outlet_Type'].map(outlet_size_by_type)
)

In [None]:
# Re-count the nulls per column to confirm the imputation left none behind.
big_mart_data.isna().sum()

Data Analysis

In [None]:
# Summary statistics for the DataFrame's columns.
big_mart_data.describe()

Numerical Features

In [None]:
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()

In [None]:
# Item_Weight distribution
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' draws the equivalent
# histogram with an overlaid density curve.
plt.figure(figsize=(6,6))
sns.histplot(big_mart_data['Item_Weight'], kde=True, stat='density')
plt.show()

In [None]:
# Item_Visibility distribution
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' draws the equivalent
# histogram with an overlaid density curve.
plt.figure(figsize=(6,6))
sns.histplot(big_mart_data['Item_Visibility'], kde=True, stat='density')
plt.show()

In [None]:
# Item_MRP distribution
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' draws the equivalent
# histogram with an overlaid density curve.
plt.figure(figsize=(6,6))
sns.histplot(big_mart_data['Item_MRP'], kde=True, stat='density')
plt.show()

In [None]:
# Item_Outlet_Sales (target) distribution
# sns.distplot is deprecated since seaborn 0.11 and scheduled for removal;
# histplot with kde=True and stat='density' draws the equivalent
# histogram with an overlaid density curve.
plt.figure(figsize=(6,6))
sns.histplot(big_mart_data['Item_Outlet_Sales'], kde=True, stat='density')
plt.show()

In [None]:
# Number of rows per Outlet_Establishment_Year.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=big_mart_data, x='Outlet_Establishment_Year', ax=ax)
plt.show()

Categorical Features

In [None]:
# Number of rows per Item_Fat_Content label.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=big_mart_data, x='Item_Fat_Content', ax=ax)
plt.show()

In [None]:
# Number of rows per Item_Type; the wide figure leaves room for the many
# category labels on the x axis.
fig, ax = plt.subplots(figsize=(30, 6))
sns.countplot(data=big_mart_data, x='Item_Type', ax=ax)
plt.show()

In [None]:
# Number of rows per Outlet_Size.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=big_mart_data, x='Outlet_Size', ax=ax)
plt.show()

Data Pre-Processing

In [None]:
# Preview the first rows of the current DataFrame.
big_mart_data.head()

In [None]:
# Count the occurrences of each Item_Fat_Content label to spot inconsistent spellings.
big_mart_data['Item_Fat_Content'].value_counts()

In [None]:
# Collapse the alternative spellings of Item_Fat_Content onto the two
# canonical labels, 'Low Fat' and 'Regular'.
fat_content_aliases = {'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'}
big_mart_data['Item_Fat_Content'] = big_mart_data['Item_Fat_Content'].replace(fat_content_aliases)

In [None]:
# Re-count the Item_Fat_Content labels to verify the replacement.
big_mart_data['Item_Fat_Content'].value_counts()

Label Encoding

In [None]:
# One LabelEncoder instance, reused (re-fitted) for every categorical column below.
encoder = LabelEncoder()

In [None]:
# Replace every categorical column with integer codes. The shared encoder is
# re-fitted on each column in turn, so after the loop it only holds the
# classes of the last column processed.
categorical_columns = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

for column in categorical_columns:
    big_mart_data[column] = encoder.fit_transform(big_mart_data[column])

In [None]:
# Preview the first rows after label encoding.
big_mart_data.head()

Splitting features and Target

In [None]:
# Target: the sales figure the model has to predict.
Y = big_mart_data['Item_Outlet_Sales']
# Features: every column except the target.
X = big_mart_data.drop(columns=['Item_Outlet_Sales'])

In [None]:
# Show the feature matrix.
print(X)

In [None]:
# Show the target values.
print(Y)

Splitting the data into Training data & Testing Data

In [None]:
# Hold out 20% of the rows as a test set; random_state=2 fixes the shuffle
# so the split is reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [None]:
# Compare the shapes of the full, training and test feature matrices.
print(X.shape, X_train.shape, X_test.shape)

Machine Learning Model Training

XGBoost Regressor

In [None]:
# XGBoost regressor created with the library's default hyperparameters.
regressor = XGBRegressor()

In [None]:
# Train the model on the training split only.
regressor.fit(X_train, Y_train)

Evaluation

In [None]:
# Predict sales for the training rows (the data the model was fitted on),
# to measure the in-sample fit.
training_data_prediction = regressor.predict(X_train)

In [None]:
# R squared (coefficient of determination) on the training data.
r2_train = metrics.r2_score(Y_train, training_data_prediction)

In [None]:
# Report the training-set R squared.
print('R Squared value = ', r2_train)

In [None]:
# Predict sales for the held-out test rows, which were not used for fitting.
test_data_prediction = regressor.predict(X_test)

In [None]:
# R squared (coefficient of determination) on the test data.
r2_test = metrics.r2_score(Y_test, test_data_prediction)

In [None]:
# Report the test-set R squared (same label text as the training report above).
print('R Squared value = ', r2_test)