# File descriptions

- sales_train.csv - the training set. Daily historical data from January 2013 to October 2015.
- test.csv - the test set. You need to forecast the sales for these shops and products for November 2015.
- sample_submission.csv - a sample submission file in the correct format.
- items.csv - supplemental information about the items/products.
- item_categories.csv - supplemental information about the item categories.
- shops.csv - supplemental information about the shops.

# Data fields

- ID - an Id that represents a (Shop, Item) tuple within the test set
- shop_id - unique identifier of a shop
- item_id - unique identifier of a product
- item_category_id - unique identifier of item category
- item_cnt_day - number of products sold (판매된 제품 수). You are predicting a monthly amount of this measure.
- item_price - current price of an item
- date - date in format dd.mm.yyyy (e.g. 02.01.2013)
- date_block_num - a consecutive month number (연속 월 숫자), used for convenience. January 2013 is 0, February 2013 is 1, ..., October 2015 is 33.
- item_name - name of item
- shop_name - name of shop
- item_category_name - name of item category

In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Load the data: every table is a CSV file under ./sales/.
# The files are read in the order listed, one per target name.
sales, test, sample_submission = map(
    pd.read_csv,
    (
        "./sales/sales_train.csv",
        "./sales/test.csv",
        "./sales/sample_submission.csv",
    ),
)
items, item_categories, shop = map(
    pd.read_csv,
    (
        "./sales/items.csv",
        "./sales/item_categories.csv",
        "./sales/shops.csv",
    ),
)

In [2]:
# Overview of the sales data: show the first 100 rows.
sales.iloc[:100]
# Alternative views kept for reference:
# sales.loc[(sales.shop_id==50) & (sales.item_id==2252),:]
# shop.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.00,1.0
1,03.01.2013,0,25,2552,899.00,1.0
2,05.01.2013,0,25,2552,899.00,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.00,1.0
...,...,...,...,...,...,...
95,08.01.2013,0,25,2808,999.00,2.0
96,09.01.2013,0,25,2808,999.00,1.0
97,11.01.2013,0,25,2808,999.00,4.0
98,17.01.2013,0,25,2808,999.00,1.0


In [3]:
# Quick look at a few tables and columns. Only the value of the last
# expression (the first 100 shop ids) is shown as the cell result.
item_categories.iloc[:100]
sales["item_id"].unique()
sales["shop_id"].iloc[:100]

0     59
1     25
2     25
3     25
4     25
      ..
95    25
96    25
97    25
98    25
99    25
Name: shop_id, Length: 100, dtype: int64

In [4]:
# Inspect parts of the sales table. Only the final expression (the dtypes)
# is shown as the cell result.
sales["date"]        # the date column
sales["date"].size   # number of entries in the date column
sales["item_price"]  # the item_price column
sales.dtypes         # dtype of each column (to see what needs numeric conversion)

date               object
date_block_num      int64
shop_id             int64
item_id             int64
item_price        float64
item_cnt_day      float64
dtype: object

In [5]:
# Collect the date, item id and price of every sales record.
selected_cols = sales.loc[:, ["date", "item_id", "item_price"]]
selected_cols["date"]  # first of the selected columns
selected_cols

Unnamed: 0,date,item_id,item_price
0,02.01.2013,22154,999.00
1,03.01.2013,2552,899.00
2,05.01.2013,2552,899.00
3,06.01.2013,2554,1709.05
4,15.01.2013,2555,1099.00
...,...,...,...
2935844,10.10.2015,7409,299.00
2935845,09.10.2015,7460,299.00
2935846,14.10.2015,7459,349.00
2935847,22.10.2015,7440,299.00


In [6]:
# Sum the remaining column (item_price) over all rows that share the same
# (date, item_id) pair.
per_date_item = selected_cols.groupby(by=["date", "item_id"])
per_date_item.sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,item_price
date,item_id,Unnamed: 2_level_1
01.01.2013,32,2093.44
01.01.2013,33,998.00
01.01.2013,53,170.00
01.01.2013,59,359.00
01.01.2013,85,299.00
...,...,...
31.12.2014,22106,299.00
31.12.2014,22111,349.00
31.12.2014,22118,198.00
31.12.2014,22139,1891.46


In [9]:
# Convert categorical data to numeric codes.
# Drawback of .cat.codes: it is hard to see which class each number maps to.
sales["date"].astype("category").cat.codes

# LabelEncoder covers that drawback: the fitted classes stay available on
# the encoder as `classes_`.
# NOTE: the dates are dd.mm.yyyy strings, so the classes are ordered as
# strings ('01.01.2013', '01.01.2014', ...), not chronologically.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder().fit(sales["date"])
result = le.transform(sales["date"])
print(result)
le.classes_
# le.inverse_transform(result)  # inverse transform

[ 34  68 136 ... 471 743  97]


array(['01.01.2013', '01.01.2014', '01.01.2015', ..., '31.10.2015',
       '31.12.2013', '31.12.2014'], dtype=object)

In [10]:
# A column that looks numeric but is really categorical should be one-hot
# encoded.
# get_dummies: one-hot encodes categorical data; here it is applied to the
# date column, giving one indicator column per distinct date string.
date_values = sales["date"]
pd.get_dummies(date_values)

Unnamed: 0,01.01.2013,01.01.2014,01.01.2015,01.02.2013,01.02.2014,01.02.2015,01.03.2013,01.03.2014,01.03.2015,01.04.2013,...,31.07.2014,31.07.2015,31.08.2013,31.08.2014,31.08.2015,31.10.2013,31.10.2014,31.10.2015,31.12.2013,31.12.2014
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935844,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2935845,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2935846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2935847,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Numerical -> categorical conversion, followed by one-hot encoding.
# NOTE(review): this overwrites sales["item_cnt_day"] in place, so the column
# is no longer numeric afterwards — confirm nothing later needs to sum it.
item_counts = sales["item_cnt_day"].astype("category")
sales["item_cnt_day"] = item_counts
pd.get_dummies(item_counts)

Unnamed: 0,-22.0,-16.0,-9.0,-6.0,-5.0,-4.0,-3.0,-2.0,-1.0,1.0,...,504.0,508.0,512.0,533.0,539.0,624.0,637.0,669.0,1000.0,2169.0
0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935844,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2935845,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2935846,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2935847,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Load the 'train' and 'test' sheets of the workbook and split the training
# sheet into features and target.
# read_excel is required: read_csv has no sheet_name parameter (it raises
# TypeError) and cannot parse an .xlsx workbook. Passing a list of sheet
# names reads the file once and returns a dict keyed by sheet name.
sheets = pd.read_excel('data4/hyundaiCar.xlsx', sheet_name=['train', 'test'])
train_df = sheets['train']
test_df = sheets['test']
# Features: every column except the first.
# Assumes the target column '가격' is the first column of the sheet — TODO confirm.
x_train = train_df.iloc[:, 1:]
y_train = train_df['가격']  # target column
x_train