### 1. 라이브러리 선언

In [1]:
import pandas as pd
import numpy as np

### 2. 데이터 불러오기

In [2]:
targetData = \
        pd.read_csv("../dataset/kopo_channel_seasonality_new.csv")

#### REGIONID : STR : 지역 정보 : 1 : 20 : 미국, 영국 등 법인 정보
#### PRODUCT : STR : 상품 정보 : 1 : 20 : TV, 냉장고 등
#### YEARWEEK : INT : 연주차 정보 : 1 : 6
#### QTY : FLOAT : 판매량 SELLOUT : 음수로 된 반품값 존재

In [3]:
### describe - 숫자형 데이터에 대한 기본 통계를 만들어준다.

In [4]:
targetData.describe()

Unnamed: 0,YEARWEEK,QTY
count,124658.0,124658.0
mean,201526.66879,8949.287
std,82.77994,42949.73
min,201401.0,-364.0
25%,201440.0,32.0
50%,201527.0,282.0
75%,201613.0,2223.0
max,201652.0,1663206.0


In [5]:
### 데이터 형태 변환

In [6]:
selloutData = targetData.astype ({ "REGIONID":str,
                                    "PRODUCT":str,
                                    "YEARWEEK":str,
                                    "QTY":float })

In [7]:
selloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY
0,A60,PRODUCT4,201402,71.0
1,A60,PRODUCT59,201402,22275.0
2,A60,PRODUCT34,201402,4463.0
3,A60,PRODUCT47,201402,0.0
4,A60,PRODUCT56,201402,23.0
...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0
124654,A10,PRODUCT56,201630,275.0
124655,A10,PRODUCT61,201630,0.0
124656,A10,PRODUCT12,201630,15021.0


### [불량 데이터 처리] qty가 음수인경우 0, 양수인 경우 기존 qty값 유지

In [8]:
selloutData["QTY_NEW"] = np.where ( selloutData.QTY < 0, 0, selloutData.QTY)

In [9]:
## 검증로직
#selloutData.loc[selloutData.QTY_NEW < 0]
selloutData[selloutData.QTY < 0]["QTY"].count()

323

### [데이터 통합 실습문제] year, week 컬럼을 생성하고, week가 52이하인 데이터만 조회한 후 refinedSelloutData 변수에 담아라

#### 1. year, week 분리하자

In [10]:
selloutData["YEAR"] = selloutData.YEARWEEK.str[0:4]

In [11]:
selloutData["WEEK"] = selloutData.YEARWEEK.str[4:6]

#### 2. int로 바꿔주기.

In [12]:
newSelloutData = selloutData.astype ({ "REGIONID":str,
                                    "PRODUCT":str,
                                    "YEARWEEK":str,
                                    "QTY":float,
                                    "YEAR":int,
                                    "WEEK":int})

#### 3. 데이터 추출

In [13]:
refinedSelloutData = newSelloutData.loc[newSelloutData.WEEK <= 52]

#### 4. 검증로직

In [14]:
refinedSelloutData.loc[refinedSelloutData.WEEK >= 53]

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK


In [15]:
refinedSelloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,71.0,2014,2
1,A60,PRODUCT59,201402,22275.0,22275.0,2014,2
2,A60,PRODUCT34,201402,4463.0,4463.0,2014,2
3,A60,PRODUCT47,201402,0.0,0.0,2014,2
4,A60,PRODUCT56,201402,23.0,23.0,2014,2
...,...,...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,15021.0,2016,30


In [16]:
## year, week컬럼을 만든다(컬럼이 20개를 넘어가지 않도록 최대한 노력한다.)

In [17]:
## 다른 방법->

In [18]:
selloutData["YEAR"] = selloutData.YEARWEEK.astype(str).str[0:4]
selloutData["YEAR"] = selloutData.YEARWEEK.astype(str).str[4:]

In [19]:
yearweekVal = "201514"

In [20]:
int(yearweekVal)%100

14

In [21]:
refinedSelloutData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK
0,A60,PRODUCT4,201402,71.0,71.0,2014,2
1,A60,PRODUCT59,201402,22275.0,22275.0,2014,2
2,A60,PRODUCT34,201402,4463.0,4463.0,2014,2
3,A60,PRODUCT47,201402,0.0,0.0,2014,2
4,A60,PRODUCT56,201402,23.0,23.0,2014,2
...,...,...,...,...,...,...,...
124653,A10,PRODUCT60,201630,824.0,824.0,2016,30
124654,A10,PRODUCT56,201630,275.0,275.0,2016,30
124655,A10,PRODUCT61,201630,0.0,0.0,2016,30
124656,A10,PRODUCT12,201630,15021.0,15021.0,2016,30


#### 지역, 상품, 연주차 컬럼순으로 오름차순 정렬하자

In [22]:
sortKey = ["REGIONID", "PRODUCT", "YEARWEEK"]

In [23]:
sortedData = refinedSelloutData.sort_values(by=sortKey,
                               ignore_index=True,
                               inplace=False)

In [24]:
sortedData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK
0,A00,PRODUCT34,201401,661.0,661.0,2014,1
1,A00,PRODUCT34,201402,679.0,679.0,2014,2
2,A00,PRODUCT34,201403,578.0,578.0,2014,3
3,A00,PRODUCT34,201404,532.0,532.0,2014,4
4,A00,PRODUCT34,201405,516.0,516.0,2014,5
...,...,...,...,...,...,...,...
123859,A77,PRODUCT12,201648,4152.0,4152.0,2016,48
123860,A77,PRODUCT12,201649,5086.0,5086.0,2016,49
123861,A77,PRODUCT12,201650,5846.0,5846.0,2016,50
123862,A77,PRODUCT12,201651,4933.0,4933.0,2016,51


#### 지역, 상품, 연도별 평균을 집계해보자

In [59]:
groupKey = ["REGIONID", "PRODUCT", "YEAR"]

In [60]:
groupData = sortedData.groupby(groupKey)["QTY_NEW"].agg(["mean"]).reset_index()

In [61]:
groupData = groupData.rename(columns = {"mean":"QTY_MEAN"})

#### groupby / describe

In [65]:
sortedData.groupby(groupKey)["QTY"].describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count,mean,std,min,25%,50%,75%,max
REGIONID,PRODUCT,YEAR,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
A00,PRODUCT34,2014,52.0,275.961538,162.179634,60.0,149.50,232.0,365.25,679.0
A00,PRODUCT34,2015,52.0,86.634615,73.246034,24.0,41.50,65.0,109.00,479.0
A00,PRODUCT34,2016,52.0,36.576923,94.897539,2.0,7.00,12.5,22.00,561.0
A00,PRODUCT58,2014,52.0,2.673077,3.889257,0.0,0.00,1.0,3.00,16.0
A00,PRODUCT58,2015,52.0,5.711538,6.675494,1.0,2.00,3.5,6.00,39.0
...,...,...,...,...,...,...,...,...,...,...
A77,PRODUCT1,2015,52.0,3030.019231,846.496783,1882.0,2487.00,2889.5,3342.25,6859.0
A77,PRODUCT1,2016,52.0,3375.326923,702.359238,1995.0,2918.00,3121.5,3700.75,5040.0
A77,PRODUCT12,2014,52.0,2035.788462,762.881491,739.0,1551.50,1990.0,2403.00,3916.0
A77,PRODUCT12,2015,52.0,3540.980769,852.593402,2314.0,2794.25,3500.0,4104.25,5601.0


### 이동집계함수

In [None]:
#데이터프레임명["컬럼명"].rolling(window=5,구간
#                               enter=True,기준점(true면 기준점 포함 위아래로, false면 기준점 위로만.) 
#                               min_periods=1).mean() 최소구간(해당 숫자만큼 계산될 수  있을 때 출력)
#                                             함수(std,meam)
#최소구간이 정의되지 않으면 값이 나오지 않을 수 있다.

In [75]:
sortedData["MA5"] = sortedData.QTY.rolling(window=5,
                      min_periods=1,
                       center=True).mean()

In [84]:
sortedData

Unnamed: 0,REGIONID,PRODUCT,YEARWEEK,QTY,QTY_NEW,YEAR,WEEK,MA5
0,A00,PRODUCT34,201401,661.0,661.0,2014,1,639.333333
1,A00,PRODUCT34,201402,679.0,679.0,2014,2,612.500000
2,A00,PRODUCT34,201403,578.0,578.0,2014,3,593.200000
3,A00,PRODUCT34,201404,532.0,532.0,2014,4,545.600000
4,A00,PRODUCT34,201405,516.0,516.0,2014,5,491.200000
...,...,...,...,...,...,...,...,...
123859,A77,PRODUCT12,201648,4152.0,4152.0,2016,48,4582.600000
123860,A77,PRODUCT12,201649,5086.0,5086.0,2016,49,4718.200000
123861,A77,PRODUCT12,201650,5846.0,5846.0,2016,50,5513.600000
123862,A77,PRODUCT12,201651,4933.0,4933.0,2016,51,5854.000000


In [85]:
## 주차별 추세선 대비 실제 판매량.

In [89]:
sortedData.QTY_NEW

0          661.0
1          679.0
2          578.0
3          532.0
4          516.0
           ...  
123859    4152.0
123860    5086.0
123861    5846.0
123862    4933.0
123863    7551.0
Name: QTY_NEW, Length: 123864, dtype: float64

In [86]:
sortedData["EFFECT"] = sortedData.QTY_NEW / sortedData.MA5

In [None]:
## 필요없는 컬럼 버리기
sortedData.drop(columns=["컬럼명"])

In [None]:
## 중복값 제거.
sortedData.drop_duplicates()

### 3개 연도의 지역, 상품, 주차별 평균 EFFECT 구하기

In [91]:
groupKey = ["REGIONID", "PRODUCT", "WEEK"]

In [93]:
ratioData = sortedData.groupby(groupKey)[["EFFECT"]].agg("mean").reset_index()

In [94]:
ratioData.to_csv("d:/finalresult.csv", index=False, encoding="ms949")

## 위 데이터엔 오류가 있다!

In [95]:
sortedData.to_csv("./middelResult.csv",index=False, encoding="ms949")

## PRODUCT와 YEARWEEK가 바뀌는 지점에서의 오류