# #2. 상관관계 분석 
* 날씨에 민감한 상품군을 파악하는 과정입니다. 

In [None]:
import warnings
warnings.filterwarnings(action='ignore')

import pandas as pd 
import numpy as np
from statsmodels.tsa.stattools import adfuller
# Use fully qualified option keys. Short keys such as "max_rows" are resolved by
# pattern matching and raise OptionError on pandas versions that define more
# than one option containing that substring.
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
# Render floats in fixed-point notation instead of scientific notation.
pd.set_option("display.float_format", "{:f}".format)

import dataload

import os 
import datetime
from tqdm import tqdm 
from functools import reduce
tqdm.pandas()

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt 
plt.style.use('seaborn')
plt.rc('font', family='AppleGothic')
plt.rc('axes', unicode_minus=False)

# Data Load 

In [None]:
# Read the weather table, drop the columns this notebook does not keep, and
# rename the date column to 'date'.
weather = (
    pd.read_csv(os.path.join('최종데이터', 'weather_final2.csv'), encoding='cp949')
    .drop(columns=['연', '월', '일', '분기', '요일', '공휴일명', '공휴일여부', '주말여부', '계절', '폭염여부', '강수여부', 'year', 'month'])
    .rename(columns={'날짜': 'date'})
)

print(weather.shape)
# Sanity check: True when the row count equals (#distinct dates) x (#distinct regions).
weather.shape[0] == weather['date'].nunique() * weather['지역'].nunique()

(11680, 16)


True

In [None]:
# Preview the first rows of the loaded weather table.
weather.head()

Unnamed: 0,date,지점번호,평균기온,최고기온,최저기온,일별강수량,1시간최대강수량,평균풍속,지역,PM10,PM25,1시간최대습도,일조시간합,최고현지기압,체감온도,열지수
0,2018-01-01,105,1.3,5.7,-2.1,0.0,0.0,3.7,강릉,20.066667,13.4,25.4,57.9,1023.0,7.502056,-39.701524
1,2018-01-01,112,-0.3,2.7,-2.7,0.0,0.0,1.6,인천,37.518681,18.641758,67.2,53.8,1020.3,9.992379,-42.924922
2,2018-01-01,119,-1.7,4.7,-6.9,0.0,0.0,1.0,수원,42.782895,21.375,84.7,52.7,1025.1,10.136402,-45.791893
3,2018-01-01,136,-1.0,4.7,-6.5,0.0,0.0,2.2,안동,39.75,28.583333,56.4,58.3,1010.6,8.356692,-44.375704
4,2018-01-01,152,2.1,6.2,-0.4,0.0,0.0,3.3,울산,38.548193,15.433735,42.8,58.3,1017.1,8.861429,-38.064334


In [None]:
# Load the purchase records through the project-local `dataload` helper
# (its implementation is not part of this notebook), then show size and a preview.
buy_full = dataload.load_buy()
print(buy_full.shape)
buy_full.head()

(2056899, 6)


Unnamed: 0,date,sex,age,big_cat,sm_cat,qty
0,20180101,F,20,식품,가공란,37
1,20180101,F,30,식품,가공란,16
2,20180101,F,40,식품,가공란,9
3,20180101,F,50,식품,가공란,3
4,20180101,M,20,식품,가공란,13


In [None]:
# Parse both date columns with one vectorized call each instead of a per-row
# apply, and give each column the format its values actually have.
weather["date"] = pd.to_datetime(weather["date"].astype(str), format="%Y-%m-%d")
# NOTE(review): assumes every purchase date is an 8-digit YYYYMMDD value, as in
# the preview of buy_full - confirm against dataload.load_buy.
buy_full["date"] = pd.to_datetime(buy_full["date"].astype(str), format="%Y%m%d")

100%|██████████| 11680/11680 [00:00<00:00, 12606.08it/s]
100%|██████████| 2056899/2056899 [02:35<00:00, 13188.54it/s]


In [None]:
# Total quantity per (date, sm_cat). The column is selected before summing:
# `.sum('qty')` passed 'qty' positionally as the `numeric_only` argument rather
# than choosing a column, so other numeric columns were summed as well.
buy = (
    buy_full.groupby(['date', 'sm_cat'])['qty']
    .sum()
    .reset_index()[["sm_cat", "date", "qty"]]
)
buy.shape

(267707, 3)

# Correlation Check 
* 날씨 변수 선별 : '평균기온', '일별강수량', '평균풍속', 'PM10', '1시간최대습도', '일조시간합'

In [None]:
# Keep only the join keys (date, region) and the six weather variables selected above.
weather = weather[["date", "지역", '평균기온', '일별강수량', '평균풍속', 'PM10', '1시간최대습도', '일조시간합']]

In [None]:
def weather_correlation(weather, product, loc, purchases=None):
    """Return absolute Spearman correlations between weather and one product's sales.

    Parameters
    ----------
    weather : DataFrame
        Weather table with a 'date' column, a '지역' (region) column and the
        weather variables to correlate.
    product : str
        Value of ``sm_cat`` whose quantities are correlated.
    loc : str
        Region (value of '지역') whose weather rows are used.
    purchases : DataFrame, optional
        Table with 'sm_cat', 'date' and 'qty' columns. When omitted, the
        notebook-level ``buy`` frame is used, as in the original code.

    Returns
    -------
    dict
        One entry per numeric weather column holding |rho| against ``qty``,
        plus the key ``"category"`` set to *product*.
    """
    if purchases is None:
        purchases = buy  # notebook-level aggregated purchases

    local_weather = weather[weather["지역"] == loc].reset_index(drop=True)
    product_sales = purchases[purchases["sm_cat"] == product]

    # NOTE(review): with how='right' only dates that have a sales row survive,
    # so the two fillna calls below never fill anything. If days without sales
    # were meant to count as qty 0, this should probably be how='left' - confirm.
    df = local_weather.merge(product_sales, on="date", how="right")
    df["sm_cat"] = df["sm_cat"].fillna(product)
    df["qty"] = df["qty"].fillna(0)

    # Restrict to numeric columns explicitly so the string columns ('지역',
    # 'sm_cat') cannot reach corr(), and drop the qty self-correlation by label
    # instead of relying on it being the last column.
    numeric = df.select_dtypes(include="number")
    co = numeric.corr(method="spearman")["qty"].drop("qty").abs().to_dict()
    co["category"] = product
    return co

In [None]:
# Collect one correlation record per category and build the frame once.
# DataFrame.append copied the whole frame on every iteration and was removed
# in pandas 2.0.
records = [weather_correlation(weather, cat, '서울') for cat in tqdm(buy.sm_cat.unique())]
result = pd.DataFrame(records)

100%|██████████| 383/383 [00:07<00:00, 51.02it/s]


In [None]:
# Attach each category's correlation values to its (big_cat, sm_cat) pair.
buy_category = (
    buy_full[["big_cat", 'sm_cat']]
    .drop_duplicates()
    .reset_index(drop=True)
    .merge(result, left_on='sm_cat', right_on="category", how='left')
    .drop(columns='category')
)

# score: row-wise sum of every column after the two category columns,
# then rank the categories from highest to lowest score.
buy_category["score"] = buy_category[list(buy_category.columns[2:])].sum(axis=1)
buy_category = buy_category.sort_values('score', ascending=False).reset_index(drop=True)

In [None]:
# Summary statistics of the absolute correlations. The literal cut-offs used in
# the high/low grouping cells further down match this output: the 50% row for
# 평균기온 and the 75% rows for 일별강수량, 평균풍속, PM10 and 1시간최대습도.
buy_category.describe()

Unnamed: 0,평균기온,일별강수량,평균풍속,PM10,1시간최대습도,일조시간합,score
count,383.0,383.0,383.0,383.0,383.0,383.0,383.0
mean,0.24987,0.065854,0.080325,0.11832,0.085571,0.054952,0.654891
std,0.204447,0.055696,0.053475,0.093951,0.073784,0.04194,0.398753
min,0.001421,2.9e-05,0.000666,0.001224,0.000319,0.00073,0.050317
25%,0.086267,0.020545,0.03678,0.042375,0.023711,0.021595,0.351506
50%,0.188603,0.049055,0.072902,0.093782,0.06942,0.045784,0.549655
75%,0.366753,0.10164,0.114823,0.179307,0.124436,0.07634,0.856828
max,0.819175,0.290211,0.254873,0.55642,0.343261,0.189992,1.866211


In [None]:
# Ten categories with the highest score (the frame is sorted by score, descending).
buy_category.head(10)

Unnamed: 0,big_cat,sm_cat,평균기온,일별강수량,평균풍속,PM10,1시간최대습도,일조시간합,score
0,식품,감귤/한라봉/오렌지,0.781819,0.220087,0.143872,0.339007,0.264361,0.117065,1.866211
1,냉난방가전,복합식 가습기,0.799613,0.247738,0.070702,0.335301,0.319788,0.060826,1.833968
2,냉난방가전,초음파식 가습기,0.819175,0.249171,0.057451,0.315716,0.32328,0.062754,1.827546
3,식품,굴 생물,0.778428,0.20951,0.091692,0.335131,0.262712,0.121048,1.798522
4,냉난방가전,에어워셔,0.735215,0.240862,0.080973,0.346412,0.296342,0.04264,1.742445
5,식품,해초류,0.709691,0.196477,0.090531,0.358012,0.272941,0.068113,1.695766
6,냉난방가전,업소용 선풍기,0.742508,0.207005,0.060012,0.289943,0.274205,0.10156,1.675233
7,뷰티,뷰티 타투,0.722963,0.177168,0.080317,0.294297,0.26344,0.125605,1.66379
8,뷰티,핸드크림,0.731759,0.193824,0.10147,0.258004,0.245471,0.090728,1.621257
9,식품,코코아/핫초코,0.702257,0.137033,0.157499,0.256525,0.233131,0.113123,1.599569


# Data Save (추후 수정) 

## ADF Test (정상 / 비정상)

In [None]:
# Vectorized year extraction; relies on 'date' already being datetime64
# (converted in the date-parsing cell above) and avoids a per-row Python call.
buy_full['year'] = buy_full['date'].dt.year

# Pivot-table normalization function (z-score per column)
def normalization(data):
    """Standardize every column of *data* to zero mean and unit sample std.

    The previous implementation rewrote only the first 365 rows (and raised
    IndexError on shorter frames); every row is now normalized, whatever the
    number of rows.

    Parameters
    ----------
    data : DataFrame
        Numeric frame with unique column labels (e.g. the date x sm_cat pivot).

    Returns
    -------
    DataFrame
        The same object that was passed in, modified in place. Each column
        holds ``(x - mean) / std`` with the sample standard deviation
        (``ddof=1``). A constant column has std 0 and becomes NaN.
    """
    # Compute all column statistics before any value is overwritten.
    scaled = (data - data.mean()) / data.std()
    # Write back column by column so the caller's frame is still updated in
    # place and integer columns are free to become float.
    for column in data.columns:
        data[column] = scaled[column]
    return data

# Daily quantity per category as a wide table: one row per date, one column per
# sm_cat, with 0 where a (date, category) pair has no value.
adf_data = (
    buy_full[['date', 'sm_cat', 'qty']]
    .groupby(['date', 'sm_cat'])
    .sum()
    .reset_index()
    .pivot_table(values='qty', index='date', columns='sm_cat')
    .fillna(0)
)

In [None]:
# ADF Test
# Accumulator for the per-category test results.
adf_res = pd.DataFrame()
# One entry per column of the pivot table, i.e. per sm_cat.
sm_cat_list = list(adf_data.columns)

def adf_test(data, sm_cat):
    """Run ``adfuller`` on one column of *data* and label the outcome.

    Parameters
    ----------
    data : DataFrame
        Frame whose column *sm_cat* holds the series to test.
    sm_cat : hashable
        Column label to test.

    Returns
    -------
    DataFrame
        A single row with 'sm_cat', 'ADF_stats' (test statistic), 'p-value'
        and 'test_res' ('정상' when p <= 0.05, otherwise '비정상').
    """
    # adfuller returns (statistic, p-value, ...); only the first two are used.
    adf_stat, p_value = adfuller(data[sm_cat].values)[:2]
    # p <= 0.05 means the test's null hypothesis is rejected.
    test_res = '정상' if p_value <= 0.05 else '비정상'
    row = {'sm_cat': sm_cat, 'ADF_stats': adf_stat, 'p-value': p_value, 'test_res': test_res}
    return pd.DataFrame([row])

# Run the test once per category and concatenate a single time; concatenating
# inside the loop re-copied the accumulated frame on every iteration.
# ignore_index gives each row its own label (every per-call frame has index 0).
adf_res = pd.concat(
    [adf_test(adf_data, sm_cat) for sm_cat in sm_cat_list],
    axis=0,
    ignore_index=True,
)

adf = adf_res[['sm_cat', 'test_res']]  # per-product stationary / non-stationary label
# The index is still written as the first CSV column so the file keeps the same
# three-column layout that is read back below.
adf.to_csv(os.path.join("최종데이터", "adf_res.csv"), encoding="cp949")

In [None]:
# Reload the saved ADF labels. The file was written with its index, so an
# extra 'Unnamed: 0' column comes back alongside sm_cat and test_res.
adf = pd.read_csv(os.path.join("최종데이터", "adf_res.csv"), encoding="cp949")
print(adf.shape)
adf.head()

(383, 3)


Unnamed: 0.1,Unnamed: 0,sm_cat,test_res
0,0,가공란,정상
1,0,가스온수기,정상
2,0,가열식 가습기,비정상
3,0,가자미,정상
4,0,갈비/찜/바비큐용 돈육,비정상


In [None]:
# Split the ADF labels into the two stationarity classes.
adf_0 = adf.loc[adf["test_res"].eq("비정상")]  # non-stationary
adf_1 = adf.loc[adf["test_res"].eq("정상")]  # stationary

adf_0.shape, adf_1.shape

((154, 3), (229, 3))

In [None]:
# Correlation rows of the non-stationary (corr0) and stationary (corr1) categories.
corr0 = buy_category.loc[buy_category['sm_cat'].isin(adf_0['sm_cat'].unique())]
corr1 = buy_category.loc[buy_category['sm_cat'].isin(adf_1['sm_cat'].unique())]

In [None]:
# Remove the '일조시간합' column from both subsets.
corr0 = corr0.drop(columns='일조시간합')
corr1 = corr1.drop(columns='일조시간합')

In [None]:
# Non-stationary categories: "high" when at least one of the five absolute
# correlations reaches its cut-off, "low" for every remaining category.
corr0_high = corr0[
    (corr0['평균기온'] >= 0.188603)
    | (corr0['일별강수량'] >= 0.101640)
    | (corr0['평균풍속'] >= 0.114823)
    | (corr0['PM10'] >= 0.179307)
    | (corr0['1시간최대습도'] >= 0.124436)
]
corr0_low = corr0[~corr0['sm_cat'].isin(corr0_high['sm_cat'].unique())]

corr0_high.shape, corr0_low.shape

((134, 8), (20, 8))

In [None]:
# Stationary categories: "high" when at least one of the five absolute
# correlations reaches its cut-off, "low" for every remaining category.
corr1_high = corr1[
    (corr1['평균기온'] >= 0.188603)
    | (corr1['일별강수량'] >= 0.101640)
    | (corr1['평균풍속'] >= 0.114823)
    | (corr1['PM10'] >= 0.179307)
    | (corr1['1시간최대습도'] >= 0.124436)
]
corr1_low = corr1[~corr1['sm_cat'].isin(corr1_high['sm_cat'].unique())]

corr1_high.shape, corr1_low.shape

((130, 8), (99, 8))

In [None]:
# Tag each subset with its group label. assign() returns a new frame, so the
# label is not written into a filtered slice of another frame (the pattern
# that triggers pandas' SettingWithCopyWarning).
corr0_high = corr0_high.assign(group="비정상_high")
corr0_low = corr0_low.assign(group="비정상_low")
corr1_high = corr1_high.assign(group="정상_high")
corr1_low = corr1_low.assign(group="정상_low")

In [None]:
# Stack the four labelled subsets back into one table (row-wise).
corr_result = pd.concat([corr0_high, corr0_low, corr1_high, corr1_low], axis=0)

In [None]:
# Order the rows by index label, then keep only the two category columns and the group.
corr_result = corr_result.sort_index().loc[:, ["big_cat", "sm_cat", "group"]]

In [None]:
# Persist the category -> group mapping (no index column, cp949 encoding).
corr_result.to_csv(os.path.join("최종데이터", "adf_corr_group_0617.csv"), index=False, encoding="cp949")

In [None]:
# Number of categories in each of the four groups.
corr_result.group.value_counts()

비정상_high    134
정상_high     130
정상_low       99
비정상_low      20
Name: group, dtype: int64