In [1]:
import pandas as pd
import numpy as np
from datetime import date, timedelta, datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVC
from sklearn.metrics import log_loss, accuracy_score
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
%matplotlib inline

plt.rcParams["figure.figsize"] = (20,3)

### 지점정보
1. 22453: 구룡포
2. 22490: 월포
---
3. 21229: 울릉도
4. 22105: 동해
5. 22106: 포항 앞바다
6. 22190: 울진
7. 22189: 울산

In [2]:
df = pd.read_csv('./all_merged_info_data.tsv',sep='\t',index_col='Unnamed: 0')
df['hour'] = df['hour'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 34992 entries, 0 to 34991
Data columns (total 90 columns):
hour                     34992 non-null object
climate                  34992 non-null object
weather_deterioration    34992 non-null int64
swell                    34392 non-null float64
22453_일시                 33979 non-null float64
22453_수온(°C)             33082 non-null float64
22453_최대파고(m)            33632 non-null float64
22453_유의파고(m)            33632 non-null float64
22453_평균파고(m)            33632 non-null float64
22453_파주기(sec)           33634 non-null float64
22490_일시                 19639 non-null float64
22490_수온(°C)             19464 non-null float64
22490_최대파고(m)            19224 non-null float64
22490_유의파고(m)            19224 non-null float64
22490_평균파고(m)            19224 non-null float64
22490_파주기(sec)           19562 non-null float64
21229_GUST풍속 1(m/s)      8637 non-null float64
21229_GUST풍속(m/s)        26091 non-null float64
21229_기온(°C)             33915 n

In [3]:
df.set_index('hour',inplace=True)

In [4]:
st = 24 * 30

### shift하여 row에 시간정보를 포함시킨다

In [5]:
def check_isin(col_name):
    for name in expr_features:
        if name in col_name:
            return True
    return False

In [8]:
 expr_features = ['파고','파주기']

In [10]:
shift_features = [col for col in df.columns if check_isin(col)]
shift_features

['22453_최대파고(m)',
 '22453_유의파고(m)',
 '22453_평균파고(m)',
 '22453_파주기(sec)',
 '22490_최대파고(m)',
 '22490_유의파고(m)',
 '22490_평균파고(m)',
 '22490_파주기(sec)',
 '21229_유의파고(m)',
 '21229_최대파고(m)',
 '21229_파주기(sec)',
 '21229_평균파고(m)',
 '22105_유의파고(m)',
 '22105_최대파고(m)',
 '22105_파주기(sec)',
 '22105_평균파고(m)',
 '22106_유의파고(m)',
 '22106_최대파고(m)',
 '22106_파주기(sec)',
 '22106_평균파고(m)',
 '22190_최대파고(m)',
 '22190_유의파고(m)',
 '22190_평균파고(m)',
 '22190_파주기(sec)',
 '22189_최대파고(m)',
 '22189_유의파고(m)',
 '22189_평균파고(m)',
 '22189_파주기(sec)']

In [43]:
inseq=6
temp_cols = []
temp_names = []
for i in range(1,inseq):
    temp_cols.append(df[shift_features].shift(-i))
    temp_names += [features_name+'(t-{})'.format(i) for features_name in shift_features]

In [44]:
temp_df = pd.concat(temp_cols,axis=1)
temp_df.columns = temp_names

In [59]:
re_cols = [col for col in temp_df.columns if '평균파고' in col]
re_cols 

['22453_평균파고(m)(t-1)',
 '22490_평균파고(m)(t-1)',
 '21229_평균파고(m)(t-1)',
 '22105_평균파고(m)(t-1)',
 '22106_평균파고(m)(t-1)',
 '22190_평균파고(m)(t-1)',
 '22189_평균파고(m)(t-1)',
 '22453_평균파고(m)(t-2)',
 '22490_평균파고(m)(t-2)',
 '21229_평균파고(m)(t-2)',
 '22105_평균파고(m)(t-2)',
 '22106_평균파고(m)(t-2)',
 '22190_평균파고(m)(t-2)',
 '22189_평균파고(m)(t-2)',
 '22453_평균파고(m)(t-3)',
 '22490_평균파고(m)(t-3)',
 '21229_평균파고(m)(t-3)',
 '22105_평균파고(m)(t-3)',
 '22106_평균파고(m)(t-3)',
 '22190_평균파고(m)(t-3)',
 '22189_평균파고(m)(t-3)',
 '22453_평균파고(m)(t-4)',
 '22490_평균파고(m)(t-4)',
 '21229_평균파고(m)(t-4)',
 '22105_평균파고(m)(t-4)',
 '22106_평균파고(m)(t-4)',
 '22190_평균파고(m)(t-4)',
 '22189_평균파고(m)(t-4)',
 '22453_평균파고(m)(t-5)',
 '22490_평균파고(m)(t-5)',
 '21229_평균파고(m)(t-5)',
 '22105_평균파고(m)(t-5)',
 '22106_평균파고(m)(t-5)',
 '22190_평균파고(m)(t-5)',
 '22189_평균파고(m)(t-5)']

In [77]:
pos_cols = [col for col in re_cols if '22106' in col]

In [78]:
temp_df[re_cols].corr()['22453_평균파고(m)(t-1)'][pos_cols].sort_values(ascending=False)

22106_평균파고(m)(t-1)    0.816898
22106_평균파고(m)(t-2)    0.810810
22106_평균파고(m)(t-3)    0.800124
22106_평균파고(m)(t-4)    0.785060
22106_평균파고(m)(t-5)    0.767104
Name: 22453_평균파고(m)(t-1), dtype: float64