In [58]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, KFold, cross_val_score
import pandas as pd

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
df = pd.read_csv("../utils/generator/dataset/dataset_diff.csv")
df = df.drop('일시', axis=1)
y = df['result']
X = df.drop('result', axis=1)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42)

In [20]:

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [37]:
pipelines = []
pipelines.append(('ScaledLR', Pipeline([('Scaler', StandardScaler()),('LR',LinearRegression())])))
pipelines.append(('ScaledLASSO', Pipeline([('Scaler', StandardScaler()),('LASSO', Lasso())])))
pipelines.append(('ScaledEN', Pipeline([('Scaler', StandardScaler()),('EN', ElasticNet())])))
pipelines.append(('ScaledKNN', Pipeline([('Scaler', StandardScaler()),('KNN', KNeighborsRegressor())])))
pipelines.append(('ScaledCART', Pipeline([('Scaler', StandardScaler()),('CART', DecisionTreeRegressor())])))
pipelines.append(('ScaledGBM', Pipeline([('Scaler', StandardScaler()),('GBM', GradientBoostingRegressor())])))

results = [[]]
idx = 0
names = []
scoring = ['neg_root_mean_squared_error', 'r2', 'explained_variance']
for name, model in pipelines:
    kfold = KFold(n_splits=10, random_state=21)
    cv_results = []
    cv_results.append(cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_root_mean_squared_error'))
    cv_results.append(cross_val_score(model, X_train, y_train, cv=kfold, scoring='r2'))
    cv_results.append(cross_val_score(model, X_train, y_train, cv=kfold, scoring='explained_variance'))
    # for res in cv_results:
    #     results[idx].append(res)
    # idx+=1
    names.append(name)
    print("=={}==".format(name))
    for idx in range (len(cv_results)):
        print("[+]{}".format(scoring[idx]))
        msg = "%f (%f)" %  (cv_results[idx].mean(), cv_results[idx].std())
        print(msg)
   




==ScaledLR==
[+]neg_root_mean_squared_error
-2.104776 (0.132958)
[+]r2
0.905598 (0.013326)
[+]explained_variance
0.905644 (0.013305)
==ScaledLASSO==
[+]neg_root_mean_squared_error
-2.401831 (0.091659)
[+]r2
0.877489 (0.010028)
[+]explained_variance
0.877573 (0.010006)
==ScaledEN==
[+]neg_root_mean_squared_error
-3.038641 (0.076895)
[+]r2
0.804160 (0.009237)
[+]explained_variance
0.804317 (0.009181)
==ScaledKNN==
[+]neg_root_mean_squared_error
-1.833678 (0.110991)
[+]r2
0.928376 (0.009619)
[+]explained_variance
0.928460 (0.009651)
==ScaledCART==
[+]neg_root_mean_squared_error
-2.329668 (0.128870)
[+]r2
0.883764 (0.014387)
[+]explained_variance
0.882873 (0.011587)
==ScaledGBM==
[+]neg_root_mean_squared_error
-1.770464 (0.141930)
[+]r2
0.933004 (0.012183)
[+]explained_variance
0.932986 (0.012176)


In [94]:
df = pd.read_csv("../utils/generator/dataset/dataset_diff.csv")

df.rename(columns={'일시':'datetime'}, inplace=True)
df.head(5)

Unnamed: 0,datetime,result,temperature,percipitation,wind_degree,wind_speed,air_pressure,sea_level_pressure,humidity,solar_radiation,solar_intensity
0,2019-07-01 00:00:00,0.0,23.273333,0.0,216.073333,1.193333,997.38,1003.88,84.453333,0.0,0.0
1,2019-07-01 00:15:00,0.0,23.2,0.0,225.486667,1.306667,997.266667,1003.766667,84.853333,0.0,0.0
2,2019-07-01 00:30:00,0.0,23.2,0.0,223.14,0.646667,997.2,1003.7,85.28,0.0,0.0
3,2019-07-01 00:45:00,0.0,23.2,0.0,252.066667,1.053333,997.2,1003.7,85.56,0.0,0.0
4,2019-07-01 01:00:00,0.0,23.2,0.0,230.933333,0.973333,997.133333,1003.633333,85.74,0.0,0.0


In [95]:
type(pd.to_datetime(df.datetime[1], infer_datetime_format=True)) 
pd.to_datetime(df.datetime[1], infer_datetime_format=True)




Timestamp('2019-07-01 00:15:00')

In [96]:
df.datetime = pd.to_datetime(df.datetime, infer_datetime_format=True)

In [97]:
df["difference"] = df["datetime"].sub(pd.to_datetime("2019-01-01", infer_datetime_format=True), axis=0)/ np.timedelta64(1, 'D')

In [109]:
# df.loc[df.difference >= 365] = df.loc[df.difference >= 365].difference - 365

for x in range(len(df.difference)):
    if df.difference.iloc[x] > 365:
        df.difference.iloc[x] -= 365 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [111]:
df.to_csv("datefrom1st.csv")
