In [54]:
import matplotlib.pyplot as plt
import seaborn as sns
import module
from module import create_model_split,create_model_cv
import pandas as pd
import numpy as np

"""
変更点：目的変数を「偏差値」,
        説明変数に以下を追加
        「曜日」,[決算翌日フラグ], [長期休暇翌日フラグ]

"""

plt.rcParams['figure.figsize'] = (15.0, 15.0)



def read_csv2df(dirpath='/home/ubuntu/analysis/stockPrice/',
                csv_name='stockPriceData_dev_val_v2.csv'):
    """Load the training data CSV into a DataFrame.

    Args:
        dirpath: Directory that contains the CSV file. The default is the
            location this notebook was originally written against.
        csv_name: File name of the CSV inside ``dirpath``.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the
            resolved path does not exist.
    """
    import os

    import pandas as pd

    # os.path.join works whether or not dirpath ends with a separator, which
    # plain string concatenation did not.
    return pd.read_csv(os.path.join(dirpath, csv_name))

def setting_dummy(train_df):
    """Replace the ``week_d`` column with one indicator column per weekday.

    The weekday labels in ``week_d`` are the Japanese single-character names
    for Monday to Friday. Each becomes a ``price_<day>`` column holding the
    indicator values produced by ``pandas.get_dummies``.

    Args:
        train_df: DataFrame containing a ``week_d`` column. It is not
            modified.

    Returns:
        pandas.DataFrame: A copy of ``train_df`` without ``week_d`` and with
        ``price_mon`` ... ``price_fri`` appended, in that order.

    Raises:
        KeyError: If ``train_df`` has no ``week_d`` column.
    """
    # Weekday label found in ``week_d`` -> name of the generated column.
    weekday_cols = {
        '月': 'price_mon',
        '火': 'price_tue',
        '水': 'price_wed',
        '木': 'price_thu',
        '金': 'price_fri',
    }

    # Declaring the categories up front makes get_dummies emit a column for
    # every weekday, including one that never occurs in this data set.
    # Indexing the dummies of the raw column raised KeyError in that case.
    weekdays = train_df['week_d'].astype(
        pd.CategoricalDtype(categories=list(weekday_cols)))
    d = pd.get_dummies(weekdays)

    # Work on a copy so the caller's frame does not gain columns as a side
    # effect of this call.
    out = train_df.copy()
    for label, col in weekday_cols.items():
        out[col] = d[label]

    return out.drop(columns='week_d')

def setting(train_df, y_val='富士通(株)：翌日比偏差値'):
    """Choose the target column and the feature columns for modelling.

    Args:
        train_df: DataFrame whose ``columns`` are split into features and
            non-features. Only the column names are read.
        y_val: Name of the target column. The default keeps the column this
            notebook currently models. Note that when a different target is
            passed, the default target column is no longer excluded and will
            appear among the features unless the caller removes it.

    Returns:
        tuple: ``(y_val, nonNeededCol, x_val)`` where ``nonNeededCol`` is the
        list of column names excluded from the features (the date parts plus
        the target) and ``x_val`` is the list of remaining column names in
        their original order.
    """
    # Date parts and the target itself are never used as features.
    nonNeededCol = ['date', 'year', 'month', 'day', y_val]

    # Independent variables: everything that was not excluded above.
    x_val = [col for col in train_df.columns if col not in nonNeededCol]

    return y_val, nonNeededCol, x_val


def exe_prediction():
    """Run ``prediction.py`` with ``python3`` as a child process.

    The script name is given without a directory, so it has to be reachable
    from the working directory the child process starts in.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
    """
    from subprocess import check_call

    check_call(['python3', 'prediction.py'])

def plot(train_df, y):
    """Plot ``y`` against the ``date`` column of ``train_df`` and show it.

    Args:
        train_df: DataFrame providing the ``date`` column for the x axis.
        y: Values for the y axis; must have the same length as
            ``train_df['date']``.
    """
    # Open the figure before drawing. Calling plt.figure() after plt.show()
    # left an empty figure behind after every call.
    plt.figure()

    # NOTE(review): the same series is drawn twice, once per marker style.
    # This looks like a leftover from a two-series comparison; confirm
    # whether the second call was meant to plot a different series.
    plt.plot(train_df['date'], y, marker='o')
    plt.plot(train_df['date'], y, marker='X')

    plt.xlabel('date')
    plt.ylabel('stockVal_ratio')
    plt.tight_layout()
    plt.show()


# Correlation matrix between the variables, drawn as a heat map
def heatMap(df, annot=False):
    """Draw a heat map of the pairwise Pearson correlations in ``df``.

    Args:
        df: DataFrame to correlate. Only numeric and boolean columns take
            part; other columns (for example date strings) are left out.
        annot: When True, each coefficient is written into its cell with one
            decimal place. Defaults to False, which matches the un-annotated
            plot this function has always produced.
    """
    # Select the usable columns explicitly: recent pandas versions raise on
    # non-numeric columns in DataFrame.corr instead of silently dropping them.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    corr_mat = numeric_df.corr(method='pearson')

    sns.heatmap(
        corr_mat,
        vmax=1.0,
        vmin=-1.0,
        center=0,
        annot=annot,
        # '.1ft' is not a valid format spec. It went unnoticed only because
        # seaborn applies fmt solely when annotations are enabled.
        fmt='.1f',
    )

# Overlay of the predicted values and the actual values
def compare_view(y_test_df):
    """Plot the ``y_test_pred`` and ``y_test`` columns together with a legend.

    Args:
        y_test_df: DataFrame holding the ``y_test_pred`` and ``y_test``
            columns.
    """
    predicted, actual = y_test_df['y_test_pred'], y_test_df['y_test']

    for series, legend_label in ((predicted, 'prediction'), (actual, 'True')):
        plt.plot(series, label=legend_label)
    plt.legend()

def feature_view(feature_imp, col, figsize=(30, 50)):
    """Show a horizontal bar chart of feature importances.

    Args:
        feature_imp: Importance scores, used as bar lengths on the x axis.
        col: Feature names, used as bar labels on the y axis.
        figsize: ``(width, height)`` of the figure in inches. The default is
            the size this function previously requested.
    """
    # The figure has to exist before anything is drawn. With plt.figure(...)
    # placed after barplot, the bars landed on a default-size figure and an
    # empty figure of the requested size was shown as well.
    plt.figure(figsize=figsize)

    sns.barplot(x=feature_imp, y=col)
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("Visualizing Important Features")
    plt.show()

def compare_scaler(x, y):
    """Evaluate the model once per scaler, each applied to the original ``x``.

    ``StandardScaler``, ``MinMaxScaler`` and ``RobustScaler`` are each fitted
    on ``x`` and the scaled features are handed to ``module.exe_ml``.

    Args:
        x: Feature matrix accepted by the scalers' ``fit_transform``.
        y: Target values, passed to ``module.exe_ml`` unchanged.

    Returns:
        dict: Maps ``str(scaler)`` to element ``[1]`` of the value returned
        by ``module.exe_ml`` (presumably a score; confirm against
        ``module``).
    """
    from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

    scores = {}
    for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
        # Scale into a separate variable. Reassigning ``x`` here made every
        # later scaler work on the previous scaler's output instead of the
        # raw features, so the three results were not comparable.
        x_scaled = scaler.fit_transform(x)
        val = module.exe_ml(x_scaled, y)
        scores[str(scaler)] = val[1]
    return scores




"""
1. 前準備
    1-1: Load CSV
    1-2: setting
"""
# 1-1: read the CSV and swap the weekday column for per-day dummy columns
train_df = setting_dummy(read_csv2df())

# 1-2: decide which column is the target and which columns are features
y_val, nonNeedCol, x_val = setting(train_df)

# Feature frame and target series used by the cells that follow
x, y = train_df[x_val], train_df[y_val]


In [6]:
import scipy.stats
import numpy as np 

y.describe()

count    298.000000
mean      50.000302
std       10.000071
min      -11.120000
25%       45.012500
50%       50.520000
75%       54.760000
max      109.860000
Name: 富士通(株)：翌日比偏差値, dtype: float64

In [7]:
y.describe()

count    298.000000
mean      50.000302
std       10.000071
min      -11.120000
25%       45.012500
50%       50.520000
75%       54.760000
max      109.860000
Name: 富士通(株)：翌日比偏差値, dtype: float64

In [17]:
# Preview of winsorizing the target with 1% limits on each tail. The result is
# only displayed: it is not assigned, so y keeps its original values.
scipy.stats.mstats.winsorize(y.values, limits=[0.01, 0.01])

masked_array(data=[52.65, 62.09, 51.83, 54.62, 44.44, 50.02, 50.52, 45.75,
                   52.65, 53.63, 53.39, 55.36, 51.83, 53.47, 44.69, 54.46,
                   41.24, 42.96, 42.72, 32.79, 75.06, 45.75, 44.77, 65.78,
                   63.24, 54.62, 55.69, 47.89, 41.08, 37.55, 53.72, 64.88,
                   55.03, 45.92, 60.69, 50.68, 44.03, 50.27, 48.63, 42.72,
                   58.07, 40.26, 35.58, 48.46, 45.92, 59.46, 55.19, 53.55,
                   51.25, 59.87, 59.63, 44.11, 33.03, 36.89, 46.41, 49.94,
                   47.31, 39.19, 57.9 , 75.06, 49.78, 55.6 , 48.05, 52.98,
                   52.73, 44.77, 49.28, 51.58, 50.35, 47.97, 55.03, 49.61,
                   44.11, 46.  , 54.21, 49.94, 67.92, 51.91, 47.81, 52.57,
                   41.08, 53.55, 53.39, 42.55, 52.57, 44.36, 55.11, 48.96,
                   53.63, 38.86, 48.79, 53.8 , 63.81, 49.37, 52.16, 52.49,
                   51.66, 48.63, 39.27, 47.81, 38.37, 58.89, 55.85, 52.32,
                   34.59,

In [10]:
import scipy.stats
import numpy as np 
# Toy example of winsorizing: with 20 values and 5% limits, one value per tail
# is replaced. In the output, -40 becomes -5 and 1053 becomes 101.
a = np.array([92, 19, 101, 58, 1053, 91, 26, 78, 10, 13, -40, 101, 86, 85, 15, 89, 89, 28, -5, 41]) 
scipy.stats.mstats.winsorize(a, limits=[0.05, 0.05])

masked_array(data=[ 92,  19, 101,  58, 101,  91,  26,  78,  10,  13,  -5,
                   101,  86,  85,  15,  89,  89,  28,  -5,  41],
             mask=False,
       fill_value=999999)

In [23]:
# Mean of the target; used as the centre of the outlier band and as the value
# that replaces outliers in the cells that follow.
av = np.mean(y)

In [25]:
# Standard deviation of the target via np.std, whose default is ddof=0.
# That is why the value shown here (about 9.983) is slightly smaller than the
# std reported by y.describe() (about 10.000).
sd = np.std(y)
sd

9.983278279743649

In [50]:
# Outlier band: two standard deviations either side of the mean.
# NOTE(review): the bounds printed further down (79.95 / 20.05) equal
# av ± 3*sd, so that output was produced before the multiplier became 2.
# Re-run the print cell to refresh it.
out_min, out_max = av - 2 * sd, av + 2 * sd

In [33]:
print(out_max)
print(out_min)

79.95013685265377
20.050467174191873


In [56]:
# Overwrite, in place, every target value below the lower bound with the mean.
# Only the lower tail is handled: the upper-tail counterpart is commented out,
# so values above out_max are left as they are.
# NOTE(review): y was taken from train_df; depending on the pandas version and
# copy-on-write settings this write may or may not reach train_df. Confirm
# which is intended and take an explicit .copy() if it should not.
y.loc[y < out_min] = av
#y[y > out_max] = av

In [58]:
y.describe()

count    298.000000
mean      50.852358
std        8.407730
min       30.820000
25%       45.940000
50%       50.520000
75%       54.760000
max      109.860000
Name: 富士通(株)：翌日比偏差値, dtype: float64

In [66]:
# Powers of two from 2**-5 up to 2**10 as floats, displayed as a plain list
a = np.power(2, np.arange(-5, 11, dtype=float))
a.tolist()


[0.03125,
 0.0625,
 0.125,
 0.25,
 0.5,
 1.0,
 2.0,
 4.0,
 8.0,
 16.0,
 32.0,
 64.0,
 128.0,
 256.0,
 512.0,
 1024.0]