In [1]:
%matplotlib inline

In [2]:
import pandas
import matplotlib.pylab as plt
import numpy as np
import math
import os
import sys
import glob
from scipy import stats
from scipy.optimize import curve_fit
from sklearn.cluster import KMeans
import time

def my_print( this_string, print_flag ):
    """Print ``this_string`` only when ``print_flag`` compares equal to 1."""
    if not (print_flag == 1):
        return
    print(this_string)

def get_list_file_path( path_representation ):
    """Return every filesystem path that matches a glob pattern.

    Args:
        path_representation: glob pattern, e.g. '/home/raohongfu/*.php'.

    Returns:
        The list produced by ``glob.glob`` (empty when nothing matches).
    """
    matched_paths = glob.glob(path_representation)
    return matched_paths

def get_list_symbol_from_outputs():
    """Return the sorted names of the entries directly under ./outputs/tw/.

    Every entry name is treated as one stock symbol.

    The name is extracted with ``os.path.basename`` rather than by splitting
    on '/', so the result is also correct on platforms where ``glob`` returns
    paths that use a different separator.

    Returns:
        Sorted list of entry names; empty when the directory is missing or
        has no entries.
    """
    list_symbol_path = glob.glob('./outputs/tw/*')
    return sorted(os.path.basename(this_path) for this_path in list_symbol_path)


def read_csv_to_dict( input_table ):
    """Read a simple comma-separated file into a list of row dicts.

    The first non-blank line is the header. Every later non-blank line becomes
    one dict mapping header name -> field text; all values stay strings.
    Lines are split on ',' only (no quoting support) and surrounding
    whitespace of each line is stripped. Fields beyond the header length are
    ignored.

    Blank lines (for example an empty line at the end of the file) are
    skipped instead of raising IndexError.

    Args:
        input_table: path of the file to read.

    Returns:
        List of dicts, one per data line, in file order.

    Raises:
        IndexError: a non-blank data line has fewer fields than the header.
    """
    list_return = []
    list_header = None

    with open(input_table, 'r') as f:
        for line in f:
            line = line.strip()

            # A blank line carries no fields; indexing it by header position
            # would fail for any header with more than one column.
            if not line:
                continue

            list_fields = line.split(',')

            if list_header is None:
                list_header = list_fields
                continue

            list_return.append(
                {this_key: list_fields[i] for i, this_key in enumerate(list_header)}
            )

    return list_return

def write_dict_to_csv( list_datas, list_header, output_file_path ):
    """Write a list of dicts as a comma-separated file.

    The header line is ``list_header`` joined by ','. Each dict then becomes
    one line holding ``str(value)`` for every key of ``list_header``, in
    header order. No quoting or escaping is applied.

    Args:
        list_datas: iterable of dicts; each must contain every header key.
        list_header: list of column names (strings).
        output_file_path: destination path; missing parent directories are
            created.

    Raises:
        KeyError: a dict lacks one of the header keys.
    """
    # https://stackoverflow.com/questions/12517451/automatically-creating-directories-with-file-output
    # os.makedirs('') raises, so only create the parent when the path has one.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_file_path, 'w') as f:
        f.write( ','.join(list_header) + '\n')

        for this_data in list_datas:
            f.write(','.join(str(this_data[this_key]) for this_key in list_header) + '\n')
            
    

def get_pandas_dataframe_Profile_qty_price_of_this_stock(this_symbol):
    """Load ./outputs/tw/<symbol>/profile_qty_price.csv as a DataFrame.

    Returns an empty DataFrame when the file does not exist.
    """
    csv_path = './outputs/tw/' + this_symbol + '/' + 'profile_qty_price.csv'

    if os.path.isfile(csv_path):
        return pandas.read_csv(csv_path)

    return pandas.DataFrame()

def get_list_Profile_qty_price_of_this_stock(this_symbol):
    """Return every row of ./outputs/tw/<symbol>/profile_qty_price.csv.

    Rows are dicts as produced by ``read_csv_to_dict``, with 'price'
    converted to float and 'qty' converted to int. Returns an empty list
    when the file does not exist.
    """
    csv_path = './outputs/tw/' + this_symbol + '/' + 'profile_qty_price.csv'
    if not os.path.isfile(csv_path):
        return []

    list_rows = read_csv_to_dict( csv_path )
    for this_row in list_rows:
        # Convert the two numeric columns in place; other columns stay text.
        this_row['price'] = float(this_row['price'])
        this_row['qty'] = int(this_row['qty'])

    return list_rows

def get_list_Profile_qty_price_of_this_stock_of_this_day( this_symbol, this_day ):
    """Return the profile rows of ``this_symbol`` whose 'date' equals ``this_day``.

    Rows keep the order in which ``get_list_Profile_qty_price_of_this_stock``
    returns them.
    """
    return [
        this_row
        for this_row in get_list_Profile_qty_price_of_this_stock(this_symbol)
        if this_row['date'] == this_day
    ]


def get_skewness_of_this_day( this_symbol, this_day):
    """Compute the skewness of one day's traded-price distribution.

    Each price is repeated 'qty' times to form the sample passed to
    ``scipy.stats.skew``.

    Returns:
        ``[skewness, price_item_count, total_qty]`` where price_item_count is
        the number of profile rows for the day and total_qty the sum of their
        quantities. ``[0, 0, 0]`` when the day has no rows.
    """
    list_rows = get_list_Profile_qty_price_of_this_stock_of_this_day( this_symbol, this_day)
    if len(list_rows) == 0:
        return [0,0,0]

    list_rows = sorted( list_rows, key = lambda row: float( row['price'] ) )

    # Expand (price, qty) pairs into one sample per traded unit while adding
    # up the day's quantity.
    list_samples = []
    total_qty_this_day = 0
    for this_row in list_rows:
        this_qty = int( this_row['qty'] )
        list_samples.extend( [ float( this_row['price'] ) ] * this_qty )
        total_qty_this_day = total_qty_this_day + this_qty

    # Original author's note: normalising by the maximum qty gave the same
    # skewness, so only the raw value is reported.
    skewness_original = stats.skew( list_samples )

    return [skewness_original, len(list_rows), total_qty_this_day]

def write_skewness_file_of_this_day( this_day  ):
    """Compute per-symbol skewness for one day and write it to a text file.

    One record is built for every symbol returned by
    ``get_list_symbol_from_outputs`` (symbols without data for the day get
    the zero record produced by ``get_skewness_of_this_day``). Records are
    sorted by (price_item_count, total_qty), largest first, and written as
    comma-separated lines to ./outputs/stats/skewness/skewness_<day>.txt.

    The output directory is created when missing so the first run does not
    fail with FileNotFoundError.

    Args:
        this_day: date string; used for filtering and in the file name.

    Returns:
        The sorted list of record dicts with keys 'date', 'symbol',
        'skewness', 'price_item_count' and 'total_qty'.
    """
    output_dir = './outputs/stats/skewness'
    os.makedirs(output_dir, exist_ok=True)

    list_skewness = []
    for this_symbol in get_list_symbol_from_outputs():
        skewness, total_price_item_count, total_qty_this_day = get_skewness_of_this_day(this_symbol, this_day)

        list_skewness.append({
            'date': this_day,
            'symbol': this_symbol,
            'skewness': skewness,
            'price_item_count': total_price_item_count,
            'total_qty': total_qty_this_day,
        })

    list_skewness = sorted(
        list_skewness,
        key = lambda x : ( x['price_item_count'], x['total_qty'] ),
        reverse=True,
    )

    with open(output_dir + '/skewness_' + this_day + '.txt', 'w') as f:
        f.write('date,symbol,skewness,price_item_count,total_qty\n')

        for this_data in list_skewness:
            f.write('{},{},{:f},{},{}\n'.format(
                this_data['date'],
                this_data['symbol'],
                this_data['skewness'],
                this_data['price_item_count'],
                this_data['total_qty'],
            ))

    return list_skewness

def query_single_stock_daily_summary( this_symbol ):
    """Return all rows of ./outputs/tw/<symbol>/profile_daily_summary.csv.

    Rows are dicts of strings as produced by ``read_csv_to_dict``. Returns
    an empty list when the file does not exist.
    """
    # suppose not over 10000 per stock, so query all for now
    summary_path = './outputs/tw/' + this_symbol + '/' + 'profile_daily_summary.csv'

    if os.path.isfile(summary_path):
        return read_csv_to_dict( summary_path )

    return []

def get_future_price_for_predict( this_symbol, future_date , target_price):
    """Look up one price column of a symbol's daily summary for a given date.

    Args:
        this_symbol: stock symbol.
        future_date: date string matched against the 'date' column.
        target_price: 'high_price' or 'low_price'.

    Returns:
        * the value of the ``target_price`` column for ``future_date``, as
          read from the CSV (text);
        * the string 'thid_day_no_profile_daily_summary_data' when the
          summary has no row for ``future_date`` (callers compare against
          this exact spelling);
        * -999 when ``target_price`` is neither 'high_price' nor 'low_price'.
    """
    if target_price != 'high_price' and target_price != 'low_price':
        return -999

    list_all_data = query_single_stock_daily_summary( this_symbol )

    # A default for next() replaces the former bare ``except:``, which also
    # hid unrelated errors such as KeyboardInterrupt.
    this_data_of_future_date = next((x for x in list_all_data if x['date'] == future_date), None)
    if this_data_of_future_date is None:
        return 'thid_day_no_profile_daily_summary_data'

    return this_data_of_future_date[target_price]
    

def query_single_stock_daily_summary_by_single_date(this_symbol, this_day):
    """Return the daily-summary row of ``this_symbol`` for ``this_day``.

    Returns:
        The first row dict whose 'date' equals ``this_day``, or the string
        'no_data' when the symbol has no summary rows or none for that day.
    """
    list_all_data = query_single_stock_daily_summary(this_symbol)

    # The emptiness check previously referenced a misspelled name and raised
    # NameError on every call.
    if len(list_all_data) == 0:
        return 'no_data'

    # The default keeps a missing date from leaking StopIteration to callers.
    return next((x for x in list_all_data if x['date'] == this_day), 'no_data')

def get_string_of_kmeans_cluster_center(input_np_array):
    """Format cluster centers as text, e.g. '1.23 , 4.56'.

    The centers are first reduced to a flat list of floats by
    ``get_list_of_kmeans_cluster_center`` and then joined with ' , '.
    """
    list_center = get_list_of_kmeans_cluster_center(input_np_array)
    return ' , '.join(map(str, list_center))

def get_list_of_kmeans_cluster_center(input_np_array):
    """Return the first column of a 2-D array as floats rounded to 2 decimals.

    Each row's first element is rounded, formatted with '{:.2f}' and parsed
    back to float.
    """
    return [
        float('{:.2f}'.format(round(this_row[0], 2)))
        for this_row in input_np_array.tolist()
    ]

# Model function for fitting bar-plot data with curve_fit.
def func(x, a, b, c):
    """Evaluate an unnormalised Gaussian: a * exp(-(x - b)^2 / (2 * c^2)).

    ``a`` scales the peak, ``b`` is its position and ``c`` its width.
    """
    squared_offset = (x - b) ** 2
    return a * np.exp(-squared_offset / (2 * c ** 2))

def get_kmeans_cluster_center_by_cluster_count(list_histogram_data, cluster_count):
    """Cluster 1-D samples with K-means and return the cluster centers.

    Args:
        list_histogram_data: flat list of numeric samples.
        cluster_count: number of clusters to fit.

    Returns:
        List of centers as floats rounded to 2 decimals (see
        ``get_list_of_kmeans_cluster_center``), in the order KMeans reports
        them.
    """
    sample_count = len(list_histogram_data)

    # KMeans expects a 2-D array: one row per sample, a single feature column.
    column_of_samples = np.array(list_histogram_data).reshape(sample_count, 1)
    fitted_kmeans = KMeans(n_clusters=cluster_count).fit(column_of_samples)

    return get_list_of_kmeans_cluster_center(fitted_kmeans.cluster_centers_)

def get_this_qty_by_price(this_price, list_profile_qty_price):
    """Return the 'qty' of the first row whose 'price' equals ``this_price``.

    Returns None when no row matches.
    """
    matching_qty = (
        this_row['qty']
        for this_row in list_profile_qty_price
        if this_row['price'] == this_price
    )
    return next(matching_qty, None)

def get_start_index( center_index, left_side_price_point_for_sum_qty, right_side_price_point_for_sum_qty, total_length ):
    """Return the list of indices in a window around ``center_index``.

    The window spans from ``center_index - left`` to ``center_index + right``
    inclusive, clamped to the valid range 0 .. total_length - 1. Bounds are
    truncated to int, so float offsets are accepted.
    """
    first_index = max(center_index - left_side_price_point_for_sum_qty, 0)
    last_index = min(center_index + right_side_price_point_for_sum_qty, total_length - 1)

    #https://stackoverflow.com/questions/18265935/python-create-list-with-numbers-between-2-values
    return list(range(int(first_index), int(last_index) + 1))
        
        
def get_index_based_on_price_point_for_sum_qty(center_index, total_length, price_point_for_sum_qty, is_current_index_center):
    """Return the candidate index windows used to sum qty around a center.

    ``price_point_for_sum_qty`` is the window size. Depending on whether the
    cluster center sits exactly on ``center_index``
    (``is_current_index_center == 1``) or between ``center_index`` and the
    next index (``is_current_index_center == 0``), and on the parity of the
    window size, one or two windows are produced:

    * on center, odd size  (e.g. 3): one symmetric window      -> 4 5c 6
    * on center, even size (e.g. 4): two windows                -> 3 4 5c 6 and 4 5c 6 7
    * between,   even size (e.g. 4): one window                 -> 4 5c-6 7
    * between,   odd size  (e.g. 5): two windows                -> 3 4 5c-6 7 and 4 5c-6 7 8

    Every window is clamped to 0 .. total_length - 1 by ``get_start_index``.

    Returns:
        List of index lists, or None when ``is_current_index_center`` is
        neither 1 nor 0.
    """
    window_is_even = price_point_for_sum_qty % 2 == 0

    # Each entry is (indices to the left, indices to the right) of center_index.
    if is_current_index_center == 1:
        if window_is_even:
            half = price_point_for_sum_qty / 2
            list_offsets = [(half, half - 1), (half - 1, half)]
        else:
            half = (price_point_for_sum_qty - 1) / 2
            list_offsets = [(half, half)]
    elif is_current_index_center == 0:
        if window_is_even:
            half = price_point_for_sum_qty / 2
            list_offsets = [(half - 1, half)]
        else:
            half = (price_point_for_sum_qty + 1) / 2
            list_offsets = [(half - 1, half - 1), (half - 2, half)]
    else:
        return None

    return [
        get_start_index( center_index, left_offset, right_offset, total_length )
        for left_offset, right_offset in list_offsets
    ]
                
        
def get_dict_price_item_count_to_price_point_for_sum_qty():
    """Load the price_item_count -> window-size lookup table.

    Reads ./cluster_count_vs_price_item_coverage_3cluster.csv and maps the
    integer 'price_item_count' column to the integer
    'adjust_price_point_for_sum_qty' column. A later row overrides an
    earlier one with the same price_item_count.
    """
    list_rows = read_csv_to_dict('./cluster_count_vs_price_item_coverage_3cluster.csv')

    return {
        int(this_row['price_item_count']): int(this_row['adjust_price_point_for_sum_qty'])
        for this_row in list_rows
    }
    
def get_this_qty_sum(this_list_index, list_price, list_profile_qty_price):
    """Sum the qty of the prices found at the given indices of ``list_price``.

    Each index is resolved to a price, whose qty is looked up with
    ``get_this_qty_by_price``. Returns 0 for an empty index list.
    """
    return sum(
        get_this_qty_by_price(list_price[this_index], list_profile_qty_price)
        for this_index in this_list_index
    )
    
        
def get_qty_around_kmeans_cluster_centers(list_profile_qty_price):
    """Summarise how much quantity traded around each of 3 K-means centers.

    The rows (dicts with 'price' and 'qty') are expanded into one sample per
    traded unit and clustered into 3 groups. For every cluster center the
    best (largest) qty sum over the candidate index windows from
    ``get_index_based_on_price_point_for_sum_qty`` is taken; the window size
    comes from the lookup table keyed by the number of distinct prices.

    Returns:
        One dict per center, sorted by 'price', with keys
        'kmeans_cluster_center', 'price', 'qty', 'qty_ratio' (qty / total
        qty, rounded to 4 decimals), 'qty_total' and 'price_item_count'.

    Raises:
        KeyError: the number of distinct prices is missing from the lookup
            table.

    NOTE(review): a center that matches no price and no gap between two
    adjacent prices leaves its dict empty, which makes the final sort fail
    with KeyError; a center above the highest price would index past the end
    of the price list. Confirm neither can occur for real data.
    """
    total_qty = sum(this_row['qty'] for this_row in list_profile_qty_price)
    list_price = sorted({this_row['price'] for this_row in list_profile_qty_price})
    price_item_count = len(list_price)

    # read file every time for easy flow
    this_price_point_for_sum_qty = get_dict_price_item_count_to_price_point_for_sum_qty()[price_item_count]

    list_histogram_data = []
    for this_row in list_profile_qty_price:
        list_histogram_data.extend( [ float( this_row['price'] ) ] * int(this_row['qty']) )

    list_kmeans_cluster_centers = get_kmeans_cluster_center_by_cluster_count(list_histogram_data, cluster_count=3)

    def largest_window_qty(price_index, center_is_on_index):
        # Best qty sum over all candidate windows around price_index.
        list_of_list_index = get_index_based_on_price_point_for_sum_qty(
            center_index=price_index,
            total_length=len(list_price),
            price_point_for_sum_qty=this_price_point_for_sum_qty,
            is_current_index_center=center_is_on_index,
        )
        best_qty_sum = -99999
        for this_list_index in list_of_list_index:
            candidate_qty_sum = get_this_qty_sum(this_list_index, list_price, list_profile_qty_price)
            if candidate_qty_sum > best_qty_sum:
                best_qty_sum = candidate_qty_sum
        return best_qty_sum

    list_summary = []
    for this_center in list_kmeans_cluster_centers:
        dict_summary = {}

        for i, this_price in enumerate(list_price):
            if this_price == this_center:
                # The center sits exactly on a traded price.
                this_qty_sum = largest_window_qty(i, 1)
                reported_price = this_price
            elif this_price < this_center and this_center < list_price[i+1]:
                # The center falls between this price and the next one.
                this_qty_sum = largest_window_qty(i, 0)
                reported_price = this_center
            else:
                continue

            dict_summary['kmeans_cluster_center'] = this_center
            dict_summary['price'] = reported_price
            dict_summary['qty'] = this_qty_sum
            dict_summary['qty_ratio'] = round(float(this_qty_sum) / float(total_qty), 4)
            dict_summary['qty_total'] = total_qty
            dict_summary['price_item_count'] = price_item_count

        list_summary.append(dict_summary)

    return sorted(list_summary, key = lambda x: x['price'])
    

def query_single_stock_skew_stats_by_single_date( this_symbol, this_day ):
    """Plot one day's price/qty histogram and print its statistics.

    The histogram has one bin per profile row; the bin containing the open
    price is coloured green and the bin containing the close price red (red
    wins when both fall in the same bin). A fitted normal curve is drawn on
    top. Skewness, row count, total qty, the K-means centers for 1, 2 and 3
    clusters and the open/close prices are then printed.

    When the symbol has no daily summary or no profile rows for
    ``this_day`` a "no data" banner is printed and the function returns
    without plotting.

    Returns:
        None.
    """
    no_data_banner = '\n\n\n\n\n\n------- {} no data --------\n\n\n\n\n\n'.format(this_day)

    # get this_day's open/close prices; a missing summary row is treated
    # like missing profile data instead of leaking StopIteration
    list_daily_summary = query_single_stock_daily_summary( this_symbol )
    this_daily_summary = next((x for x in list_daily_summary if x['date'] == this_day), None)
    if this_daily_summary is None:
        print(no_data_banner)
        return
    open_price = this_daily_summary['open_price']
    close_price = this_daily_summary['close_price']

    # get this_day's skewness, total_price_item_count, total_qty_this_day
    skewness, total_price_item_count, total_qty_this_day = get_skewness_of_this_day(this_symbol, this_day)

    # get this_day's profile_qty_price
    list_data = get_list_Profile_qty_price_of_this_stock_of_this_day( this_symbol, this_day)
    if len(list_data) == 0:
        print(no_data_banner)
        return

    list_data = sorted( list_data, key = lambda x: float( x['price'] ) )

    list_histogram_data = []
    for item in list_data:
        list_histogram_data.extend( [ float( item['price'] ) ] * int(item['qty']) )

    # create histogram; ``density`` replaces the ``normed`` keyword that
    # current matplotlib releases no longer accept
    _, bins, patches = plt.hist(list_histogram_data, len(list_data), density=True)
    bins_list = bins.tolist()

    open_value = float(open_price)
    close_value = float(close_price)
    for this_patch, left_edge, right_edge in zip(patches, bins_list, bins_list[1:]):
        if left_edge <= open_value <= right_edge:
            this_patch.set_fc('g')
        # close is applied after open so red takes precedence in a shared bin
        if left_edge <= close_value <= right_edge:
            this_patch.set_fc('r')

    subplot_index = 1
    xt = plt.xticks()[0]
    xmin, xmax = min(xt), max(xt)
    lnspc = np.linspace(xmin, xmax, len(list_data))
    m, s = stats.norm.fit(list_histogram_data)
    pdf_g = stats.norm.pdf(lnspc, m, s)
    this_subplot = plt.subplot(1,1,subplot_index)
    # '{:2f}' only set a (no-op) minimum width; two decimals were intended
    this_subplot.set_title('{} ({:.2f})'.format(this_symbol , skewness))
    # create best fit curve
    plt.plot(lnspc, pdf_g, label="Norm")
    plt.show()

    # get cluster_center values by K-means for 1, 2 and 3 clusters
    x = np.array(list_histogram_data).reshape(len(list_histogram_data), 1)
    kmeans_1_string = get_string_of_kmeans_cluster_center(KMeans(n_clusters=1).fit(x).cluster_centers_)
    kmeans_2_string = get_string_of_kmeans_cluster_center(KMeans(n_clusters=2).fit(x).cluster_centers_)
    kmeans_3_string = get_string_of_kmeans_cluster_center(KMeans(n_clusters=3).fit(x).cluster_centers_)

    # print stats
    print('symbol                 : {} ({})'.format(this_symbol, this_day))
    print('skewness               : {:.4f}'.format(skewness))
    print('total_price_item_count : {}'.format(total_price_item_count))
    print('total_qty_count        : {}'.format(total_qty_this_day))
    print('kmeans_1               : {}'.format(kmeans_1_string))
    print('kmeans_2               : {}'.format(kmeans_2_string))
    print('kmeans_3               : {}'.format(kmeans_3_string))
    print('open_price             : {}'.format(open_price))
    print('close_price            : {}'.format(close_price))
    
    
def show_day_by_day_stats_of_this_symbol(this_symbol, list_date):
    """Show the single-day skew statistics of ``this_symbol`` for each date.

    Calls ``query_single_stock_skew_stats_by_single_date`` once per entry of
    ``list_date``, in order. Returns None.
    """
    for each_day in list_date:
        query_single_stock_skew_stats_by_single_date(this_symbol, each_day)
    
def query_single_stock_multiple_day_profile_qty_price( this_symbol, day_count ):
    """Build machine-learning rows from sliding windows of trading days.

    The symbol's profile rows are grouped into every run of ``day_count``
    consecutive dates (dates sorted numerically). For each window:

    1. the two dates that follow the window supply the labels: the
       'high_price' of the next date (future_buy_price) and the 'low_price'
       of the date after it (future_sell_price);
    2. quantities are summed per price over the window;
    3. ``get_qty_around_kmeans_cluster_centers`` summarises the summed
       profile around 3 cluster centers.

    Windows without two following dates, or whose following dates have no
    daily summary, produce no row. These checks run before the clustering so
    that no K-means fit is spent on a window that is then discarded.

    Args:
        this_symbol: stock symbol.
        day_count: number of consecutive dates per window (expected >= 1).

    Returns:
        List of dicts with keys 'idx' (placeholder -99999), 'symbol',
        'price_1', 'qty_ratio_1', 'price_2', 'qty_ratio_2', 'price_3',
        'qty_ratio_3', 'qty_total', 'price_item_count', 'future_buy_price'
        and 'future_sell_price'.
    """
    # get all this_symbol profile_qty_price
    list_all_daily_profile_qty_price = get_list_Profile_qty_price_of_this_stock( this_symbol )

    # Bucket the rows by date once so each window only visits its own rows
    # instead of rescanning the whole profile.
    dict_date_to_profiles = {}
    for this_profile in list_all_daily_profile_qty_price:
        dict_date_to_profiles.setdefault(this_profile['date'], []).append(this_profile)

    list_date = sorted(dict_date_to_profiles, key = lambda x : int(x))

    # create machine_learning csv file rows
    list_machine_learning_item = []

    for start_index in range(len(list_date)):
        index_last_date_of_this_date_group = start_index + day_count - 1

        # Two more dates are needed after the window (buy day and sell day).
        # Later windows end even later, so the scan can stop here.
        if index_last_date_of_this_date_group + 2 > len(list_date) - 1:
            break

        # this_date_group: ['20180523', '20180524', '20180525']
        this_date_group = list_date[start_index:start_index + day_count]

        # get_future_price_for_predict
        future_buy_date = list_date[index_last_date_of_this_date_group+1]
        future_buy_price = get_future_price_for_predict( this_symbol, future_buy_date , target_price='high_price')
        if 'thid_day_no_profile_daily_summary_data' == future_buy_price: continue

        future_sell_date = list_date[index_last_date_of_this_date_group+2]
        future_sell_price = get_future_price_for_predict( this_symbol, future_sell_date , target_price='low_price' )
        if 'thid_day_no_profile_daily_summary_data' == future_sell_price: continue

        # Sum qty per price over every date of the window.
        date_string = '_'.join(this_date_group)
        dict_price_to_item = {}
        for this_date in this_date_group:
            for this_profile in dict_date_to_profiles[this_date]:
                this_price = this_profile['price']
                if this_price in dict_price_to_item:
                    dict_price_to_item[this_price]['qty'] = dict_price_to_item[this_price]['qty'] + this_profile['qty']
                else:
                    dict_price_to_item[this_price] = {
                        'sequence': start_index + 1,
                        'symbol': this_symbol,
                        'date_string': date_string,
                        'qty': this_profile['qty'],
                        'price': this_price,
                    }

        list_sum_of_qty_price_of_day_count_by_group_of_date = sorted(dict_price_to_item.values(), key = lambda x : x['price'])

        # get kmeans cluster centers and the qty summed around them
        list_of_dict_kmeans_cluster_center_to_qty_ratio = get_qty_around_kmeans_cluster_centers(list_sum_of_qty_price_of_day_count_by_group_of_date)

        # create machine_learning item
        dict_temp = {}

        dict_temp['idx'] = -99999
        dict_temp['symbol'] = this_symbol
        dict_temp['price_1'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[0]['price']
        dict_temp['qty_ratio_1'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[0]['qty_ratio']
        dict_temp['price_2'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[1]['price']
        dict_temp['qty_ratio_2'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[1]['qty_ratio']
        dict_temp['price_3'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[2]['price']
        dict_temp['qty_ratio_3'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[2]['qty_ratio']
        dict_temp['qty_total'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[0]['qty_total']
        dict_temp['price_item_count'] = list_of_dict_kmeans_cluster_center_to_qty_ratio[0]['price_item_count']

        dict_temp['future_buy_price'] = future_buy_price
        dict_temp['future_sell_price'] = future_sell_price

        list_machine_learning_item.append(dict_temp)

    return list_machine_learning_item

def write_all_stock_multiple_day_profile_qty_price(day_count=3, min_price_item_count=18):
    """Collect machine-learning rows for every symbol and write them to one CSV.

    For each symbol returned by get_list_symbol_from_outputs(), the rows
    produced by query_single_stock_multiple_day_profile_qty_price() are
    gathered into a single list.  Rows whose 'price_item_count' is at least
    `min_price_item_count` are kept, numbered consecutively from 0 in their
    'idx' field (the row dicts are updated in place), and written to
    ./outputs/stats/profile_qty_price_multiple_day/
    profile_qty_price_multiple_day_for_machine_learning.csv.

    Progress, the total/available row counts and the elapsed wall-clock time
    are printed to stdout.

    Parameters:
        day_count: number of days per group, passed through to
            query_single_stock_multiple_day_profile_qty_price().
        min_price_item_count: minimum 'price_item_count' a row needs in
            order to be written to the CSV.

    Returns:
        None
    """
    list_symbol = get_list_symbol_from_outputs()

    list_machine_learning_item_of_all_stock = []
    start = time.time()
    for this_symbol in list_symbol:
        print('processing ' + this_symbol)

        # multiple rows of every group of days by day_count
        list_machine_learning_item_of_all_stock.extend(
            query_single_stock_multiple_day_profile_qty_price(this_symbol, day_count)
        )

    # Keep only the rows with enough distinct price items, in original order.
    list_available_data_for_machine_learning = [
        this_data
        for this_data in list_machine_learning_item_of_all_stock
        if this_data['price_item_count'] >= min_price_item_count
    ]

    # Replace the placeholder idx with a running index over the kept rows.
    for idx, this_data in enumerate(list_available_data_for_machine_learning):
        this_data['idx'] = idx

    output_dir = './outputs/stats/profile_qty_price_multiple_day'
    output_file_path = output_dir + '/' + 'profile_qty_price_multiple_day_for_machine_learning.csv'
    list_header = ['idx', 'symbol', 'price_1', 'qty_ratio_1', 'price_2', 'qty_ratio_2', 'price_3', 'qty_ratio_3', 'qty_total', 'price_item_count', 'future_buy_price', 'future_sell_price']
    write_dict_to_csv( list_available_data_for_machine_learning, list_header, output_file_path )

    # The number of kept rows is the available-row count; no second pass needed.
    count_for_available_data = len(list_available_data_for_machine_learning)

    print('\nTotal rows:' + str(len(list_machine_learning_item_of_all_stock)))
    print('Available rows: ' + str(count_for_available_data))

    end = time.time()
    print('\n----- Run time ------')
    print(end - start)
    
if __name__ == '__main__':
    # Scratch driver with three experiments.  Only test-3 actually runs: the
    # sys.exit() right after it raises SystemExit, so test-2 and test-1 below
    # are unreachable until that call is removed or commented out.

    # test-3 --------------------------------------------------------------
    # Build the machine-learning CSV for every symbol.
    write_all_stock_multiple_day_profile_qty_price()
    
    #this_symbol = '3062'
    #day_count = 3
    #query_single_stock_multiple_day_profile_qty_price(this_symbol, day_count)
    sys.exit()
    
    
    # test-2 --------------------------------------------------------------
    # Day-by-day stats of one symbol over a fixed list of dates.
    this_symbol = '3062'
    list_date = ['20180523', '20180524', '20180525', '20180528', '20180529', '20180530', '20180531']
    show_day_by_day_stats_of_this_symbol(this_symbol, list_date)
    
    sys.exit()
    
    
    # test-1 --------------------------------------------------------------
    # Write the skewness file for one day, then inspect selected stocks.
    this_day = '20180523'
    list_skewness = write_skewness_file_of_this_day(this_day)
    
    # Keep stocks with more than 20 price items and a total qty above 500.
    list_chosen_stocks = []
    for this_item in list_skewness:
        if this_item['price_item_count'] > 20 and this_item['total_qty'] > 500:
            list_chosen_stocks.append(this_item)
        #if this_item['symbol'] == '3062':
        #    list_chosen_stocks.append(this_item)
            
    
    # Ascending order of skewness.
    list_chosen_stocks = sorted(list_chosen_stocks, key=lambda x: x['skewness'])
    
    subplot_index = 1
    #plt.figure(figsize=(20,200))
    
    for this_stock in list_chosen_stocks:
        #if subplot_index == 2:
        #    break
            
        this_symbol = this_stock['symbol']
        query_single_stock_skew_stats_by_single_date(this_symbol, this_day)
        # The `continue` skips the histogram / KMeans section below for every
        # stock; remove it to re-enable the plots.
        continue
        
        
        this_symbol = this_stock['symbol']
        print(this_symbol)
        
        skewness = get_skewness_of_this_day(this_symbol, this_day)[0]

        list_data = get_list_Profile_qty_price_of_this_stock_of_this_day( this_symbol, this_day)
        
        list_data = sorted( list_data, key = lambda x: float( x['price'] ) )

        # Expand (price, qty) rows into one sample per unit of qty so the
        # histogram and the fits below are weighted by traded quantity.
        list_histogram_data = []
        for item in list_data:
            list_histogram_data.extend( [ float( item['price'] ) ] * int(item['qty']) ) 
        
        # 1-D KMeans on the price samples with 1, 2 and 3 clusters.
        n = len(list_histogram_data)
        x = np.array(list_histogram_data)
        kmeans_1 = KMeans(n_clusters=1).fit(x.reshape(n,1))        
        kmeans_2 = KMeans(n_clusters=2).fit(x.reshape(n,1))
        kmeans_3 = KMeans(n_clusters=3).fit(x.reshape(n,1))
        print(kmeans_1.cluster_centers_)
        print(kmeans_2.cluster_centers_)
        print(kmeans_3.cluster_centers_)
        
        #this_subplot = plt.subplot(100,1,subplot_index)
        #this_subplot.set_title(skewness)
            
        #plt.title( '{} ({:2f})'.format(this_symbol , skewness) )
        # `normed` was deprecated in Matplotlib 2.1 and removed in 3.1;
        # `density=True` is its replacement and normalizes the histogram so
        # it can be overlaid with the fitted normal pdf.
        plt.hist(list_histogram_data, len(list_data), density=True)
        #plt.hist(list_histogram_data, len(list_data))
        #plt.hist(list_histogram_data, density=True)
        # Evaluate the fitted normal pdf across the current x-tick range.
        xt = plt.xticks()[0]
        xmin, xmax = min(xt), max(xt)
        lnspc = np.linspace(xmin, xmax, len(list_data))
        m, s = stats.norm.fit(list_histogram_data)
        pdf_g = stats.norm.pdf(lnspc, m, s)
        this_subplot = plt.subplot(1,1,subplot_index)
        this_subplot.set_title('{} ({:2f})'.format(this_symbol , skewness))
        plt.plot(lnspc, pdf_g, label="Norm")
        #subplot_index = subplot_index + 1

        #plt.savefig(this_day + '.png', bbox_inches='tight')
        plt.show()
        
    


processing 1101
processing 1102
processing 1103
processing 1104
processing 1108
processing 1109
processing 1110
processing 1201
processing 1203
processing 1210
processing 1213
processing 1215
processing 1216
processing 1217
processing 1218
processing 1219
processing 1220
processing 1225
processing 1227
processing 1229
processing 1231
processing 1232
processing 1233
processing 1234
processing 1235
processing 1236
processing 1256
processing 1258
processing 1259
processing 1262
processing 1264
processing 1268
processing 1301
processing 1303
processing 1304
processing 1305
processing 1307
processing 1308
processing 1309
processing 1310
processing 1312
processing 1313
processing 1314
processing 1315
processing 1316
processing 1319
processing 1321
processing 1323
processing 1324
processing 1325
processing 1326
processing 1333
processing 1336
processing 1337
processing 1338
processing 1339
processing 1340
processing 1402
processing 1409
processing 1410
processing 1413
processing 1414
processi

processing 2745
processing 2748
processing 2801
processing 2809
processing 2812
processing 2816
processing 2820
processing 2823
processing 2832
processing 2834
processing 2836
processing 2838
processing 2841
processing 2845
processing 2849
processing 2850
processing 2851
processing 2852
processing 2855
processing 2856
processing 2867
processing 2880
processing 2881
processing 2882
processing 2883
processing 2884
processing 2885
processing 2886
processing 2887
processing 2888
processing 2889
processing 2890
processing 2891
processing 2892
processing 2897
processing 2901
processing 2903
processing 2904
processing 2905
processing 2906
processing 2908
processing 2910
processing 2911
processing 2912
processing 2913
processing 2915
processing 2916
processing 2923
processing 2924
processing 2926
processing 2928
processing 2929
processing 2936
processing 2937
processing 2939
processing 3002
processing 3003
processing 3004
processing 3005
processing 3006
processing 3008
processing 3010
processi

processing 4934
processing 4935
processing 4938
processing 4939
processing 4942
processing 4943
processing 4944
processing 4946
processing 4947
processing 4950
processing 4952
processing 4953
processing 4956
processing 4958
processing 4960
processing 4965
processing 4966
processing 4968
processing 4971
processing 4972
processing 4973
processing 4974
processing 4976
processing 4977
processing 4979
processing 4984
processing 4987
processing 4991
processing 4994
processing 4995
processing 4999
processing 5007
processing 5009
processing 5011
processing 5013
processing 5014
processing 5015
processing 5016
processing 5102
processing 5201
processing 5202
processing 5203
processing 5205
processing 5206
processing 5209
processing 5210
processing 5211
processing 5212
processing 5213
processing 5215
processing 5220
processing 5225
processing 5227
processing 5230
processing 5234
processing 5243
processing 5245
processing 5251
processing 5255
processing 5258
processing 5259
processing 5263
processi

processing 8277
processing 8279
processing 8287
processing 8289
processing 8291
processing 8299
processing 8341
processing 8342
processing 8349
processing 8354
processing 8358
processing 8374
processing 8383
processing 8390
processing 8401
processing 8403
processing 8404
processing 8406
processing 8409
processing 8410
processing 8411
processing 8415
processing 8416
processing 8418
processing 8420
processing 8421
processing 8422
processing 8423
processing 8424
processing 8426
processing 8427
processing 8429
processing 8431
processing 8432
processing 8433
processing 8435
processing 8436
processing 8437
processing 8440
processing 8442
processing 8443
processing 8444
processing 8446
processing 8450
processing 8454
processing 8455
processing 8462
processing 8463
processing 8464
processing 8466
processing 8467
processing 8472
processing 8473
processing 8476
processing 8477
processing 8478
processing 8480
processing 8481
processing 8488
processing 8489
processing 8497
processing 8499
processi

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
