In [1]:
%load_ext autoreload
%autoreload 2

import datetime
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm
from tqdm.contrib import tzip, tenumerate, tmap

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import col
import pyspark.sql.types as pstype
import pyspark.sql.functions as F
import pyspark as ps

import matplotlib as mlt
import matplotlib.pyplot as plt
import japanize_matplotlib

from time_series_model import *

%matplotlib inline
%matplotlib ipympl

In [2]:
# Print NumPy arrays (almost) in full, with 4 decimals, on very wide lines.
np.set_printoptions(threshold=100000, precision=4, linewidth=10000)

# Spark configuration for this notebook. Every key is set exactly once; the
# previous version set "spark.logConf" twice with the same value.
# NOTE(review): the run log shows a GarbageCollectionMetrics warning that
# "G1 Concurrent GC" is not listed in either gcMetrics setting - consider
# adding it to one of the two lists below if that warning matters.
ps_conf = (
    ps.SparkConf()
    .set("spark.logConf", "false")
    .set("spark.executor.memory", "12g")
    .set("spark.driver.memory", "4g")
    .set("spark.executor.cores", "7")
    .set("spark.sql.shuffle.partitions", "500")
    # Executors run with the G1 collector; the two gcMetrics keys name the
    # G1 collectors for the event-log GC metrics.
    .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:+UseStringDeduplication")
    .set("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors", "G1 Young Generation")
    .set("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors", "G1 Old Generation")
)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = SparkSession.builder.config(conf=ps_conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/02 11:52:40 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Directory and date stamp shared by the CSV files this notebook reads/writes.
SPECIFIED_PATH = "csv_data/"
SPECIFIED_DATE = "20241120"
# Common file-name prefix: directory followed directly by the date stamp.
SPECIFIED_CSV = f"{SPECIFIED_PATH}{SPECIFIED_DATE}"

In [4]:
# Raw input CSV for the configured date.
input_path = f"{SPECIFIED_CSV}_raw_data_sumida_bunka.csv"

# Read the CSV with a header row and an inferred schema, and keep the frame
# persisted because it is scanned more than once (here and in the aggregation cell).
df_raw_data = (
    spark.read
    .option("inferSchema", "True")
    .option("header", "True")
    .csv(input_path)
)
df_raw_data.persist(StorageLevel.MEMORY_AND_DISK_DESER)

# Sorted list of the distinct unit_id values present in the raw data.
utid_list = sorted(
    row[0] for row in df_raw_data.select("unit_id").drop_duplicates().collect()
)

24/12/02 11:52:53 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [5]:
# Minute bucket of each record, rendered as a string by from_unixtime.
# NOTE(review): after date_trunc('minute', ...) the epoch value should already
# be a multiple of 60, so the modulo step looks redundant - confirm before removing.
minute_epoch = F.unix_timestamp(F.date_trunc('minute', 'datetime'))
minute_label = F.from_unixtime(minute_epoch - (minute_epoch % 60))

# Count distinct (date, minute, unit_id, aibeaconid) combinations per
# (date, minute, unit_id) among rows with randomized == 1, then collect the
# result to pandas and write it out as the 1-minute CSV.
(
    df_raw_data
    .filter(col("unit_id").isin(utid_list))
    .filter(col("randomized") == 1)
    .select('date', minute_label.alias('minute'), 'unit_id', 'aibeaconid')
    .drop_duplicates()
    .groupBy('date', 'minute', 'unit_id')
    .agg(F.count('*').alias('1min_count'))
    .orderBy('date', 'minute', 'unit_id')
    .toPandas()
    .to_csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv", header=True, index=False)
)

24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/12/02 11:53:13 WARN RowBasedKeyValueBatch: Calling spill() on

In [6]:
# Target day: the day before SPECIFIED_DATE, formatted as YYYY-MM-DD.
# The +9h offset is kept from the original; since strptime yields midnight it
# does not change the formatted date.
SPECIFIED_PAST = (
    datetime.datetime.strptime(SPECIFIED_DATE, '%Y%m%d')
    - relativedelta(days=1)
    + datetime.timedelta(hours=9)
).strftime('%Y-%m-%d')

start_time = f'{SPECIFIED_PAST} 00:00:00'
end_time = f'{SPECIFIED_PAST} 23:59:00'

# Build the 1-minute time axis for the target day ...
time_range = pd.date_range(start=start_time, end=end_time, freq='min')
# ... and turn it into a Spark DataFrame with an hour-of-day column.
df_minutes = (
    spark.createDataFrame(pd.DataFrame(time_range, columns=['datetime']))
    .withColumn('hour', F.hour(col('datetime')))
)

# Per-minute counts written by the aggregation cell.
df_by1min = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv")
)

# Reshape long -> wide with a single pivot (one column per unit id, in
# utid_list order) instead of one left join per unit id. The chained joins
# produced a query plan large enough for Spark to truncate its string form.
# Assumes at most one row per (minute, unit_id) in the CSV - TODO confirm;
# F.first simply picks that row's count.
df_counts_wide = (
    df_by1min
    .withColumnRenamed('minute', 'datetime')
    .groupBy('datetime')
    .pivot('unit_id', utid_list)
    .agg(F.first('1min_count'))
)

# Attach the counts to every minute of the day; minutes/units without data
# come out of the left join as null and are replaced by 0.
df_time = (
    df_minutes
    .join(df_counts_wide, on='datetime', how='left')
    .fillna(0)
    .orderBy('datetime')
)
df_time.persist(StorageLevel.MEMORY_AND_DISK_DESER)
df_time.show()

24/12/02 11:53:24 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------------------+----+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|           datetime|hour|1002A25B|1002A25C|1002A25D|1002A5BA|1002A5BB|1002A5BD|1002A5BE|1002A5BF|1002A5C1|1002A5C3|1002A5C4|1002A5C5|1002A5C6|1002A5C7|1002A5C9|1002A5CA|1002A5CB|1002A5CC|1002A5CE|1002A5CF|1002A5D0|1002A5D1|1002A6A5|1002A6A7|1002A6A8|1002A6AF|1002A6B0|1002A6B2|1002A6B3|1002A6B6|1002A6B9|1002A6BA|1002A6BC|1002A6BD|1002A6BE|10

                                                                                

In [7]:
# Hour of day (value of the 'hour' column) to extract.
SPECIFIED_HOUR = 14

# Collect the minute table to pandas, keep only the rows of the chosen hour
# and only the per-unit columns (in utid_list order).
minute_table = df_time.toPandas()
pd_data = minute_table.loc[minute_table['hour'] == SPECIFIED_HOUR, utid_list]
pd_data

                                                                                

Unnamed: 0,1002A25B,1002A25C,1002A25D,1002A5BA,1002A5BB,1002A5BD,1002A5BE,1002A5BF,1002A5C1,1002A5C3,...,1002ABE5,1002ABE8,1002ABF2,1002ABF3,1002ACA9,1002ACAA,1002ACB0,1002ACB2,1002ACB8,1002ACB9
840,27,29,41,31,30,36,1,18,24,0,...,41,49,5,42,57,21,67,2,11,8
841,19,18,53,25,23,36,3,18,24,3,...,45,52,5,32,49,17,64,6,20,2
842,10,14,52,18,15,13,1,14,12,0,...,56,67,14,48,65,10,71,8,15,12
843,7,23,72,27,13,20,3,15,8,0,...,52,43,3,45,56,18,66,0,21,0
844,6,24,65,27,15,27,2,11,10,0,...,61,72,5,51,80,14,81,3,28,1
845,5,16,57,22,15,22,2,14,5,0,...,48,59,5,43,68,10,73,2,13,2
846,9,20,42,20,16,21,2,14,16,0,...,67,91,7,68,97,14,84,7,15,5
847,11,18,41,24,19,19,5,14,18,0,...,71,105,6,71,101,17,87,7,13,5
848,5,17,64,23,11,19,3,7,10,0,...,73,84,6,64,85,13,89,5,16,3
849,11,24,48,30,13,21,4,13,10,2,...,79,65,5,61,75,8,96,5,18,2


In [8]:
# Nested Python list: one inner list per row of pd_data, one entry per column.
test_data = pd_data.to_numpy().tolist()
test_data

[[27,
  29,
  41,
  31,
  30,
  36,
  1,
  18,
  24,
  0,
  37,
  59,
  10,
  12,
  21,
  4,
  59,
  55,
  31,
  52,
  50,
  30,
  51,
  16,
  4,
  44,
  12,
  39,
  51,
  46,
  10,
  3,
  37,
  3,
  33,
  26,
  40,
  47,
  0,
  34,
  42,
  40,
  56,
  38,
  18,
  16,
  49,
  66,
  76,
  24,
  5,
  46,
  0,
  41,
  7,
  55,
  76,
  33,
  22,
  40,
  41,
  49,
  5,
  42,
  57,
  21,
  67,
  2,
  11,
  8],
 [19,
  18,
  53,
  25,
  23,
  36,
  3,
  18,
  24,
  3,
  28,
  52,
  22,
  15,
  20,
  6,
  62,
  58,
  22,
  34,
  40,
  27,
  45,
  24,
  6,
  42,
  15,
  45,
  47,
  42,
  11,
  6,
  33,
  3,
  33,
  35,
  23,
  59,
  0,
  36,
  39,
  25,
  75,
  41,
  23,
  22,
  55,
  69,
  65,
  17,
  9,
  51,
  0,
  44,
  7,
  45,
  71,
  14,
  12,
  36,
  45,
  52,
  5,
  32,
  49,
  17,
  64,
  6,
  20,
  2],
 [10,
  14,
  52,
  18,
  15,
  13,
  1,
  14,
  12,
  0,
  22,
  59,
  16,
  5,
  14,
  15,
  75,
  53,
  25,
  34,
  48,
  12,
  70,
  22,
  16,
  59,
  24,
  55,
  54,
  44,
  12,
 

In [9]:
# Fit Sparse_Vector_Auto_Regressive on test_data with the "external library"
# solver, then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="external library", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

平均二乗誤差(MSE): 8.7165827878384
L2正則化項(l2 norm): 45.53096765209321
L1正則化項(l1 norm): 428.0689097535977
目的関数(Objective):  7333.196876965268
目的関数(Objective)の微分:  57688.85581032584


np.float64(8.716582787838396)

In [10]:
# Fit Sparse_Vector_Auto_Regressive on test_data with the "coordinate descent"
# solver, then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="coordinate descent", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = test_data[lag:]
mean = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:4.8427892022506995  ΔDiff:16.468035563662085
ite:2  mse:0.5440381016150824  ΔDiff:5.519613545389261
ite:3  mse:0.09906841060066185  ΔDiff:2.3553834069291275
ite:4  mse:0.05012491751884644  ΔDiff:1.675409019032487
ite:5  mse:0.03253594891667843  ΔDiff:1.3498196691906634
ite:6  mse:0.02403786187994248  ΔDiff:1.1602242306023345
ite:7  mse:0.018445352500201708  ΔDiff:1.0163364305245068
ite:8  mse:0.014427794838862343  ΔDiff:0.8988640113923191
ite:9  mse:0.011130368853146078  ΔDiff:0.7894939238374038
ite:10  mse:0.00878687888551606  ΔDiff:0.7014736043422442
ite:11  mse:0.007032349154252815  ΔDiff:0.6275440642999961
ite:12  mse:0.005720768674338858  ΔDiff:0.5660062241380178
ite:13  mse:0.00472322440374147  ΔDiff:0.5142961856843995
ite:14  mse:0.003970244935467763  ΔDiff:0.4715227633807245
ite:15  mse:0.003372791214178807  ΔDiff:0.43459901978031795
ite:16  mse:0.002901436713945916  ΔDiff:0.4030886453138705
ite:17  mse:0.002496164911207304  ΔDiff:0.37387863676279903
ite:18  mse:0.00

np.float64(8.716582787650665)

In [11]:
# Fit Sparse_Vector_Auto_Regressive on test_data with the "ISTA" solver, then
# score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="ISTA", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:2975544229.1422915  update_diff:1863.9803650077033 diff:2975544229.1422915
ite:1001  mse:5755.054416296263  update_diff:6.957875112423519e-06 diff:19.46996311867224
ite:2001  mse:1349.6828351417007  update_diff:5.456628104968389e-07 diff:0.4479803222602641
ite:3001  mse:1160.6430091289042  update_diff:4.2112642860079075e-07 diff:0.09112923353063707
ite:4001  mse:1086.2443707119028  update_diff:4.3025321791179433e-07 diff:0.06543424774713458
ite:5001  mse:1024.5611054667165  update_diff:4.4644233311398333e-07 diff:0.05540788636449179
ite:6001  mse:970.3686350868629  update_diff:4.7117975489955394e-07 diff:0.05135577622831988
ite:7001  mse:922.2120114453634  update_diff:4.984210834648855e-07 diff:0.04606682834992171
ite:8001  mse:878.47538434438  update_diff:5.267238011330368e-07 diff:0.04118790983136478
ite:9001  mse:838.1831376563154  update_diff:5.533837978920393e-07 diff:0.038260087250932884
ite:10001  mse:804.0617976809017  update_diff:5.868100347975531e-07 diff:0.0316551

np.float64(8.716582811714762)

In [12]:
# Fit Sparse_Vector_Auto_Regressive on test_data with the "FISTA" solver, then
# score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="FISTA", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:2924204826.451988  update_diff:1830.909653239246 diff:2924204826.451988
ite:1001  mse:503.18985687173085  update_diff:1.3461320534465327e-06 diff:0.6035451141384556
ite:2001  mse:492.839762269193  update_diff:1.3148113048575506e-06 diff:0.028017889405418828
ite:3001  mse:488.3896445740719  update_diff:1.2742463986719574e-06 diff:0.009673019895330981
ite:4001  mse:488.3983312475249  update_diff:1.274264660291798e-06 diff:0.008682301128374093
ite:5001  mse:488.25993817874144  update_diff:1.2741778557315051e-06 diff:0.0014341688984131906
ite:6001  mse:488.1860761248608  update_diff:1.2741669694373963e-06 diff:0.0005639643281369899
ite:7001  mse:488.1244293984606  update_diff:1.2741261501819096e-06 diff:0.0008297323734041129
ite:8001  mse:488.13250858461424  update_diff:1.2741445128024097e-06 diff:2.0177755686745513e-05
ite:9001  mse:488.1414686169868  update_diff:1.2741714914011856e-06 diff:0.00012467704800656065
ite:10001  mse:488.13548848727885  update_diff:1.2741588418000005

np.float64(8.716583533344474)

In [13]:
# NOTE: this rebinds the SPECIFIED_* names set near the top of the notebook,
# so cells above will pick up these values if they are re-run afterwards.
SPECIFIED_PATH = "csv_data/"
SPECIFIED_DATE = "2024-11-13"
SPECIFIED_CSV = f"{SPECIFIED_PATH}{SPECIFIED_DATE}"

# Read the time-series CSV through Spark (header row, inferred schema) and
# collect it into a pandas DataFrame.
pd_test_data = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv(SPECIFIED_CSV + "_time_series_data.csv")
    .toPandas()
)

In [14]:
# Column labels ordered by descending column mean.
mean_sorted_list = pd_test_data.mean().sort_values(ascending=False).index

# Reorder the columns, replace missing values with 0 and convert to a nested
# Python list (one inner list per row).
# NOTE: pd_test_data goes from DataFrame to list here, so this cell cannot be
# re-run without re-running the cell that loads pd_test_data first.
pd_test_data = (
    pd_test_data[mean_sorted_list]
    .fillna(0)
    .reset_index(drop=True)
    .to_numpy()
    .tolist()
)

In [None]:
# Fit Sparse_Vector_Auto_Regressive on pd_test_data with the "external library"
# solver, then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(pd_test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="external library", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(pd_test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = pd_test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

  model = cd_fast.enet_coordinate_descent(


平均二乗誤差(MSE): 134.83706015819152
L2正則化項(l2 norm): 99.04386846809814
L1正則化項(l1 norm): 746.3530131736179
目的関数(Objective):  86855.32647117059
目的関数(Objective)の微分:  209509.5613959126


np.float64(134.83706015819152)

In [16]:
# Fit Sparse_Vector_Auto_Regressive on pd_test_data with the "coordinate descent"
# solver, then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(pd_test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="coordinate descent", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(pd_test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = pd_test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

np.float64(134.83706019199326)

In [17]:
# Fit Sparse_Vector_Auto_Regressive on pd_test_data with the "ISTA" solver,
# then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(pd_test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="ISTA", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(pd_test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = pd_test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:134267750119.4437  update_diff:2477.2807860390367 diff:134267750119.4437
ite:1001  mse:6393985.184108518  update_diff:6.496203359582224e-05 diff:6879.647358855233
ite:2001  mse:2911579.563194098  update_diff:1.6496254078895753e-05 diff:1769.1480012773536
ite:3001  mse:1739410.4801353717  update_diff:7.104511251744618e-06 diff:768.8898663097061
ite:4001  mse:1176205.0701694195  update_diff:3.7528100513943163e-06 diff:409.1081196705345
ite:5001  mse:858989.8691637409  update_diff:2.23108074763559e-06 diff:244.5366285641212
ite:6001  mse:661780.4550798435  update_diff:1.439781291321386e-06 diff:158.41870466340333
ite:7001  mse:530171.3423316039  update_diff:9.887419312114117e-07 diff:109.06191045045853
ite:8001  mse:437473.1270104967  update_diff:7.131985142870871e-07 diff:78.68285178131191
ite:9001  mse:369331.95539103914  update_diff:5.352001359816574e-07 diff:58.93179521354614
ite:10001  mse:317527.3885333212  update_diff:4.1481173877893886e-07 diff:45.50012672372395
ite:110

np.float64(135.7329963049099)

In [18]:
# Fit Sparse_Vector_Auto_Regressive on pd_test_data with the "FISTA" solver,
# then score its predictions on the same series.
model = Sparse_Vector_Auto_Regressive(pd_test_data, norm_α=2, l1_ratio=0.1, tol=1e-10, max_iterate=300000, isCentralization=True)

lag = 4
model.fit(lags=lag, solver="FISTA", visible_flg=True)

# Convert the nested list once; the original rebuilt this array on every
# iteration of the comprehension below.
series_arr = np.array(pd_test_data)
# Row for step t: the `lag` preceding observations, newest first, flattened.
x_tmp = np.array([series_arr[t - lag:t][::-1].ravel() for t in range(lag, len(series_arr))])
y_tmp = pd_test_data[lag:]
mean  = model.predict(x_tmp)

# Sum of squared errors divided by the number of predicted time steps.
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:135482650274.19376  update_diff:2496.737529241869 diff:135482650274.19376
ite:1001  mse:42179.87656695887  update_diff:5.9203431934427047e-08 diff:5.800483959559642
ite:2001  mse:38412.43755868397  update_diff:4.67986601883802e-08 diff:0.252729063933657
ite:3001  mse:38437.21993559502  update_diff:4.694258536326051e-08 diff:0.3442043146278593
ite:4001  mse:38339.97355916105  update_diff:4.5239609543613435e-08 diff:0.5482609357204637
ite:5001  mse:38352.73699277368  update_diff:4.409151770590381e-08 diff:0.01805995497124968
ite:6001  mse:38331.074772820444  update_diff:4.3204844105726724e-08 diff:0.046688057722349185
ite:7001  mse:38303.04345519474  update_diff:4.297216648046054e-08 diff:0.20609971339581534
ite:8001  mse:38331.154867800746  update_diff:4.280947233939951e-08 diff:0.022990330835455097
ite:9001  mse:38305.355451681964  update_diff:4.2632372962570604e-08 diff:0.030430616017838474
ite:10001  mse:38315.89812893527  update_diff:4.2618838815648716e-08 diff:0.03012249

np.float64(134.83705169256615)