In [1]:
%load_ext autoreload
%autoreload 2

import datetime
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm
from tqdm.contrib import tzip, tenumerate, tmap

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import col
import pyspark.sql.types as pstype
import pyspark.sql.functions as F
import pyspark as ps

import matplotlib as mlt
import matplotlib.pyplot as plt
import japanize_matplotlib

from time_series_model import *

%matplotlib inline
%matplotlib ipympl

In [2]:
np.set_printoptions(threshold=100000, precision=4, linewidth=10000)
ps_conf = ps.SparkConf().set("spark.logConf", "false")\
            .set("spark.executor.memory", "12g")\
            .set("spark.driver.memory", "4g")\
            .set("spark.executor.cores", "7")\
            .set("spark.sql.shuffle.partitions", "500")\
            .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:+UseStringDeduplication")\
            .set("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors", "G1 Young Generation")\
            .set("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors", "G1 Old Generation")\
			.set("spark.logConf", "false")
spark = SparkSession.builder.config(conf=ps_conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/26 09:49:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
SPECIFIED_PATH = "csv_data/"
SPECIFIED_DATE = "20241120"
SPECIFIED_CSV  = SPECIFIED_PATH + SPECIFIED_DATE

In [4]:
input_path  = SPECIFIED_CSV + "_raw_data_sumida_bunka.csv"
df_raw_data = spark.read.option("inferSchema", "True").option("header", "True").csv(input_path)
df_raw_data.persist(StorageLevel.MEMORY_AND_DISK_DESER)

utid_list = sorted(df_raw_data.select("unit_id").drop_duplicates().rdd.flatMap(lambda x: x).collect())

24/11/26 09:49:20 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

In [5]:
df_raw_data\
		.filter(col("unit_id").isin(utid_list))\
		.filter(col("randomized") == 1)\
		.select(['date', 'datetime', 'unit_id', 'aibeaconid'])\
        .withColumn('minute', F.date_trunc('minute', 'datetime'))\
        .withColumn('minute', F.unix_timestamp('minute'))\
        .withColumn('minute', col('minute') - (col('minute') % 60))\
        .withColumn('minute', F.from_unixtime('minute'))\
        .select(['date', 'minute', 'unit_id', 'aibeaconid'])\
        .drop_duplicates()\
        .groupBy(['date', 'minute', 'unit_id'])\
        .agg(F.count('*').alias('1min_count'))\
        .orderBy(['date', 'minute', 'unit_id'])\
        .toPandas()\
        .to_csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv", header=True, index=False)

24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/26 09:49:44 WARN RowBasedKeyValueBatch: Calling spill() on

In [6]:
SPECIFIED_PAST = (datetime.datetime.strptime(SPECIFIED_DATE, '%Y%m%d') - relativedelta(days=1) + datetime.timedelta(hours=9)).strftime('%Y-%m-%d')

start_time = f'{SPECIFIED_PAST} 00:00:00'
end_time = f'{SPECIFIED_PAST} 23:59:00'
# 1分単位の時間列を作成
time_range = pd.date_range(start=start_time, end=end_time, freq='min')
# DataFrameに変換
df_time = pd.DataFrame(time_range, columns=['datetime'])
df_time = spark.createDataFrame(df_time)\
				.withColumn('hour', F.hour(col('datetime')))

df_by1min = spark.read\
				.option('header', True)\
				.option('inferSchema', True)\
           		.csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv")

for unit_id in utid_list:
    df_tmp  = df_by1min\
        		.filter(col('unit_id') == unit_id)\
          		.select(['minute', '1min_count'])\
            	.withColumnRenamed('minute',     'datetime')\
                .withColumnRenamed('1min_count', unit_id)
    df_time = df_time.join(df_tmp, on='datetime', how='left')
df_time = df_time\
			.fillna(0)\
    		.orderBy('datetime')
df_time.persist(StorageLevel.MEMORY_AND_DISK_DESER)
df_time.show()

24/11/26 09:49:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------------------+----+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|           datetime|hour|1002A25B|1002A25C|1002A25D|1002A5BA|1002A5BB|1002A5BD|1002A5BE|1002A5BF|1002A5C1|1002A5C3|1002A5C4|1002A5C5|1002A5C6|1002A5C7|1002A5C9|1002A5CA|1002A5CB|1002A5CC|1002A5CE|1002A5CF|1002A5D0|1002A5D1|1002A6A5|1002A6A7|1002A6A8|1002A6AF|1002A6B0|1002A6B2|1002A6B3|1002A6B6|1002A6B9|1002A6BA|1002A6BC|1002A6BD|1002A6BE|10

In [42]:
SPECIFIED_HOUR = 14

pd_data = df_time.toPandas()
pd_data = pd_data[pd_data['hour'] == SPECIFIED_HOUR]
pd_data = pd_data[utid_list]
pd_data

                                                                                

Unnamed: 0,1002A25B,1002A25C,1002A25D,1002A5BA,1002A5BB,1002A5BD,1002A5BE,1002A5BF,1002A5C1,1002A5C3,...,1002ABE5,1002ABE8,1002ABF2,1002ABF3,1002ACA9,1002ACAA,1002ACB0,1002ACB2,1002ACB8,1002ACB9
840,27,29,41,31,30,36,1,18,24,0,...,41,49,5,42,57,21,67,2,11,8
841,19,18,53,25,23,36,3,18,24,3,...,45,52,5,32,49,17,64,6,20,2
842,10,14,52,18,15,13,1,14,12,0,...,56,67,14,48,65,10,71,8,15,12
843,7,23,72,27,13,20,3,15,8,0,...,52,43,3,45,56,18,66,0,21,0
844,6,24,65,27,15,27,2,11,10,0,...,61,72,5,51,80,14,81,3,28,1
845,5,16,57,22,15,22,2,14,5,0,...,48,59,5,43,68,10,73,2,13,2
846,9,20,42,20,16,21,2,14,16,0,...,67,91,7,68,97,14,84,7,15,5
847,11,18,41,24,19,19,5,14,18,0,...,71,105,6,71,101,17,87,7,13,5
848,5,17,64,23,11,19,3,7,10,0,...,73,84,6,64,85,13,89,5,16,3
849,11,24,48,30,13,21,4,13,10,2,...,79,65,5,61,75,8,96,5,18,2


In [43]:
test_data = pd_data.values.tolist()
test_data

[[27,
  29,
  41,
  31,
  30,
  36,
  1,
  18,
  24,
  0,
  37,
  59,
  10,
  12,
  21,
  4,
  59,
  55,
  31,
  52,
  50,
  30,
  51,
  16,
  4,
  44,
  12,
  39,
  51,
  46,
  10,
  3,
  37,
  3,
  33,
  26,
  40,
  47,
  0,
  34,
  42,
  40,
  56,
  38,
  18,
  16,
  49,
  66,
  76,
  24,
  5,
  46,
  0,
  41,
  7,
  55,
  76,
  33,
  22,
  40,
  41,
  49,
  5,
  42,
  57,
  21,
  67,
  2,
  11,
  8],
 [19,
  18,
  53,
  25,
  23,
  36,
  3,
  18,
  24,
  3,
  28,
  52,
  22,
  15,
  20,
  6,
  62,
  58,
  22,
  34,
  40,
  27,
  45,
  24,
  6,
  42,
  15,
  45,
  47,
  42,
  11,
  6,
  33,
  3,
  33,
  35,
  23,
  59,
  0,
  36,
  39,
  25,
  75,
  41,
  23,
  22,
  55,
  69,
  65,
  17,
  9,
  51,
  0,
  44,
  7,
  45,
  71,
  14,
  12,
  36,
  45,
  52,
  5,
  32,
  49,
  17,
  64,
  6,
  20,
  2],
 [10,
  14,
  52,
  18,
  15,
  13,
  1,
  14,
  12,
  0,
  22,
  59,
  16,
  5,
  14,
  15,
  75,
  53,
  25,
  34,
  48,
  12,
  70,
  22,
  16,
  59,
  24,
  55,
  54,
  44,
  12,
 

In [150]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.5, tol=1e-10, learning_rate=0.001, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="external library", visible_flg=True)

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

#print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

平均二乗誤差(MSE): 70.0
L2正則化項(l2 norm): 1.6365137698464e-30
L1正則化項(l1 norm): 8.469415644997623e-15
目的関数(Objective):  1960.0000000000005
目的関数(Objective)の微分:  197833.67211774457


np.float64(13685.772002551019)

In [147]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.5, tol=1e-10, learning_rate=0.001, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="coordinate descent", visible_flg=True)

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean = model.predict(x_tmp)

# print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:7.281109038683648e-33  ΔDiff:6.385468707669659e-16
平均二乗誤差(MSE): 70.0
L2正則化項(l2 norm): 4.0774210616628435e-31
L1正則化項(l1 norm): 4.446096268928557e-15
目的関数(Objective):  1960.0000000000002
目的関数(Objective)の微分:  198169.67211774457


np.float64(13685.772002551019)

In [148]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=2, l1_ratio=0.5, tol=1e-10, learning_rate=0.001, max_iterate=30000, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="ISTA", visible_flg=True)

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

#print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

ite:1  mse:195659.8354420253  update_diff:19.6699999745816 diff:195659.8354420253
ite:5001  mse:206.13898480228565  update_diff:0.009130118034156047 diff:0.22648621974502703
平均二乗誤差(MSE): 70.0
L2正則化項(l2 norm): 9.897091275211015e-31
L1正則化項(l1 norm): 5.635146795002902e-15
目的関数(Objective):  1960.0000000000002
目的関数(Objective)の微分:  198169.67211774457


np.float64(13685.772002551019)