In [1]:
%load_ext autoreload
%autoreload 2

import datetime
import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from tqdm.notebook import tqdm
from tqdm.contrib import tzip, tenumerate, tmap

from pyspark.sql import SparkSession
from pyspark.sql import DataFrame
from pyspark.storagelevel import StorageLevel
from pyspark.sql.functions import col
import pyspark.sql.types as pstype
import pyspark.sql.functions as F
import pyspark as ps

import matplotlib as mlt
import matplotlib.pyplot as plt
import japanize_matplotlib

from time_series_model import *

%matplotlib inline
%matplotlib ipympl

In [2]:
np.set_printoptions(threshold=100000, precision=4, linewidth=10000)
ps_conf = ps.SparkConf().set("spark.logConf", "false")\
            .set("spark.executor.memory", "12g")\
            .set("spark.driver.memory", "4g")\
            .set("spark.executor.cores", "7")\
            .set("spark.sql.shuffle.partitions", "500")\
            .set("spark.executor.extraJavaOptions", "-XX:+UseG1GC -XX:+UseStringDeduplication")\
            .set("spark.eventLog.gcMetrics.youngGenerationGarbageCollectors", "G1 Young Generation")\
            .set("spark.eventLog.gcMetrics.oldGenerationGarbageCollectors", "G1 Old Generation")\
			.set("spark.logConf", "false")
spark = SparkSession.builder.config(conf=ps_conf).getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/22 12:29:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
SPECIFIED_PATH = "csv_data/"
SPECIFIED_DATE = "20241120"
SPECIFIED_CSV  = SPECIFIED_PATH + SPECIFIED_DATE

24/11/22 12:29:48 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [4]:
input_path  = SPECIFIED_CSV + "_raw_data_sumida_bunka.csv"
df_raw_data = spark.read.option("inferSchema", "True").option("header", "True").csv(input_path)
df_raw_data.persist(StorageLevel.MEMORY_AND_DISK_DESER)

utid_list = sorted(df_raw_data.select("unit_id").drop_duplicates().rdd.flatMap(lambda x: x).collect())

                                                                                

In [5]:
df_raw_data\
		.filter(col("unit_id").isin(utid_list))\
		.filter(col("randomized") == 1)\
		.select(['date', 'datetime', 'unit_id', 'aibeaconid'])\
        .withColumn('minute', F.date_trunc('minute', 'datetime'))\
        .withColumn('minute', F.unix_timestamp('minute'))\
        .withColumn('minute', col('minute') - (col('minute') % 60))\
        .withColumn('minute', F.from_unixtime('minute'))\
        .select(['date', 'minute', 'unit_id', 'aibeaconid'])\
        .drop_duplicates()\
        .groupBy(['date', 'minute', 'unit_id'])\
        .agg(F.count('*').alias('1min_count'))\
        .orderBy(['date', 'minute', 'unit_id'])\
        .toPandas()\
        .to_csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv", header=True, index=False)

24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
24/11/22 12:30:22 WARN RowBasedKeyValueBatch: Calling spill() on

In [6]:
SPECIFIED_PAST = (datetime.datetime.strptime(SPECIFIED_DATE, '%Y%m%d') - relativedelta(days=1) + datetime.timedelta(hours=9)).strftime('%Y-%m-%d')

start_time = f'{SPECIFIED_PAST} 00:00:00'
end_time = f'{SPECIFIED_PAST} 23:59:00'
# 1分単位の時間列を作成
time_range = pd.date_range(start=start_time, end=end_time, freq='min')
# DataFrameに変換
df_time = pd.DataFrame(time_range, columns=['datetime'])
df_time = spark.createDataFrame(df_time)\
				.withColumn('hour', F.hour(col('datetime')))

df_by1min = spark.read\
				.option('header', True)\
				.option('inferSchema', True)\
           		.csv(SPECIFIED_CSV + "_1min_data_sumida_bunka.csv")

for unit_id in utid_list:
    df_tmp  = df_by1min\
        		.filter(col('unit_id') == unit_id)\
          		.select(['minute', '1min_count'])\
            	.withColumnRenamed('minute',     'datetime')\
                .withColumnRenamed('1min_count', unit_id)
    df_time = df_time.join(df_tmp, on='datetime', how='left')
df_time = df_time\
			.fillna(0)\
    		.orderBy('datetime')
df_time.persist(StorageLevel.MEMORY_AND_DISK_DESER)
df_time.show()

24/11/22 12:30:31 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

+-------------------+----+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+
|           datetime|hour|1002A25B|1002A25C|1002A25D|1002A5BA|1002A5BB|1002A5BD|1002A5BE|1002A5BF|1002A5C1|1002A5C3|1002A5C4|1002A5C5|1002A5C6|1002A5C7|1002A5C9|1002A5CA|1002A5CB|1002A5CC|1002A5CE|1002A5CF|1002A5D0|1002A5D1|1002A6A5|1002A6A7|1002A6A8|1002A6AF|1002A6B0|1002A6B2|1002A6B3|1002A6B6|1002A6B9|1002A6BA|1002A6BC|1002A6BD|1002A6BE|10

                                                                                

In [7]:
SPECIFIED_HOUR = 13

pd_data = df_time.toPandas()
pd_data = pd_data[pd_data['hour'] == SPECIFIED_HOUR]
pd_data = pd_data[utid_list]
pd_data

                                                                                

Unnamed: 0,1002A25B,1002A25C,1002A25D,1002A5BA,1002A5BB,1002A5BD,1002A5BE,1002A5BF,1002A5C1,1002A5C3,...,1002ABE5,1002ABE8,1002ABF2,1002ABF3,1002ACA9,1002ACAA,1002ACB0,1002ACB2,1002ACB8,1002ACB9
780,8,21,46,29,12,31,9,8,6,0,...,71,84,2,60,94,15,93,1,21,2
781,13,20,51,20,13,12,13,16,15,1,...,57,78,6,63,85,15,110,0,16,6
782,23,19,57,27,29,18,7,34,32,0,...,62,66,8,56,71,28,93,4,14,10
783,17,18,65,26,24,16,6,29,21,0,...,65,83,7,65,97,28,103,6,22,9
784,5,15,64,22,14,14,8,35,5,0,...,80,92,23,53,84,14,90,19,28,20
785,12,17,49,26,19,16,13,13,17,1,...,68,68,10,53,89,16,93,10,14,8
786,10,22,45,19,13,23,8,13,12,0,...,71,95,8,62,82,18,102,11,21,8
787,7,14,60,23,7,28,6,9,11,0,...,54,80,6,65,102,7,94,11,14,16
788,5,11,55,21,9,16,6,13,13,0,...,63,72,7,37,88,6,88,8,18,5
789,8,18,40,26,10,25,4,8,12,0,...,69,90,3,73,80,12,91,4,8,5


In [8]:
test_data = pd_data.values.tolist()
test_data

[[8,
  21,
  46,
  29,
  12,
  31,
  9,
  8,
  6,
  0,
  33,
  78,
  38,
  10,
  16,
  1,
  115,
  92,
  9,
  40,
  71,
  19,
  93,
  35,
  1,
  84,
  32,
  49,
  84,
  55,
  2,
  1,
  68,
  1,
  54,
  26,
  20,
  108,
  0,
  31,
  42,
  30,
  67,
  33,
  39,
  36,
  83,
  109,
  107,
  8,
  4,
  47,
  0,
  109,
  18,
  102,
  120,
  22,
  21,
  72,
  71,
  84,
  2,
  60,
  94,
  15,
  93,
  1,
  21,
  2],
 [13,
  20,
  51,
  20,
  13,
  12,
  13,
  16,
  15,
  1,
  28,
  79,
  33,
  11,
  13,
  12,
  102,
  98,
  15,
  41,
  63,
  19,
  86,
  26,
  9,
  64,
  32,
  49,
  90,
  54,
  7,
  4,
  68,
  7,
  45,
  31,
  16,
  97,
  1,
  43,
  43,
  28,
  57,
  34,
  29,
  27,
  81,
  106,
  123,
  11,
  3,
  51,
  0,
  87,
  22,
  98,
  97,
  22,
  15,
  63,
  57,
  78,
  6,
  63,
  85,
  15,
  110,
  0,
  16,
  6],
 [23,
  19,
  57,
  27,
  29,
  18,
  7,
  34,
  32,
  0,
  32,
  71,
  26,
  13,
  15,
  9,
  87,
  81,
  32,
  47,
  68,
  29,
  79,
  22,
  9,
  62,
  30,
  62,
  62,
  52,


In [13]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=1, l1_ratio=0.1, tol=1e-3, learning_rate=0.0001, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="external library")

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

alpha0:[[ 6.6890e-18  7.2634e-17  1.0888e-16  1.4055e-16 -6.4910e-17 -1.5957e-16  6.8179e-17  2.0090e-17  1.3359e-16  4.1476e-18  1.3382e-16  1.0919e-18 -3.3109e-17 -6.5456e-17 -1.9067e-16 -1.0686e-16  1.8682e-16  4.3722e-16  2.0173e-16 -1.6130e-16  2.7034e-16 -1.7654e-16  1.1638e-16 -1.3778e-16  8.1317e-17  4.0750e-17  5.0067e-18 -3.0451e-16 -1.1849e-16 -3.6913e-17  3.3810e-17 -4.8835e-17 -2.4232e-16 -1.9944e-17 -2.5798e-16  1.5095e-16 -1.1394e-16  1.0189e-16 -9.4404e-18 -1.4382e-16 -2.6711e-17 -3.5614e-16  3.8359e-17  3.8178e-16 -2.5612e-17  1.3782e-16 -2.1352e-16  3.1985e-16 -2.6042e-16  1.1221e-16  1.2919e-16  1.2023e-16 -1.3759e-18  3.3357e-16 -1.2960e-16 -1.9402e-16  3.3419e-16  1.4390e-16  4.2394e-17  6.0787e-17 -2.4647e-16 -2.7562e-16  5.9880e-17  1.6764e-18 -3.6920e-16 -7.7721e-17 -1.1547e-16  7.5317e-17 -6.3662e-17  6.0948e-17]]
alpha:[[ 1.3493e-03  0.0000e+00 -0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00 -4.6202e-02  0.0000e+00  0.0000e+00 -0.0000e+00 -0.0000e+00  0.0000e+

np.float64(3751.1196634815155)

In [14]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=1, l1_ratio=0.1, tol=1e-3, learning_rate=0.0001, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="coordinate descent")

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean = model.predict(x_tmp)

print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

alpha0:[[-7.3041e-18  3.0782e-17  3.9390e-17  9.3910e-17 -3.3390e-17 -5.4781e-17  3.0260e-17  9.3910e-18  4.1738e-17 -1.4608e-17  9.2866e-17  1.9825e-17 -3.1303e-17 -2.2956e-17 -8.8171e-17 -7.0954e-17  9.5997e-17  2.1912e-16  8.2432e-17 -1.0200e-16  1.4660e-16 -6.5215e-17  7.6171e-17 -1.0643e-16  4.3825e-17  3.0260e-17 -1.3565e-17 -1.6330e-16 -4.7998e-17 -4.1738e-18  2.7130e-17 -2.8695e-17 -1.2104e-16  7.3041e-18 -1.3669e-16  6.6780e-17 -3.8086e-17  6.9911e-17 -2.3999e-17 -7.7736e-17 -2.6086e-17 -1.7217e-16  8.8693e-18  2.0530e-16 -1.1217e-17  7.7736e-17 -1.3043e-16  1.4817e-16 -1.4295e-16  5.1129e-17  4.8520e-17  6.8606e-17  5.9998e-18  1.6747e-16 -6.4693e-17 -9.8084e-17  1.8156e-16  1.0017e-16  2.0869e-17  5.1129e-17 -1.1478e-16 -1.5495e-16  1.5130e-17 -4.1738e-18 -2.1704e-16 -3.8086e-17 -4.8520e-17  3.5477e-17 -3.8086e-17  2.9216e-17]]
alpha:[[ 1.2980e-03  0.0000e+00 -0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00 -4.6229e-02  0.0000e+00  0.0000e+00 -0.0000e+00 -0.0000e+00  0.0000e+

np.float64(3750.9263714969034)

In [16]:
model = Sparse_Vector_Auto_Regressive(test_data, norm_α=1, l1_ratio=0.1, tol=1e-3, learning_rate=0.0001, isStandardization=True)

lag = 4
model.fit(lags=lag, solver="ISTA")

x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
mean  = model.predict(x_tmp)

print(f"alpha0:{model.alpha0}\nalpha:{model.alpha}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse

alpha0:[[ 1.3217e-17  4.0752e-17  5.4520e-17  7.9302e-17 -5.9476e-17 -1.4428e-16  5.0665e-17  2.7535e-17  7.0490e-17 -1.7623e-17  6.6085e-17  2.4231e-17 -4.4056e-17 -3.5245e-17 -8.8113e-17 -6.8288e-17  1.0574e-16  2.3350e-16  1.0574e-16 -1.1565e-16  1.1895e-16 -6.1679e-17  4.8462e-17 -7.3795e-17  3.5245e-17  7.0490e-17  3.0840e-17 -1.6081e-16 -9.0316e-17 -9.9127e-18  2.2028e-17 -3.5245e-17 -1.1895e-16 -1.2116e-17 -1.2336e-16  7.4896e-17 -1.3217e-17  4.1854e-17 -1.5420e-17 -7.5997e-17 -5.0665e-17 -1.8063e-16  1.9825e-17  1.8944e-16 -2.3130e-17  5.3969e-17 -1.1675e-16  1.5970e-16 -1.3217e-16  6.2780e-17  7.7099e-17  7.3244e-17  9.9127e-18  1.5199e-16 -7.7099e-17 -9.0316e-17  1.7182e-16  8.8113e-17  4.6259e-17  4.6259e-17 -1.2336e-16 -1.6081e-16  2.2028e-17 -1.1014e-17 -2.1147e-16 -3.8549e-17 -5.2868e-17  5.2868e-17 -3.8549e-17  5.1491e-17]]
alpha:[[ 1.2969e-03  0.0000e+00 -0.0000e+00  0.0000e+00  0.0000e+00  0.0000e+00 -4.6229e-02  0.0000e+00  0.0000e+00 -0.0000e+00 -0.0000e+00  0.0000e+

np.float64(3750.9551955760717)

In [None]:
from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=1, l1_ratio=0.1, max_iter=1000000, tol=1e-3)

lag   = 4
x_tmp = np.array([np.array(test_data)[t-lag : t][::-1].ravel() for t in range(lag, len(test_data))])
y_tmp = test_data[lag:]
model.fit(x_tmp, y_tmp)
mean  = model.predict(x_tmp)

print(f"alpha0:{model.intercept_}\nalpha:{model.coef_}", flush=True)
mse = np.sum((y_tmp - mean) ** 2) / len(y_tmp)
mse