In [None]:
import os
from os.path import join
from dotenv import load_dotenv
import random, string

import pyspark as ps
from pyspark import StorageLevel
from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F
from pyspark.sql.functions import col

In [None]:
# Spark settings applied to the session, one (key, value) pair per line.
spark_settings = [
    # Do not write files whose names start with '_started' / '_committed_'.
    ("spark.sql.sources.commitProtocolClass", "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol"),
    # Do not write the '_SUCCESS' marker file.
    ("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"),
    # Tune the number of shuffle partitions.
    ("spark.sql.shuffle.partitions", 100),
    # Enable dynamic partition pruning.
    ("spark.sql.dynamicPartitionPruning.enabled", True),
]
ps_conf = ps.SparkConf().setAll(spark_settings)

# Reuse the active session if there is one, otherwise start a new one.
session_builder = SparkSession.builder.config(conf=ps_conf)
spark = session_builder.getOrCreate()

In [None]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is not set. Without this check a missing
            variable only surfaces later as an opaque ``TypeError`` from
            ``None + str`` when the paths are concatenated.
    """
    value = os.environ.get(name)
    if value is None:
        raise RuntimeError(
            f"Environment variable {name!r} is not set "
            "(define it in the .env file next to this notebook or in the process environment)"
        )
    return value


# Load variables from the .env file in the current working directory.
load_dotenv(join(os.getcwd(), '.env'))

# Paths are built by plain string concatenation, so the configured values are
# expected to carry their own separators (e.g. a trailing '/').
BASE_PATH     = _require_env("BASE_PATH")
WORK_PATH     = BASE_PATH + _require_env("WORK_PATH")
PROJECT_NAME  = _require_env("PROJECT_NAME")
INSTRUCT_PATH = WORK_PATH + PROJECT_NAME

In [None]:
# Generate test data for the first migration level.

# 75 terminals, 24 hours' worth of unit ids.
unit_terminal = 75
unit_id_num   = unit_terminal * 24

# Draw DISTINCT zero-padded 5-digit ids. Sampling without replacement matters:
# picking five random digits independently for each id produces repeated ids
# once ~1800 are drawn from only 100000 possibilities, which would yield
# duplicate ORIGIN keys in the generated data.
random_unit_list = [f'{unit_no:05d}' for unit_no in random.sample(range(10 ** 5), unit_id_num)]

# Pair every unit id with a random weight in [0.0, 1.0).
tmp_list = [[unit_id, random.random()] for unit_id in random_unit_list]

df_schema = types.StructType([
        types.StructField('ORIGIN',      types.StringType(), False),
        types.StructField('移動影響量',    types.FloatType(),  False),
    ])
df_migrate1 = spark.createDataFrame(tmp_list, df_schema)

# Sort by ORIGIN, collect to the driver as a pandas frame and write one CSV file.
(
    df_migrate1
    .orderBy(col('ORIGIN').asc())
    .toPandas()
    .to_csv(INSTRUCT_PATH + 'csv_data/test_tmp.csv', index=False, header=True)
)

In [None]:
# Generate test data for the second migration level: every first-level row is
# combined with every (destination unit, factor) pair from tmp_list.

# Column names and order of the second-level table.
df_schema = types.StructType([
        types.StructField('ORIGIN',      types.StringType(), False),
        types.StructField('DESTINATION', types.StringType(), False),
        types.StructField('移動影響量',    types.FloatType(),  False),
    ])

print("回遊1階層 一時保管開始")
df_migrate1.persist(StorageLevel.MEMORY_ONLY)
df_migrate1.count()  # action that materialises the cache
print("回遊1階層 一時保管終了")

# One row per destination unit together with its multiplier.
destination_schema = types.StructType([
        types.StructField('DESTINATION', types.StringType(), False),
        types.StructField('MOVE_FACTOR', types.DoubleType(), False),
    ])
df_destination = spark.createDataFrame(tmp_list, destination_schema)

# A single cross join replaces the former Python loop that chained one
# unionByName per destination; that loop re-analysed an ever-growing plan on
# every iteration. The rows produced are the same: each first-level row paired
# with each destination, with the value scaled by that destination's factor.
# NOTE: the factor is a double, so the scaled column comes out as double
# (the loop version multiplied by a Python float literal, with the same effect).
df_migrate2 = (
    df_migrate1
    .crossJoin(df_destination)
    .withColumn('移動影響量', col('移動影響量') * col('MOVE_FACTOR'))
    .select(*df_schema.fieldNames())
)

print("回遊2階層 計算開始")
df_migrate2.persist(StorageLevel.MEMORY_ONLY)
df_migrate2.count()  # action that runs the join and fills the cache
print("回遊2階層 計算終了")

# Sort, collect to the driver as a pandas frame and write one CSV file.
(
    df_migrate2
    .orderBy(col('ORIGIN').asc(), col('DESTINATION').asc())
    .toPandas()
    .to_csv(INSTRUCT_PATH + f'csv_data/test_terminal{unit_terminal}.csv', index=False, header=True)
)