In [1]:
# from pyspark.sql.functions import * とする場合もありますが、
# Fで関数の名前空間を明示した方がわかりやすくて好きです。
# ただ、FだとPEP8に違反していますが。。。
from pyspark.sql import functions as F
from pyspark.sql.types import FloatType, TimestampType, StringType
from pyspark.sql.window import Window
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import desc
import shutil

In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("gamedata")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/08/23 14:31:08 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/08/23 14:31:08 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load the weekly hardware sales CSV (first row is the header) and cast the
# date and unit-count columns to proper types; "hw" stays a string.
df = (
    spark.read
    .option("header", True)
    .csv("../hard_weekly.csv")
    .select(
        F.col("begin_date").astype("date"),
        F.col("end_date").astype("date"),
        F.col("hw"),
        F.col("units").astype("int"),
    )
)

In [4]:
# Print the schema of df to check the column types.
df.printSchema()

root
 |-- begin_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- hw: string (nullable = true)
 |-- units: integer (nullable = true)



In [5]:
# Persist the weekly sales data as Parquet, replacing any previous output.
# mode("overwrite") lets Spark remove an existing "hard_weekly" directory
# itself, so this cell also succeeds on the very first run when the directory
# does not exist yet (shutil.rmtree raised FileNotFoundError in that case).
df.write.mode("overwrite").parquet("hard_weekly")

                                                                                

In [6]:
# Read the Parquet output back in to confirm it can be loaded.
weekly = spark.read.format("parquet").load("hard_weekly")

In [7]:
# Show the 10 rows with the latest end_date values.
weekly.sort("end_date").tail(10)

[Row(begin_date=datetime.date(2023, 7, 31), end_date=datetime.date(2023, 8, 6), hw='3DS', units=56),
 Row(begin_date=datetime.date(2023, 7, 31), end_date=datetime.date(2023, 8, 6), hw='Switch', units=93210),
 Row(begin_date=datetime.date(2023, 7, 31), end_date=datetime.date(2023, 8, 6), hw='PS4', units=789),
 Row(begin_date=datetime.date(2023, 7, 31), end_date=datetime.date(2023, 8, 6), hw='PS5', units=50358),
 Row(begin_date=datetime.date(2023, 7, 31), end_date=datetime.date(2023, 8, 6), hw='XSX', units=3276),
 Row(begin_date=datetime.date(2023, 8, 7), end_date=datetime.date(2023, 8, 13), hw='3DS', units=29),
 Row(begin_date=datetime.date(2023, 8, 7), end_date=datetime.date(2023, 8, 13), hw='Switch', units=91744),
 Row(begin_date=datetime.date(2023, 8, 7), end_date=datetime.date(2023, 8, 13), hw='PS4', units=1543),
 Row(begin_date=datetime.date(2023, 8, 7), end_date=datetime.date(2023, 8, 13), hw='PS5', units=50862),
 Row(begin_date=datetime.date(2023, 8, 7), end_date=datetime.date(20

In [8]:
# Load the hardware master CSV (first row is the header), cast launch_day to a
# date, and order the rows by maker.
hwinfo = (
    spark.read
    .option("header", True)
    .csv("../hard_info.csv")
    .select(
        F.col("hw"),
        F.col("launch_day").astype("date"),
        F.col("maker"),
        F.col("full_name"),
    )
    .orderBy("maker")
)

In [9]:
# Persist the hardware master data as Parquet, replacing any previous output.
# mode("overwrite") lets Spark remove an existing "hard_info" directory itself,
# so this cell also succeeds on the very first run when the directory does not
# exist yet (shutil.rmtree raised FileNotFoundError in that case).
hwinfo.write.mode("overwrite").parquet("hard_info")

In [10]:
# Print the schema of hwinfo to check the column types.
hwinfo.printSchema()

root
 |-- hw: string (nullable = true)
 |-- launch_day: date (nullable = true)
 |-- maker: string (nullable = true)
 |-- full_name: string (nullable = true)



In [11]:
# Display up to 30 rows of the hardware master table.
hwinfo.show(30)

+-------+----------+---------+--------------------+
|     hw|launch_day|    maker|           full_name|
+-------+----------+---------+--------------------+
|     WS|1999-03-04|   BANDAI|          WonderSwan|
|    XSX|2020-11-10|Microsoft|     Xbox Series X|S|
|  XB360|2005-12-10|Microsoft|             Xbox360|
|   Xbox|2002-02-22|Microsoft|                Xbox|
|  XBOne|2014-09-04|Microsoft|             XboxOne|
|   WiiU|2012-12-08| Nintendo|                WiiU|
|    3DS|2011-02-26| Nintendo|        Nintendo 3DS|
| Switch|2017-03-03| Nintendo|     Nintendo Switch|
|     GB|1989-04-21| Nintendo|            GAME BOY|
|    N64|1996-06-23| Nintendo|          NINTENDO64|
|    Wii|2006-12-02| Nintendo|                 Wii|
|     DS|2004-12-02| Nintendo|         Nintendo DS|
|     GC|2001-09-14| Nintendo|   Nintendo GAMECUBE|
|    GBA|2001-03-21| Nintendo|    GAME BOY ADVANCE|
|     DC|1998-11-27|     SEGA|           DreamCast|
| SATURN|1994-11-22|     SEGA|         SEGA SATURN|
|NeoGeoP|199

## 基本に対して追加するカラム

オリジナルデータに対してカラムの追加・加工を行い、よく使う分析用のデータフレームを用意する。

データフレーム名 "hard_sales"

追加カラム

- 累計 "sum_units"
- end_dateの年 "year"
- end_dateの月 "month"
- end_dateの週番号 "week"
- 発売日 "launch_day"
- メーカー "maker"
- 名前 "full_name"
- 経過日数 "delta_day"
- 経過週数 "delta_week"
- 発売年 "launch_year"
- 経過年 "delta_year"


In [12]:
# Derive the calendar year and month of each week's end_date.
df2 = (
    df
    .withColumn("year", F.year("end_date"))
    .withColumn("month", F.month("end_date"))
)

In [13]:
# Attach launch_day, maker and full_name; the left join keeps every sales row
# even when its hw has no entry in hwinfo.
df2 = df2.join(hwinfo, on="hw", how="left")

In [14]:
# Calendar year in which the hardware launched.
df2 = df2.withColumn(
    "launch_year",
    F.year("launch_day"),
)

In [15]:
# Number of days from launch_day to the week's end_date.
df2 = df2.withColumn(
    "delta_day",
    F.datediff(F.col("end_date"), F.col("launch_day")),
)

In [16]:
# Elapsed weeks since launch: delta_day / 7, converted to an integer by the cast.
df2 = df2.withColumn(
    "delta_week",
    (F.col("delta_day") / 7).astype("int"),
)

In [17]:
# Difference between the end_date year and the launch year (a calendar-year
# difference, not the number of full years elapsed).
df2 = df2.withColumn(
    "delta_year",
    df2["year"] - df2["launch_year"],
)

In [18]:
# Running total of units per hardware: for each row, sum every row of the same
# hw from the earliest end_date up to and including the current end_date.
w = (
    Window.partitionBy("hw")
    .orderBy("end_date")
    .rangeBetween(Window.unboundedPreceding, Window.currentRow)
)
df2 = (
    df2
    .withColumn("sum_units", F.sum("units").over(w))
    .sort("end_date")
)

In [19]:
# Week number of end_date. weekofyear follows ISO 8601 numbering, so around
# New Year the week may belong to the adjacent calendar year and not line up
# with the "year" column.
df2 = df2.withColumn(
    "week",
    F.weekofyear("end_date"),
)

In [20]:
# Print the schema of df2 to check the added columns and their types.
df2.printSchema()

root
 |-- hw: string (nullable = true)
 |-- begin_date: date (nullable = true)
 |-- end_date: date (nullable = true)
 |-- units: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- launch_day: date (nullable = true)
 |-- maker: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- launch_year: integer (nullable = true)
 |-- delta_day: integer (nullable = true)
 |-- delta_week: integer (nullable = true)
 |-- delta_year: integer (nullable = true)
 |-- sum_units: long (nullable = true)
 |-- week: integer (nullable = true)



In [21]:
# Persist the enriched sales data as Parquet, replacing any previous output.
# mode("overwrite") lets Spark remove an existing "hard_sales" directory itself,
# so this cell also succeeds on the very first run when the directory does not
# exist yet (shutil.rmtree raised FileNotFoundError in that case).
df2.write.mode("overwrite").parquet("hard_sales")

In [22]:
# Display the most recent weeks first.
df2.orderBy("end_date", ascending=False).show()

+------+----------+----------+-----+----+-----+----------+---------+---------------+-----------+---------+----------+----------+---------+----+
|    hw|begin_date|  end_date|units|year|month|launch_day|    maker|      full_name|launch_year|delta_day|delta_week|delta_year|sum_units|week|
+------+----------+----------+-----+----+-----+----------+---------+---------------+-----------+---------+----------+----------+---------+----+
|   PS4|2023-08-07|2023-08-13| 1543|2023|    8|2014-02-22|     SONY|   PlayStation4|       2014|     3459|       494|         9|  9454173|  32|
|Switch|2023-08-07|2023-08-13|91744|2023|    8|2017-03-03| Nintendo|Nintendo Switch|       2017|     2354|       336|         6| 30258477|  32|
|   PS5|2023-08-07|2023-08-13|50862|2023|    8|2020-11-12|     SONY|   PlayStation5|       2020|     1004|       143|         3|  4147786|  32|
|   3DS|2023-08-07|2023-08-13|   29|2023|    8|2011-02-26| Nintendo|   Nintendo 3DS|       2011|     4551|       650|        12| 2438743