# PySparkのデータ処理例

In [1]:
# Build the small in-memory DataFrame that the following cells operate on.
from pyspark.sql import SparkSession

# The notebook never defines `spark` itself; it relies on the environment
# providing one. getOrCreate() returns the already-active session when there
# is one and creates a session otherwise, so this cell also runs on a fresh
# kernel where `spark` was not injected.
spark = SparkSession.builder.getOrCreate()

# Each tuple is one row; `columns` supplies the column names. No schema is
# passed, so Spark infers the column types from the Python values.
data = [("Alice", 30), ("Bob", 35), ("Charlie", 25)]
columns = ["name", "age"]
df = spark.createDataFrame(data, columns)

In [1]:
# Print the contents of the DataFrame as a text table
df.show()

+-------+---+
|   name|age|
+-------+---+
|  Alice| 30|
|    Bob| 35|
|Charlie| 25|
+-------+---+



In [1]:
# Inspect the schema: prints each column's name, type and nullability as a tree
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)



In [1]:
# Return the first two rows as a list of Row objects
df.take(2)

[Row(name='Alice', age=30), Row(name='Bob', age=35)]

In [1]:
# Column operations

# `col` builds Column expressions; the cells below use it as well.
from pyspark.sql.functions import col

# Project a single column, referenced through a Column object rather than by name
df.select(col("name")).show()

+-------+
|   name|
+-------+
|  Alice|
|    Bob|
|Charlie|
+-------+



In [1]:
# Add a derived column `age_plus_10` holding age + 10.
# `df` is rebound, so every later cell sees the extra column.
# NOTE(review): this cell needs the `age` column, which a later cell renames
# to `years`; re-running it after that rename will not find `age`.
df = df.withColumn("age_plus_10", df["age"] + 10)
df.show()

+-------+---+-----------+
|   name|age|age_plus_10|
+-------+---+-----------+
|  Alice| 30|         40|
|    Bob| 35|         45|
|Charlie| 25|         35|
+-------+---+-----------+



In [1]:
# Rename the `age` column to `years`.
# `df` is rebound, so the cells below must refer to the column as `years`.
df = df.withColumnRenamed("age", "years")
df.show()

+-------+-----+-----------+
|   name|years|age_plus_10|
+-------+-----+-----------+
|  Alice|   30|         40|
|    Bob|   35|         45|
|Charlie|   25|         35|
+-------+-----+-----------+



In [1]:
# Filtering

# Keep only the rows whose `years` value is greater than 30
df.where(col("years") > 30).show()

# Several conditions: combine Column predicates with `&`, wrapping each
# comparison in parentheses because `&` binds tighter than `>` / `!=` in Python
df.where((col("name") != "Alice") & (col("years") > 25)).show()

+----+-----+-----------+
|name|years|age_plus_10|
+----+-----+-----------+
| Bob|   35|         45|
+----+-----+-----------+

+----+-----+-----------+
|name|years|age_plus_10|
+----+-----+-----------+
| Bob|   35|         45|
+----+-----+-----------+



In [1]:
# Aggregations

# Import the functions module under an alias instead of importing `max` and
# `min` by name: a by-name import rebinds Python's built-in max()/min() in the
# notebook namespace for every cell that runs afterwards.
from pyspark.sql import functions as F

# Average of the `years` column
df.select(F.avg("years")).show()

# Largest and smallest `years` values, computed in a single select
df.select(F.max("years"), F.min("years")).show()

+----------+
|avg(years)|
+----------+
|      30.0|
+----------+



+----------+----------+
|max(years)|min(years)|
+----------+----------+
|        35|        25|
+----------+----------+



In [1]:
# Sorting

# Ascending by `years`
df.sort("years").show()

# Descending by `years`
df.sort("years", ascending=False).show()

+-------+-----+-----------+
|   name|years|age_plus_10|
+-------+-----+-----------+
|Charlie|   25|         35|
|  Alice|   30|         40|
|    Bob|   35|         45|
+-------+-----+-----------+

+-------+-----+-----------+
|   name|years|age_plus_10|
+-------+-----+-----------+
|    Bob|   35|         45|
|  Alice|   30|         40|
|Charlie|   25|         35|
+-------+-----+-----------+



In [1]:
# Row count and de-duplication

# Count the rows. A notebook only echoes the value of a cell's LAST expression,
# and count() is not last here, so print it explicitly; otherwise the result
# is computed and silently thrown away.
print(df.count())

# Drop rows that are duplicated across all columns and show what remains
df.dropDuplicates().show()


+-------+-----+-----------+
|   name|years|age_plus_10|
+-------+-----+-----------+
|  Alice|   30|         40|
|    Bob|   35|         45|
|Charlie|   25|         35|
+-------+-----+-----------+



In [1]:
# SQL queries

# Register the DataFrame as a temporary view named `people`
# (replacing any existing temp view with that name)
df.createOrReplaceTempView("people")

# Query the view with SQL and print the result
spark.sql("SELECT name, years FROM people WHERE years > 30").show()


+----+-----+
|name|years|
+----+-----+
| Bob|   35|
+----+-----+



# SparkSQLのデータ処理例

In [1]:
%sql
-- Define the temp view `people` from three inline rows with columns (name, age).
-- CREATE OR REPLACE overwrites any existing temp view called `people`,
-- including the one registered from the DataFrame earlier in this notebook.
CREATE OR REPLACE TEMP VIEW people AS
SELECT * FROM VALUES
  ('Alice', 30),
  ('Bob', 35),
  ('Charlie', 25)
AS people(name, age);

OK

In [1]:
%sql
-- Show every row of the `people` view
SELECT * FROM people

In [1]:
%sql
-- Keep only the rows where age > 30
SELECT * FROM people WHERE age > 30;


In [1]:
%sql
-- Two conditions combined with AND: age above 25, and name other than 'Alice'
SELECT * FROM people
WHERE name <> 'Alice' AND age > 25;
