# データ前処理 (Task 1.b)

In [3]:
import pandas as pd

file_path = "/content/air_pollution_neighbors.csv"

# 区切り文字とエンコーディングを指定
df = pd.read_csv(file_path, sep=",", engine="python", encoding="shift_jis")

print("行数:", len(df))
print("列数:", len(df.columns))

# Timestamp列を削除
df.drop(columns=["Timestamp"], inplace=True)

# NaNを0に置き換え
df.fillna(0, inplace=True)

# 数値型に変換（文字列をfloatに）
df = df.apply(pd.to_numeric, errors='coerce')

# 負の値と100以上を0に置換
df = df.applymap(lambda x: 0 if (x >= 100 or x < 0) else x)

print("データ最小値:", df.min().min())
print("データ最大値:", df.max().max())


行数: 720
列数: 27
データ最小値: 0.0
データ最大値: 45.0


  df = df.applymap(lambda x: 0 if (x >= 100 or x < 0) else x)


# トランザクションDB (Task 1.b.v)
ここでは、値 >= 15 をしきい値として、センサーが15以上の値を持つ列だけをトランザクションに変換します。

In [4]:
transactions = []
threshold = 15

for _, row in df.iterrows():
    items = [col for col, val in row.items() if val >= threshold]
    if items:
        transactions.append(items)

print("トランザクション数:", len(transactions))

# 保存
with open("PM24HeavyPollutionRecordingSensors.csv", "w") as f:
    for t in transactions:
        f.write(",".join(t) + "\n")


トランザクション数: 276


#FP-Growthで頻出パターン抽出 (Task 1.c)
PAMIのFP-Growthを使います。

In [6]:
from PAMI.frequentPattern.basic import FPGrowth as fp

min_sup = 0.05  # サポート率5%（0件なら0.01に下げる）

fp_obj = fp.FPGrowth(iFile="PM24HeavyPollutionRecordingSensors.csv", minSup=min_sup)
fp_obj.mine()
patterns = fp_obj.getPatterns()
print("抽出されたパターン数:", len(patterns))

# 保存
with open("frequentPatterns.txt", "w") as f:
    for items, sup in patterns.items():
        f.write(",".join(items) + " #SUP: " + str(sup) + "\n")


Frequent patterns were generated successfully using frequentPatternGrowth algorithm
抽出されたパターン数: 5


In [11]:
print(coords_df[coords_df['stationid'] == 1102020])
print(coords_df[coords_df['stationid'] == 1203100])


   stationid        lat         lon
6    1102020  43.082031  141.333105
    stationid        lat         lon
24    1203100  43.181682  141.021908


#最長パターンを地図上に可視化 (Task 1.d)
frequentPatterns.txtを読み込み、アイテム数が最も多いパターンを取得してPlotly Expressで地図に描画します。



In [14]:
import pandas as pd
import plotly.express as px

coords_df = pd.read_csv("stationinfo_coords.csv")
sensor_location = coords_df.set_index(coords_df['stationid'].astype(str))[["lat", "lon"]].to_dict(orient="index")

# frequentPatterns.txtから最長パターンを抽出
patterns = []
with open("frequentPatterns.txt", "r") as f:
    for line in f:
        items = line.strip().split(" #SUP:")[0].split(",")
        patterns.append(items)
longest_pattern = max(patterns, key=len)
print("最長パターン:", longest_pattern)

lats, lons, texts = [], [], []
for s in longest_pattern:
    sid = str(int(s.replace("Point(", "").replace(")", "")))
    if sid in sensor_location:
        lat = sensor_location[sid]["lat"]
        lon = sensor_location[sid]["lon"]
        lats.append(lat)
        lons.append(lon)
        texts.append(sid)
    else:
        print(f" 座標データが見つかりません: {sid}")

if not lats:
    raise ValueError("座標データが見つかりません。stationinfo_coords.csvを確認してください。")

fig = px.scatter_mapbox(lat=lats, lon=lons, text=texts,
                        mapbox_style="open-street-map", zoom=8)
fig.show()


最長パターン: ['Point(01102020)', 'Point(01203100)']
