In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Google Driveのマウント（Colab環境の場合）
from google.colab import drive
drive.mount('/content/drive/')

# ファイルパス指定（自分のGoogle Drive構成にあわせて変更してください）
train_path = "/content/drive/MyDrive/Colab Notebooks/standard/rossmann-store-sales/data/train.csv"
store_path = "/content/drive/MyDrive/Colab Notebooks/standard/rossmann-store-sales/data/store.csv"

# ----------------------------------------------------------
# 1. train.csv と store.csv の読み込み & マージ
# ----------------------------------------------------------
train_df = pd.read_csv(train_path).rename(columns=lambda x: x.strip())
store_df = pd.read_csv(store_path).rename(columns=lambda x: x.strip())
df = train_df.merge(store_df, on="Store", how="left")

print("[INFO] Merged DataFrame shape:", df.shape)
print(df.head(3))

# ----------------------------------------------------------
# 2. クリーニング & 軽い変形
# ----------------------------------------------------------

# (a) Dateをdatetimeに変換
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# (b) カラムのカテゴリ変換
#   - StateHoliday の '0' を 'none' に変換して欠損にも対応
df['StateHoliday'] = df['StateHoliday'].replace({'0': 'none', 0: 'none'}).fillna('none')
df['StateHoliday'] = df['StateHoliday'].astype('category')

# StoreType, Assortment もカテゴリ型へ
df['StoreType'] = df['StoreType'].astype('category')
df['Assortment'] = df['Assortment'].astype('category')

# (c) 欠損値処理
#   - Open, Promo, SchoolHoliday は本来 0/1 なら NaN を 0 で補完して int化
for col in ['Open', 'Promo', 'SchoolHoliday']:
    df[col] = df[col].fillna(0).astype(int)

#   - Customers の欠損を中央値で補完
df['Customers'] = df['Customers'].fillna(df['Customers'].median()).astype(int)

#   - CompetitionDistance は中央値で補完
df['CompetitionDistance'] = df['CompetitionDistance'].fillna(df['CompetitionDistance'].median())

#   - CompetitionOpenSinceMonth, Year の欠損を 0 で埋める (不明扱い)
df['CompetitionOpenSinceMonth'] = df['CompetitionOpenSinceMonth'].fillna(0).astype(int)
df['CompetitionOpenSinceYear'] = df['CompetitionOpenSinceYear'].fillna(0).astype(int)

#   - PromoInterval の欠損を 'none' に
df['PromoInterval'] = df['PromoInterval'].fillna('none').astype('category')

# (d) 外れ値の簡易処理
#   - Sales の極端に高い値を上位0.1%でクリップする例
upper_limit = df['Sales'].quantile(0.999)
df.loc[df['Sales'] > upper_limit, 'Sales'] = upper_limit

#   - CompetitionDistance も上位0.1%でクリップ
upper_comp = df['CompetitionDistance'].quantile(0.999)
df.loc[df['CompetitionDistance'] > upper_comp, 'CompetitionDistance'] = upper_comp

# (e) 分析や可視化で使いやすい列の追加
df['DayOfWeek'] = df['DayOfWeek'].astype('category')
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)

# ----------------------------------------------------------
# 3. 処理後の確認 & ファイル出力
# ----------------------------------------------------------
print("\n[INFO] After Cleaning/Transformation")
print(df.info())
print(df.head(3))

# CSVへ出力 (次のEDAステップで読み込み予定)
cleaned_path = "/content/drive/MyDrive/Colab Notebooks/standard/rossmann-store-sales/data/cleaned_data.csv"
df.to_csv(cleaned_path, index=False)
print(f"[INFO] Cleaned data saved to: {cleaned_path}")


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  train_df = pd.read_csv(train_path).rename(columns=lambda x: x.strip())


[INFO] Merged DataFrame shape: (560478, 18)
   Store  DayOfWeek        Date  Sales  Customers  Open  Promo StateHoliday  \
0      1          5  2015-07-31   5263      555.0   1.0    1.0            0   
1      2          5  2015-07-31   6064      625.0   1.0    1.0            0   
2      3          5  2015-07-31   8314      821.0   1.0    1.0            0   

   SchoolHoliday StoreType Assortment  CompetitionDistance  \
0            1.0         c          a               1270.0   
1            1.0         a          a                570.0   
2            1.0         a          a              14130.0   

   CompetitionOpenSinceMonth  CompetitionOpenSinceYear  Promo2  \
0                        9.0                    2008.0       0   
1                       11.0                    2007.0       1   
2                       12.0                    2006.0       1   

   Promo2SinceWeek  Promo2SinceYear    PromoInterval  
0              NaN              NaN              NaN  
1             1

  df.loc[df['Sales'] > upper_limit, 'Sales'] = upper_limit



[INFO] After Cleaning/Transformation
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560478 entries, 0 to 560477
Data columns (total 21 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   Store                      560478 non-null  int64         
 1   DayOfWeek                  560478 non-null  category      
 2   Date                       560478 non-null  datetime64[ns]
 3   Sales                      560478 non-null  float64       
 4   Customers                  560478 non-null  int64         
 5   Open                       560478 non-null  int64         
 6   Promo                      560478 non-null  int64         
 7   StateHoliday               560478 non-null  category      
 8   SchoolHoliday              560478 non-null  int64         
 9   StoreType                  560478 non-null  category      
 10  Assortment                 560478 non-null  category      
 11  CompetitionDis