In [2]:
from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/My Drive/Colab Notebooks/standard/titanic

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/My Drive/Colab Notebooks/standard/titanic


In [6]:
import pandas as pd
import numpy as np

####################################
# 1) データ読み込み用
####################################
def load_data(train_path, test_path=None):
    """
    任意のCSV読み込み用。必要があればtrain/test両方を返す。
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path) if test_path is not None else None
    return train_df, test_df

####################################
# 2) 基本情報の確認
####################################
def show_basic_info(df, df_name="DataFrame"):
    """
    - データの先頭行、サイズ、info()等を表示
    - カラム一覧・データ型・メモリ使用量の概要も info() で確認
    """
    print(f"\n====== Basic Info: {df_name} ======")
    print(f"Shape: {df.shape}")  # 行数・列数
    print("\n--- Head(5) ---")
    print(df.head(5))
    print("\n--- Info ---")
    print(df.info())

####################################
# 3) 欠損値の確認
####################################
def check_missing_values(df, df_name="DataFrame"):
    """
    - 欠損値の総数、カラムごとの欠損値数を表示
    """
    print(f"\n====== Missing Values: {df_name} ======")
    total_missing = df.isnull().sum().sum()
    print(f"Total missing values: {total_missing}")
    missing_per_col = df.isnull().sum()
    missing_per_col = missing_per_col[missing_per_col > 0]
    if not missing_per_col.empty:
        print("\n--- Missing per column ---")
        print(missing_per_col)
    else:
        print("No missing values.")

####################################
# 4) 重複データの確認
####################################
def check_duplicates(df, df_name="DataFrame"):
    """
    - 重複行の数を表示。不要な場合は削除検討。
    """
    print(f"\n====== Duplicates Check: {df_name} ======")
    dup_count = df.duplicated().sum()
    if dup_count > 0:
        print(f"Found {dup_count} duplicate rows.")
    else:
        print("No duplicate rows found.")

####################################
# 5) カラムごとのユニーク数・カーディナリティ確認
####################################
def check_unique_values(df, df_name="DataFrame", top_n=5):
    """
    - 各カラムのユニーク値数を表示
    - カーディナリティが高いカテゴリ列などを把握するのに役立つ
    - さらに上位N件の値サンプルを表示
    """
    print(f"\n====== Unique Values (Top {top_n} samples): {df_name} ======")
    for col in df.columns:
        unique_count = df[col].nunique(dropna=False)
        print(f"{col}: {unique_count} unique values")
        # 上位N件のユニーク値をテキストで確認したい場合:
        # (数値カラムだと意味が薄いが、カテゴリ列などは参考になる)
        value_sample = df[col].drop_duplicates().head(top_n).values
        print(f"  Sample: {value_sample}")

####################################
# 6) 基本統計量の確認 (数値/カテゴリ)
####################################
def show_descriptive_stats(df, numeric_cols=None, categorical_cols=None, df_name="DataFrame"):
    """
    数値列・カテゴリ列で分けて describe() や value_counts() などを表示。
    """
    print(f"\n====== Descriptive Stats: {df_name} ======")
    # 数値列
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if numeric_cols:
        print(f"\n--- Numeric Columns: {numeric_cols} ---")
        print(df[numeric_cols].describe())
    else:
        print("\nNo numeric columns found.")

    # カテゴリ列
    if categorical_cols is None:
        cat_dtypes = ["object", "category"]
        categorical_cols = [col for col in df.columns if df[col].dtype.name in cat_dtypes]
    if categorical_cols:
        print(f"\n--- Categorical Columns: {categorical_cols} ---")
        for col in categorical_cols:
            print(f"\nValue Counts: {col}")
            print(df[col].value_counts(dropna=False))
    else:
        print("\nNo categorical columns found.")

####################################
# 7) 歪度(Skewness)・尖度(Kurtosis) の確認
####################################
def check_skew_kurtosis(df, numeric_cols=None, df_name="DataFrame"):
    """
    数値列に対して、歪度(skew)・尖度(kurtosis)を計算して一覧表示。
    - |skew|が大きい(>1等)と分布が強く歪んでいる
    - kurtosisが大きいと裾が重い分布を示す
    """
    print(f"\n====== Skewness & Kurtosis: {df_name} ======")
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        print("No numeric columns found.")
        return
    stats = []
    for col in numeric_cols:
        col_skew = df[col].skew()
        col_kurt = df[col].kurt()
        stats.append((col, col_skew, col_kurt))
    # 表示
    print("Column         Skewness       Kurtosis")
    for (c, s, k) in stats:
        print(f"{c:<14} {s:>10.4f} {k:>10.4f}")

####################################
# 8) 外れ値の簡易チェック (Zスコア or IQR)
####################################
def check_outliers(df, numeric_cols=None, method='zscore', threshold=3.0, df_name="DataFrame"):
    """
    - method='zscore' の場合、Zスコアの絶対値が threshold を超えるデータ数をカウント
    - method='iqr' の場合、1.5倍IQRを超えるデータ数をカウント
    ※あくまで目安。実際の外れ値処理は手動確認推奨。
    """
    print(f"\n====== Outlier Check ({method}): {df_name} ======")
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if not numeric_cols:
        print("No numeric columns found.")
        return

    if method == 'zscore':
        # Zスコア計算: (x - mean) / std
        df_zscore = (df[numeric_cols] - df[numeric_cols].mean()) / df[numeric_cols].std()
        outlier_count = (df_zscore.abs() > threshold).sum()
        for col in numeric_cols:
            if outlier_count[col] > 0:
                print(f"{col}: {outlier_count[col]} outliers (threshold={threshold})")
    elif method == 'iqr':
        Q1 = df[numeric_cols].quantile(0.25)
        Q3 = df[numeric_cols].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        for col in numeric_cols:
            lb = lower_bound[col]
            ub = upper_bound[col]
            mask_outliers = (df[col] < lb) | (df[col] > ub)
            out_count = mask_outliers.sum()
            if out_count > 0:
                print(f"{col}: {out_count} outliers (IQR method)")
    else:
        print("Unknown method. Use 'zscore' or 'iqr'.")

####################################
# 9) 相関行列の表示
####################################
def check_correlations(df, numeric_cols=None, df_name="DataFrame"):
    """
    数値列同士の相関係数を表示。
    """
    print(f"\n====== Correlation Matrix: {df_name} ======")
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols].corr()
        print(corr_matrix)
    else:
        print("Not enough numeric columns for correlation.")

####################################
# 10) ターゲット列との相関・統計など (回帰/分類 タスク前提)
####################################
def check_target_relation(df, target_col, numeric_cols=None, df_name="DataFrame"):
    """
    目的変数が既にわかっている場合に、その変数との相関を出すなどの一元的な確認。
    - 数値ターゲットなら相関を算出
    - カテゴリターゲットならvalue_countsやカテゴリ別の平均をチェック、など
    """
    print(f"\n====== Target Relation Check: {df_name} (Target={target_col}) ======")
    if target_col not in df.columns:
        print(f"Target column '{target_col}' not found in {df_name}.")
        return
    if numeric_cols is None:
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # カラムが数値かどうか判断
    if np.issubdtype(df[target_col].dtype, np.number):
        # 数値ターゲット: 相関を表示
        if len(numeric_cols) > 1:
            corr_with_target = df[numeric_cols].corrwith(df[target_col]).sort_values(ascending=False)
            print("\n--- Correlation with target (descending) ---")
            print(corr_with_target)
        else:
            print("Not enough numeric columns to check correlation.")
    else:
        # カテゴリターゲット: 分布確認
        print(f"\n--- Category Target: {target_col} Value Counts ---")
        print(df[target_col].value_counts(dropna=False))

####################################
# メイン関数例
####################################
def main():
    # === 1) データ読み込み (パスは各自で変更) ===
    train_df, test_df = load_data('/content/drive/MyDrive/Colab Notebooks/standard/house-prices-advanced-regression-techniques/data/train.csv', '/content/drive/MyDrive/Colab Notebooks/standard/house-prices-advanced-regression-techniques/data/test.csv')

    # === 2) 基本情報の確認 ===
    show_basic_info(train_df, df_name="Train")
    if test_df is not None:
        show_basic_info(test_df, df_name="Test")

    # === 3) 欠損値の確認 ===
    check_missing_values(train_df, df_name="Train")
    if test_df is not None:
        check_missing_values(test_df, df_name="Test")

    # === 4) 重複データの確認 ===
    check_duplicates(train_df, df_name="Train")
    if test_df is not None:
        check_duplicates(test_df, df_name="Test")

    # === 5) ユニーク値・カーディナリティ確認 ===
    check_unique_values(train_df, df_name="Train", top_n=5)
    if test_df is not None:
        check_unique_values(test_df, df_name="Test", top_n=5)

    # === 6) 基本統計量の確認 ===
    show_descriptive_stats(train_df, df_name="Train")
    if test_df is not None:
        show_descriptive_stats(test_df, df_name="Test")

    # === 7) 歪度・尖度の確認 ===
    check_skew_kurtosis(train_df, df_name="Train")
    if test_df is not None:
        check_skew_kurtosis(test_df, df_name="Test")

    # === 8) 外れ値の簡易チェック ===
    check_outliers(train_df, method='zscore', threshold=3.0, df_name="Train")
    # check_outliers(train_df, method='iqr', df_name="Train")
    if test_df is not None:
        check_outliers(test_df, method='zscore', df_name="Test")

    # === 9) 相関行列の表示 ===
    check_correlations(train_df, df_name="Train")
    if test_df is not None:
        check_correlations(test_df, df_name="Test")

    # === 10) ターゲット列がある場合の確認 (例: 'SalePrice', 'Survived' etc) ===
    # データセットが回帰か分類かによってここを使い分け。
    # 例: 回帰であれば 'SalePrice' (数値)
    # 例: 分類であれば 'Survived' (0/1)
    target_col = 'SalePrice'  # ここは例。該当しないならスキップ
    check_target_relation(train_df, target_col, df_name="Train")

    # --- 他にも集計や深堀りしたい場合は各自追加 ---

if __name__ == "__main__":
    main()



Shape: (1460, 81)

--- Head(5) ---
   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleConditio

In [9]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [11]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# 回帰用モデル
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

RANDOM_STATE = 42

##############################################################################
# ユーザー指定: 欠損が無い数値列だけ（ベースライン用）
cols = [
    "MSSubClass", "LotArea", "OverallQual", "OverallCond",
    "YearBuilt", "YearRemodAdd", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr",
    "TotRmsAbvGrd", "Fireplaces", "WoodDeckSF", "OpenPorchSF",
    "EnclosedPorch", "3SsnPorch", "ScreenPorch", "PoolArea",
    "MiscVal", "MoSold", "YrSold"
]
##############################################################################

def load_data(train_path, test_path=None):
    """
    任意のCSV読み込み用。必要があればtrain/test両方を返す。
    """
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path) if test_path is not None else None
    return train_df, test_df

def fill_missing_values(df):
    """
    汎用的な欠損補完例（回帰タスクでも同じ）。
    - 今回は「欠損がない列だけを使う」ためほぼ影響ないが、一応置いてある。
    """
    for col in df.columns:
        if df[col].dtype in [np.float64, np.int64]:
            df[col].fillna(df[col].median(), inplace=True)
        else:
            df[col].fillna('Unknown', inplace=True)
    return df

def feature_engineering(df):
    """
    汎用的な特徴量作成の例。
    - 今回は特になし。
    """
    return df

def make_features_target(df, target_col, feature_cols):
    """
    引数で「目的変数の列名」と「使いたい特徴量の列名リスト」を受け取り、X(features)とy(target)を返す。
    ダミー変数化などは必要に応じて行うが、今回はすべて数値列のため不要。
    """
    y = df[target_col]
    X = df[feature_cols].copy()

    # すべて数値列なのでダミー化不要、もし一部にobject型があれば下記を使う
    # cat_cols = [c for c in feature_cols if df[c].dtype == 'object']
    # X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

    return X, y

def define_models():
    """
    回帰タスク用の主要モデルを定義して返す。
      1. LinearRegression
      2. SVR
      3. KNeighborsRegressor
      4. GradientBoostingRegressor
      5. RandomForestRegressor
      6. XGBRegressor
      7. LGBMRegressor
      8. CatBoostRegressor
    """
    return {
        "LinearReg": LinearRegression(),
        "SVR": SVR(),
        "KNeighbors": KNeighborsRegressor(),
        "GradientBoost": GradientBoostingRegressor(random_state=RANDOM_STATE),
        "RandomForest": RandomForestRegressor(random_state=RANDOM_STATE),
        "XGBoost": XGBRegressor(random_state=RANDOM_STATE),
        "LightGBM": LGBMRegressor(random_state=RANDOM_STATE),
        "CatBoost": CatBoostRegressor(verbose=0, random_state=RANDOM_STATE),
    }

def define_param_grids():
    """
    回帰モデルでGridSearchCVで探索するパラメータ。
    必要に応じて各モデルのパラメータを調整。
    """
    return {
        "GradientBoost": {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.05],
            "max_depth": [3, 5],
        },
        "RandomForest": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
        },
        "XGBoost": {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.05],
            "max_depth": [3, 5],
            "colsample_bytree": [1.0, 0.8],
        },
        "LightGBM": {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.05],
            "max_depth": [5, 10, -1],
            "colsample_bytree": [1.0, 0.8],
        },
        "CatBoost": {
            "iterations": [100, 200],
            "learning_rate": [0.1, 0.05],
            "depth": [3, 5],
        }
    }

def train_and_evaluate(models, param_grids, X_train_scaled, y_train, X_test_scaled, y_test):
    """
    共通の学習・評価フロー。
      - 交差検証(CV)はRMSEを評価指標とし、GridSearchCV あるいは cross_val_score を使用
      - 最終的にテストデータも MSE / R^2 を計算する
    """
    results = {}
    cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

    def rmse_cv_score(model, X_train, y_train):
        mse_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_error')
        rmse_scores = np.sqrt(-mse_scores)  # 負を外して平方根
        return rmse_scores

    for name, model in models.items():
        # 該当モデルにパラメータグリッドがあればGridSearch、無ければクロスバリデーションのみ
        if name in param_grids:
            param_grid = param_grids[name]
            grid_search = GridSearchCV(
                model,
                param_grid,
                cv=cv,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
                verbose=0
            )
            grid_search.fit(X_train_scaled, y_train)

            best_estimator = grid_search.best_estimator_
            best_cv_score_neg_mse = grid_search.best_score_
            best_cv_score_rmse = np.sqrt(-best_cv_score_neg_mse)

            test_pred = best_estimator.predict(X_test_scaled)
            test_mse = mean_squared_error(y_test, test_pred)
            test_rmse = np.sqrt(test_mse)
            test_r2 = r2_score(y_test, test_pred)

            results[name] = {
                "Best Params": grid_search.best_params_,
                "CV Mean RMSE": f"{best_cv_score_rmse:.4f}",
                "Test RMSE": f"{test_rmse:.4f}",
                "Test R^2": f"{test_r2:.4f}"
            }
        else:
            # cross_val_scoreでRMSEを計算
            rmse_scores = rmse_cv_score(model, X_train_scaled, y_train)
            cv_mean_rmse = rmse_scores.mean()

            model.fit(X_train_scaled, y_train)
            test_pred = model.predict(X_test_scaled)
            test_mse = mean_squared_error(y_test, test_pred)
            test_rmse = np.sqrt(test_mse)
            test_r2 = r2_score(y_test, test_pred)

            results[name] = {
                "CV Mean RMSE": f"{cv_mean_rmse:.4f}",
                "Test RMSE": f"{test_rmse:.4f}",
                "Test R^2": f"{test_r2:.4f}"
            }
    return results

def main():
    # 1) データ読み込み
    train_df, _ = load_data('/content/drive/MyDrive/Colab Notebooks/standard/house-prices-advanced-regression-techniques/data/train.csv')  # test.csvが必要なら第二引数を渡す

    # 2) 前処理 & 特徴量作成
    train_df = fill_missing_values(train_df)
    train_df = feature_engineering(train_df)

    # 3) X, yの作成
    target_col = 'SalePrice'
    feature_cols = cols  # ユーザー指定のカラムを使用
    X, y = make_features_target(train_df, target_col, feature_cols)

    # 4) train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_STATE
    )

    # 5) スケーリング
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 6) モデル定義 & パラメータ
    models = define_models()
    param_grids = define_param_grids()

    # 7) 学習 & 評価
    results = train_and_evaluate(models, param_grids, X_train_scaled, y_train, X_test_scaled, y_test)

    # 8) 結果表示
    print("===== Model Results (Regression) =====")
    for model_name, vals in results.items():
        if "Best Params" in vals:
            print(f"{model_name}:")
            print(f"  Best Params : {vals['Best Params']}")
            print(f"  CV Mean RMSE: {vals['CV Mean RMSE']}")
            print(f"  Test RMSE   : {vals['Test RMSE']}")
            print(f"  Test R^2    : {vals['Test R^2']}")
        else:
            print(f"{model_name}:")
            print(f"  CV Mean RMSE: {vals['CV Mean RMSE']}")
            print(f"  Test RMSE   : {vals['Test RMSE']}")
            print(f"  Test R^2    : {vals['Test R^2']}")

if __name__ == "__main__":
    main()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000247 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1515
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 24
[LightGBM] [Info] Start training from score 181441.541952
===== Model Results (Regression) =====
LinearReg:
  CV Mean RMSE: 37151.7388
  Test RMSE   : 38075.4202
  Test R^2    : 0.8110
SVR:
  CV Mean RMSE: 78821.0996
  Test RMSE   : 88620.8770
  Test R^2    : -0.0239
KNeighbors:
  CV Mean RMSE: 39559.5519
  Test RMSE   : 42049.1652
  Test R^2    : 0.7695
GradientBoost:
  Best Params : {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 200}
  CV Mean RMSE: 29486.6745
  Test RMSE   : 29376.3365
  Test R^2    : 0.8875
RandomForest:
  Best Params : {'max_depth': None, 'n_estimators': 200}
  CV Mean RMSE: 31850.7677
  Test RMSE   : 30347.96