<a href="https://colab.research.google.com/github/kodani-73/Desktop/blob/main/analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.impute import SimpleImputer

In [31]:
# Load the training set, the test set and the sample submission from /content.
train_df = pd.read_csv("/content/train.csv")
# NOTE(review): on_bad_lines="skip" drops malformed rows without raising, and it is applied
# to the test set only, so test_df may end up with fewer rows than sample_submission_df.
test_df = pd.read_csv("/content/test.csv", on_bad_lines="skip")
sample_submission_df = pd.read_csv("/content/sample_submission.csv")

In [32]:
# Separate the label column ('target') from the remaining feature columns.
y = train_df["target"]
X = train_df.drop("target", axis=1)

In [33]:
# Handle missing values: replace them with the per-column mean learned from X.
imputer = SimpleImputer(strategy="mean")
imputer.fit(X)
X_imputed = imputer.transform(X)
# The test set is filled with the same means that were learned from X.
# NOTE(review): assumes test_df has exactly the same columns as X — TODO confirm.
test_imputed = imputer.transform(test_df)

In [34]:
# Standardize the imputed features; the test set reuses the statistics fitted here.
# NOTE(review): the scaler is fitted on all of X_imputed before the train/validation
# split below, so the validation rows contribute to the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_imputed)
X_scaled = scaler.transform(X_imputed)
test_scaled = scaler.transform(test_imputed)

# Hold out 20% of the rows as a validation set (fixed seed for a repeatable split).
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.2,
)

In [35]:
# Logistic regression baseline (default hyperparameters).
log_reg = LogisticRegression().fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_val)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_pred_proba_log_reg = log_reg.predict_proba(X_val)[:, 1]

# Random forest baseline: 100 trees, fixed seed.
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_val)
y_pred_proba_rf = rf_clf.predict_proba(X_val)[:, 1]

# Report accuracy and ROC AUC on the validation set, one model after the other.
for label, predicted, scores in (
    ('Logistic Regression', y_pred_log_reg, y_pred_proba_log_reg),
    ('Random Forest', y_pred_rf, y_pred_proba_rf),
):
    print(f'{label} Accuracy: {accuracy_score(y_val, predicted)}')
    print(f'{label} ROC AUC: {roc_auc_score(y_val, scores)}')

Logistic Regression Accuracy: 0.7654
Logistic Regression ROC AUC: 0.832920687967261
Random Forest Accuracy: 0.9134
Random Forest ROC AUC: 0.9600960364004213


In [36]:
# Reduce the standardized features to two principal components.
# random_state is pinned so the projection is repeatable when scikit-learn selects a
# randomized SVD solver (it has no effect with the exact solvers).
# NOTE(review): PCA and KMeans are fitted on all of X_scaled, i.e. before the
# train/validation split below.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# KMeans clustering of the 2-D projection into two groups.
# n_init is passed explicitly: leaving it out emits scikit-learn's FutureWarning about the
# default changing from 10 to 'auto', and the clustering would silently change once that
# new default applies. 10 is the default the recorded run used.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_pca)

# Append the cluster label as one extra feature column.
X_with_clusters = np.hstack((X_scaled, clusters.reshape(-1, 1)))

# Split again with the same test_size and seed as the earlier train/validation split.
X_train_wc, X_val_wc, y_train_wc, y_val_wc = train_test_split(X_with_clusters, y, test_size=0.2, random_state=42)

# Retrain logistic regression on the augmented feature matrix.
log_reg_wc = LogisticRegression()
log_reg_wc.fit(X_train_wc, y_train_wc)
y_pred_log_reg_wc = log_reg_wc.predict(X_val_wc)
# Column 1 of predict_proba is used as the score for ROC AUC.
y_pred_proba_log_reg_wc = log_reg_wc.predict_proba(X_val_wc)[:, 1]

# Evaluate on the validation set.
print(f'Logistic Regression with Clustering Accuracy: {accuracy_score(y_val_wc, y_pred_log_reg_wc)}')
print(f'Logistic Regression with Clustering ROC AUC: {roc_auc_score(y_val_wc, y_pred_proba_log_reg_wc)}')

  super()._check_params_vs_input(X, default_n_init=10)


Logistic Regression with Clustering Accuracy: 0.7653
Logistic Regression with Clustering ROC AUC: 0.8332350171330996


In [37]:
# Predict on the test set: build the same "scaled features + cluster label" matrix
# that log_reg_wc was trained on, then take column 1 of predict_proba as the score.
test_clusters = kmeans.predict(pca.transform(test_scaled)).reshape(-1, 1)
test_with_clusters = np.hstack((test_scaled, test_clusters))
test_pred_proba = log_reg_wc.predict_proba(test_with_clusters)[:, 1]

# Build the submission from a copy of the sample file so its layout is preserved.
submission_df = sample_submission_df.copy()

# Fail loudly if the number of predictions does not match the submission template.
# Previously a mismatch only printed a warning and the file was still written, which
# produced a submission.csv that contained no predictions at all.
if len(test_pred_proba) != len(submission_df):
    raise ValueError(
        f"Length mismatch. Predicted: {len(test_pred_proba)}, Submission: {len(submission_df)}. "
        "Rows may have been dropped while reading test.csv (it is read with on_bad_lines='skip'); "
        "submission.csv was not written."
    )

# Predictions are assigned positionally, in the row order of the test set.
submission_df['target'] = test_pred_proba

# Save the submission file.
submission_df.to_csv('/content/submission.csv', index=False)


