In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Load the Titanic passenger data from a public CSV.
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# The target is the 'Survived' column; every remaining column is kept as a feature.
y = df['Survived']
X = df.drop(columns='Survived')

# Column groups for preprocessing, chosen purely by dtype.
# NOTE(review): because selection is dtype-based, any identifier-like or
# free-text columns present in the CSV are fed to the models as well —
# confirm that this is intended.
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Numeric columns: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the literal 'missing', then
# one-hot encode; categories not seen during fit are ignored instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [13]:
# Candidate classifiers to compare, keyed by the name shown in the report.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

# One record (a plain dict) per model; converted to a DataFrame in a later cell.
report_data = []

for model_name, model in models.items():
    # Preprocessing followed by the classifier, fitted on the training split.
    # Note: the same `preprocessor` object is passed to every pipeline.
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model),
    ]).fit(X_train, y_train)

    # Score the fitted pipeline on the held-out test split.
    predictions = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    # Record this model's result.
    report_data.append(dict(Model=model_name, Accuracy=accuracy))

In [14]:
# Turn the collected per-model records into a table ordered from the highest
# accuracy to the lowest.
report_df = (
    pd.DataFrame(report_data)
    .sort_values('Accuracy', ascending=False)
)

# Show the report table as plain text.
print(report_df)

                 Model  Accuracy
1  Logistic Regression  0.820896
3    Gradient Boosting  0.820896
0        Random Forest  0.809701
2        Decision Tree  0.798507
4  K-Nearest Neighbors  0.798507
