In [11]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,StackingClassifier
from scipy.stats import mode

# Load the Titanic dataset (replace 'data' with 'df')
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)

# Preprocess the data
df['Age'].fillna(df['Age'].median(), inplace=True)  # Use df instead of data
df['Embarked'].fillna('S', inplace=True)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features].values  # Use df instead of data
y = df['Survived'].values

# Train-test split
np.random.seed(42)
indices = np.random.permutation(len(X))
train_size = int(0.8 * len(X))
train_idx, test_idx = indices[:train_size], indices[train_size:]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Initialize models
log_model = LogisticRegression(random_state=42, max_iter=200)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Train models
log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Get predictions
log_pred = log_model.predict(X_test)
rf_pred = rf_model.predict(X_test)
gb_pred = gb_model.predict(X_test)

# Combine predictions using majority voting
all_preds = np.array([log_pred, rf_pred, gb_pred])
voting_pred, _ = mode(all_preds, axis=0)

# Evaluate the Voting Classifier
voting_acc = np.mean(voting_pred.flatten() == y_test)
print(f"Voting Classifier Accuracy: {voting_acc:.2f}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)  # Use df instead of data
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)


Voting Classifier Accuracy: 0.82


In [15]:
import pandas as pd
import numpy as np

# Load the Titanic dataset 
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)


# Preprocess the data
df['Age'].fillna(df['Age'].median(), inplace=True)  # Use df instead of data
df['Embarked'].fillna('S', inplace=True)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
df['Embarked'] = df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

# Select features and target
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = df[features].values  # Use df instead of data
y = df['Survived'].values

# Train-test split
np.random.seed(42)
indices = np.random.permutation(len(X))
train_size = int(0.8 * len(X))
train_idx, test_idx = indices[:train_size], indices[train_size:]
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)  # Use df instead of data
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna('S', inplace=True)


In [17]:
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(random_state=42, max_iter=200)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
log_acc = np.mean(log_pred == y_test)
print(f"Logistic Regression Accuracy: {log_acc:.2f}")


Logistic Regression Accuracy: 0.78


In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_acc = np.mean(rf_pred == y_test)
print(f"Random Forest Accuracy: {rf_acc:.2f}")


Random Forest Accuracy: 0.83


In [21]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)
gb_acc = np.mean(gb_pred == y_test)
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")


Gradient Boosting Accuracy: 0.82


In [35]:
from scipy.stats import mode

# Combine predictions
all_preds = np.array([log_pred, rf_pred, gb_pred])
voting_pred, _ = mode(all_preds, axis=0)

# Evaluate Voting Classifier
voting_acc = np.mean(voting_pred.flatten() == y_test)
print(f"Voting Classifier Accuracy: {voting_acc:.2f}")

print(f"Logistic Regression Accuracy: {log_acc:.2f}")
print(f"Random Forest Accuracy: {rf_acc:.2f}")
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")


Voting Classifier Accuracy: 0.82
Logistic Regression Accuracy: 0.78
Random Forest Accuracy: 0.83
Gradient Boosting Accuracy: 0.82


In [37]:
from sklearn.ensemble import StackingClassifier

stacking_clf = StackingClassifier(
    estimators=[
        ('log', log_model),
        ('rf', rf_model),
        ('gb', gb_model)
        
    ],
    final_estimator=LogisticRegression(random_state=42)
)

# Train the stacking model
stacking_clf.fit(X_train, y_train)

# Evaluate the ensemble model
stacking_acc = stacking_clf.score(X_test, y_test)
print(f"Stacking Ensemble Accuracy: {stacking_acc:.2f}")

# Optionally, check individual model accuracies
log_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

log_acc = log_model.score(X_test, y_test)
rf_acc = rf_model.score(X_test, y_test)
gb_acc = gb_model.score(X_test, y_test)

print(f"Logistic Regression Accuracy: {log_acc:.2f}")
print(f"Random Forest Accuracy: {rf_acc:.2f}")
print(f"Gradient Boosting Accuracy: {gb_acc:.2f}")


Stacking Ensemble Accuracy: 0.82
Logistic Regression Accuracy: 0.78
Random Forest Accuracy: 0.83
Gradient Boosting Accuracy: 0.82
