In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# ------------------ 1. Load Dataset ------------------ #
def load_data(filepath):
    """Load the Titanic dataset from a CSV file and print a quick overview.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The data exactly as read from disk.
    """
    df = pd.read_csv(filepath)
    print(df.head())  # Display the first 5 rows
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    df.info()  # Dataset overview
    print(df.describe())  # Statistical summary
    return df

# ------------------ 2. Data Preprocessing ------------------ #
def preprocess_data(df):
    """Impute missing values, drop unused columns and encode categoricals.

    The input frame is left untouched; a new frame is built and returned, so
    the function can be called repeatedly on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the columns ``Embarked``, ``Age``,
        ``Sex``, ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        A new frame in which
        * missing ``Embarked`` values are filled with the column's mode,
        * missing ``Age`` values are filled with the column's mean,
        * ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` are removed,
        * ``Sex`` is mapped to 0 (male) / 1 (female),
        * ``Embarked`` is mapped to 0 (C) / 1 (Q) / 2 (S).

    Raises
    ------
    KeyError
        If one of the columns to drop is missing from ``df``.
    """
    # Fill missing values (fillna without inplace returns a new frame)
    fill_values = {
        'Embarked': df['Embarked'].mode()[0],
        'Age': df['Age'].mean(),
    }
    cleaned = df.fillna(fill_values)

    # Drop irrelevant columns
    cleaned = cleaned.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

    # Encode categorical variables; values outside the mappings become NaN,
    # exactly as Series.map does for unknown keys.
    return cleaned.assign(
        Sex=cleaned['Sex'].map({'male': 0, 'female': 1}),
        Embarked=cleaned['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}),
    )

# ------------------ 3. Feature Engineering ------------------ #
def featEng(df):
    """Derive family-based features without modifying the input frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the numeric columns ``SibSp``, ``Parch`` and ``Fare``.

    Returns
    -------
    pandas.DataFrame
        A new frame with three extra columns appended in this order:
        ``FamilySize`` (``SibSp + Parch + 1``), ``isAlone`` (1 when
        ``FamilySize`` is 1, else 0) and ``FarePerPerson``
        (``Fare / FamilySize``).
    """
    # The "+ 1" counts the passenger themselves.
    family_size = df['SibSp'] + df['Parch'] + 1

    # assign() returns a fresh frame, so the caller's data is not altered.
    return df.assign(
        FamilySize=family_size,
        isAlone=(family_size == 1).astype(int),
        FarePerPerson=df['Fare'] / family_size,
    )

# ------------------ 4. Split Data ------------------ #
def split_data(df):
    """Select the model features and target, then make an 80/20 split.

    Prints the shapes of the feature matrix and the target before splitting.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as produced by
        ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    """
    feature_columns = [
        'Sex', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked',
        'FamilySize', 'isAlone', 'FarePerPerson',
    ]
    features = df[feature_columns]
    target = df['Survived']

    print("Shape of X:", features.shape)
    print("Shape of y:", target.shape)

    # train_test_split returns a list; callers receive a 4-tuple.
    return tuple(
        train_test_split(features, target, test_size=0.2, random_state=42)
    )

# ------------------ 5. Model Training Functions ------------------ #

def train_logistic_regression(X_train, y_train):
    """Fit and return a LogisticRegression (max_iter=500, random_state=42)."""
    # fit() returns the estimator itself, so the fitted model can be
    # returned straight from the chained call.
    return LogisticRegression(max_iter=500, random_state=42).fit(X_train, y_train)

def train_knn(X_train, y_train):
    """Fit and return a 5-nearest-neighbours classifier."""
    knn = KNeighborsClassifier(n_neighbors=5)
    return knn.fit(X_train, y_train)  # fit() hands back the same estimator

def train_gbc(X_train, y_train):
    """Fit and return a GradientBoostingClassifier with fixed hyper-parameters."""
    hyper_params = {
        'n_estimators': 100,
        'learning_rate': 0.1,
        'max_depth': 5,
        'random_state': 42,
    }
    booster = GradientBoostingClassifier(**hyper_params)
    booster.fit(X_train, y_train)
    return booster

def train_random_forest(X_train, y_train):
    """Tune a RandomForestClassifier with a 5-fold grid search.

    Prints the best parameter combination found and returns the estimator
    refitted with those parameters.
    """
    search_space = dict(
        n_estimators=[100, 200, 300],
        max_depth=[5, 10, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
    )
    forest = RandomForestClassifier(random_state=42)

    # n_jobs=-1 lets the search use every available CPU core.
    searcher = GridSearchCV(forest, search_space, cv=5, n_jobs=-1)
    searcher.fit(X_train, y_train)

    print("Best Parameters:", searcher.best_params_)
    return searcher.best_estimator_

# ------------------ 6. Model Evaluation ------------------ #
def evaluate_model(model, X_test, y_test):
    """Print the accuracy and classification report of ``model`` on the test set.

    ``model`` only needs a ``predict`` method. Nothing is returned.
    """
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    print(f"Accuracy: {score:.2f}")

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

# ------------------ Main Execution ------------------ #
if __name__ == "__main__":
    import os

    # Location of the training CSV. Set the TITANIC_TRAIN_CSV environment
    # variable to run this on another machine; the original author's path is
    # kept as the default so existing runs behave as before.
    filepath = os.environ.get(
        "TITANIC_TRAIN_CSV",
        'C:\\Users\\agrah\\OneDrive\\careers\\projects\\titanic survival rate predictions\\data\\train.csv',
    )

    # Load and process data
    df = load_data(filepath)
    df = preprocess_data(df)
    df = featEng(df)

    # Split data
    X_train, X_test, y_train, y_test = split_data(df)

    # Define models: display name -> function that fits and returns a model
    models = {
        "Gradient Boosting": train_gbc,
        "KNN": train_knn,
        "Logistic Regression": train_logistic_regression,
        "Random Forest": train_random_forest
    }

    # list(...) prints the plain names instead of the dict_keys([...]) repr
    print("\nModels to train:", list(models))

    # Train and evaluate each model
    for name, model_func in models.items():
        print(f"\nTraining {name} model...")
        model = model_func(X_train, y_train)
        evaluate_model(model, X_test, y_test)


XGBoost Accuracy: 0.81
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       105
           1       0.81      0.70      0.75        74

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4               

In [19]:
# Import XGBoost
# NOTE(review): this cell defines none of X_train, y_train, X_test, y_test,
# accuracy_score or classification_report itself; it relies on the cell that
# imports scikit-learn and runs the train/test split having been executed in
# the same kernel beforehand.
from xgboost import XGBClassifier

# Initialize and train the XGBoost model
xgb_model = XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict using the new model
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb:.2f}")
print("XGBoost Classification Report:\n", classification_report(y_test, y_pred_xgb))


XGBoost Accuracy: 0.81
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.89      0.85       105
           1       0.81      0.70      0.75        74

    accuracy                           0.81       179
   macro avg       0.81      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [3]:
# Environment diagnostic: print the path of the Python interpreter that is
# running this kernel.
import sys
print(sys.executable)



C:\Users\agrah\anaconda3\python.exe
