In [28]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# ------------------ 1. Load Dataset ------------------ #
def load_data(filepath):
    """Read the CSV at ``filepath`` and print a quick overview of it.

    The first rows, the column/dtype summary and the descriptive statistics
    are written to standard output, in that order.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded data.
    """
    df = pd.read_csv(filepath)
    print(df.head())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    df.info()
    print(df.describe())
    return df

# ------------------ 2. Data Preprocessing ------------------ #
def preprocess_data(df):
    """Return a cleaned, numerically encoded copy of the passenger frame.

    Steps, in order:
      * missing ``Embarked`` values are filled with the column's most frequent
        value and missing ``Age`` values with the column mean;
      * the ``PassengerId``, ``Name``, ``Ticket`` and ``Cabin`` columns are
        dropped;
      * ``Sex`` is encoded as male=0 / female=1 and ``Embarked`` as
        C=0 / Q=1 / S=2 (any other value becomes NaN, as with ``Series.map``).

    The input frame is left untouched; earlier versions modified it in place,
    which made the function fail or corrupt data when run twice on the same
    object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations applied.
    """
    fill_values = {
        'Embarked': df['Embarked'].mode()[0],
        'Age': df['Age'].mean(),
    }
    # fillna() and drop() both return new frames, so the caller's object is
    # never modified.
    cleaned = df.fillna(fill_values).drop(
        ['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1
    )
    cleaned['Sex'] = cleaned['Sex'].map({'male': 0, 'female': 1})
    cleaned['Embarked'] = cleaned['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})
    return cleaned

# ------------------ 3. Feature Engineering ------------------ #
def featEng(df):
    """Return a copy of ``df`` with three derived family/fare columns added.

    Added columns:
      * ``FamilySize``    - ``SibSp + Parch + 1`` (the ``+ 1`` presumably
        counts the passenger themself);
      * ``isAlone``       - 1 where ``FamilySize == 1``, otherwise 0;
      * ``FarePerPerson`` - ``Fare`` divided by ``FamilySize``.

    The input frame is not modified; ``DataFrame.assign`` builds a new frame,
    so calling this repeatedly on the same object is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``SibSp``, ``Parch`` and ``Fare`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame containing the original columns plus the derived ones.
    """
    family_size = df['SibSp'] + df['Parch'] + 1
    return df.assign(
        FamilySize=family_size,
        isAlone=(family_size == 1).astype(int),
        FarePerPerson=df['Fare'] / family_size,
    )

# ------------------ 4. Data Visualization ------------------ #
def visualize_data(df):
    """Display a survival count plot and a correlation heatmap for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Survived`` column. Only numeric and boolean columns
        take part in the correlation heatmap.

    Returns
    -------
    None
        The function only draws and shows two figures.
    """
    sns.countplot(x='Survived', data=df)
    plt.title('Survival Count')
    plt.show()

    # Since pandas 2.0, DataFrame.corr() raises on string columns instead of
    # silently skipping them, so restrict the input to numeric/bool columns.
    # This keeps the function usable on a frame that has not been encoded yet.
    numeric_df = df.select_dtypes(include=['number', 'bool'])
    plt.figure(figsize=(10, 6))
    sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.title('Correlation Heatmap')
    plt.show()

# ------------------ 5. Split Data ------------------ #
def split_data(df):
    """Split ``df`` into train/test feature matrices and target vectors.

    Ten predictor columns are selected as features and ``Survived`` is the
    target. 20% of the rows are held out for testing, with a fixed seed (42).

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = [
        'Sex', 'Pclass', 'Age', 'SibSp', 'Parch',
        'Fare', 'Embarked', 'FamilySize', 'isAlone', 'FarePerPerson',
    ]
    train_X, test_X, train_y, test_y = train_test_split(
        df[feature_columns],
        df['Survived'],
        test_size=0.2,
        random_state=42,
    )
    return train_X, test_X, train_y, test_y

# ------------------ 6. Model Training Functions ------------------ #
def train_logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier on the training split.

    The estimator is created with ``max_iter=500`` and ``random_state=42``;
    the fitted estimator is returned.
    """
    settings = {'max_iter': 500, 'random_state': 42}
    classifier = LogisticRegression(**settings)
    classifier.fit(X_train, y_train)
    return classifier

def train_knn(X_train, y_train):
    """Fit a k-nearest-neighbours classifier (k = 5) on the training split.

    NOTE(review): the features are used exactly as passed in; nothing here
    scales them - confirm whether that is intended for a distance-based model.
    """
    neighbour_count = 5
    knn = KNeighborsClassifier(n_neighbors=neighbour_count)
    knn.fit(X_train, y_train)
    return knn

def train_gbc(X_train, y_train):
    """Fit a gradient-boosting classifier on the training split.

    Hyper-parameters are fixed: 100 estimators, learning rate 0.1, maximum
    depth 5 and ``random_state=42``. Returns the fitted estimator.
    """
    hyperparams = dict(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42,
    )
    booster = GradientBoostingClassifier(**hyperparams)
    booster.fit(X_train, y_train)
    return booster

def train_random_forest(X_train, y_train):
    """Tune a random forest with a 5-fold grid search and return the best one.

    The grid covers ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` (two candidate values each). The winning parameter
    combination is printed before the best estimator is returned.
    """
    search_space = {
        'n_estimators': [100, 150],
        'max_depth': [5, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
    }
    base_forest = RandomForestClassifier(random_state=42)
    # n_jobs=-1 is forwarded to GridSearchCV unchanged.
    tuner = GridSearchCV(base_forest, search_space, cv=5, n_jobs=-1)
    tuner.fit(X_train, y_train)
    print("Best Parameters:", tuner.best_params_)
    return tuner.best_estimator_

def train_xgboost(X_train, y_train):
    """Fit an XGBoost classifier on the training split.

    Hyper-parameters are fixed: 200 estimators, maximum depth 5, learning
    rate 0.05 and ``random_state=42``. Returns the fitted estimator.
    """
    hyperparams = dict(
        n_estimators=200,
        max_depth=5,
        learning_rate=0.05,
        random_state=42,
    )
    xgb_model = XGBClassifier(**hyperparams)
    xgb_model.fit(X_train, y_train)
    return xgb_model

# ------------------ 7. Model Evaluation ------------------ #
def evaluate_model(model, X_test, y_test):
    """Score ``model`` on the test split, print the results and return accuracy.

    Prints the accuracy (two decimals) followed by the classification report.

    Parameters
    ----------
    model
        Any fitted object exposing ``predict(X)``.
    X_test, y_test
        Held-out features and their true labels.

    Returns
    -------
    float
        The accuracy score, so callers can compare or rank models instead of
        parsing printed text. Callers that ignore the return value are
        unaffected.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return accuracy

# ------------------ Main Execution ------------------ #
if __name__ == "__main__":
    import os

    # The default below is an absolute path on the original author's machine.
    # Set the TITANIC_TRAIN_CSV environment variable to point at the training
    # CSV elsewhere, so the pipeline can run without editing this cell.
    default_filepath = r'C:\Users\agrah\OneDrive\Desktop\titanic survival rate predictions\data\train.csv'
    filepath = os.environ.get('TITANIC_TRAIN_CSV', default_filepath)

    # Load -> clean/encode -> add derived features -> train/test split.
    df = load_data(filepath)
    df = preprocess_data(df)
    df = featEng(df)
    X_train, X_test, y_train, y_test = split_data(df)

    # Every model is trained here, while the dict is built, so all training
    # output (e.g. the grid-search "Best Parameters" line) precedes evaluation.
    models = {
        "Gradient Boosting": train_gbc(X_train, y_train),
        "KNN": train_knn(X_train, y_train),
        "Logistic Regression": train_logistic_regression(X_train, y_train),
        "Random Forest": train_random_forest(X_train, y_train),
        "XGBoost": train_xgboost(X_train, y_train)
    }

    # Report each model's performance on the same held-out split.
    for name, model in models.items():
        print(f"\nEvaluating {name} model...")
        evaluate_model(model, X_test, y_test)


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

In [None]:
# Diagnostic cell: print the path of the Python interpreter running this
# kernel, to check which environment the notebook is executing in.
import sys
print(sys.executable)

