# Titanic

## https://www.kaggle.com/competitions/titanic

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
import joblib

In [2]:
from sklearn.pipeline import Pipeline
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html

from sklearn.compose import ColumnTransformer
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html

from sklearn.model_selection import GridSearchCV
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

# Split Data

In [3]:
# Read the Kaggle training set from the working directory.
df = pd.read_csv('data_train.csv')

# Column groups consumed by the preprocessing ColumnTransformer below.
numeric_features = ['Age', 'SibSp', 'Parch', 'Fare']
categorical_features = ['Pclass', 'Sex', 'Embarked']

# Feature matrix (categorical columns first, then numeric) and the target.
X = df[categorical_features + numeric_features]
y = df['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create Pipeline, Run Grid Search CV, and Train the Model

In [4]:
# Numeric branch: fill missing values with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [5]:
# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# an all-zero row instead of raising, so cross-validation folds and later
# predictions on new data do not crash on an unexpected value.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [6]:
# Route each column group to its own sub-pipeline; columns are selected by
# name from the input DataFrame.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
# Full model: preprocessing followed by a logistic-regression classifier.
# The step name 'classifier' is what the `classifier__C` grid key refers to.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

In [8]:
# Hyperparameter grid: candidate values of C for the pipeline step named
# 'classifier' (the `<step>__<param>` naming addresses a nested parameter).
param_grid = dict(classifier__C=[0.1, 1, 10, 100])

In [9]:
# 5-fold cross-validated search over param_grid, fitted on the training split
# only so the held-out test split stays unseen.
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [10]:
# Report the winning hyperparameters and their mean cross-validation score.
# NOTE(review): GridSearchCV was created without a `scoring` argument, so this
# score comes from the estimator's default scorer (presumably accuracy for this
# classifier - confirm against the scikit-learn docs); it is not directly
# comparable with the F1 score reported later in the notebook.
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

Best parameters found:  {'classifier__C': 0.1}
Best cross-validation score: 0.80


In [11]:
# Keep the best pipeline found by the search and persist it to disk so it can
# be reloaded for inference without retraining.
best_model = grid_search.best_estimator_
joblib.dump(value=best_model, filename='best_model.joblib')
print('done')

done


# Simple Test

In [12]:
# A single hand-written passenger record used as a smoke test for the saved
# model. It carries the raw CSV-style fields, including ones the model does
# not use (Name, Ticket, Cabin), which are dropped before prediction.
data_dict = dict(
    Pclass=3,
    Name='Braund, Mr. Owen Harris',
    Sex='male',
    Age=22,
    SibSp=1,
    Parch=0,
    Ticket='A/5 21171',
    Fare=7.25,
    Cabin=None,
    Embarked='S',
)

In [13]:
# Build a one-row frame from the sample record and remove the fields that are
# not model inputs.
new_data = pd.DataFrame([data_dict]).drop(columns=['Name', 'Cabin', 'Ticket'])

In [14]:
# Reload the persisted pipeline from disk to check that the saved artifact
# works end to end (joblib is already imported in the first cell, so it is not
# re-imported here).
# NOTE: joblib.load unpickles arbitrary objects - only load model files that
# were produced by this notebook or come from a trusted source.
loaded_model = joblib.load('best_model.joblib')
predictions = loaded_model.predict(new_data)

print(predictions)

[0]


# X_test and y_test

In [15]:
# X_test.head()

In [16]:
# Predict on the held-out split created by train_test_split, using the model
# reloaded from disk rather than the in-memory best_model.
y_pred = loaded_model.predict(X_test)
# y_pred

In [17]:
# F1 score of the held-out predictions against the true labels.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.74


# Submission

In [18]:
# Load the competition test set. df_sub_asli keeps the untouched frame
# ("asli" - presumably Indonesian for "original") so PassengerId is still
# available for the submission file; df_sub is an independent working copy.
df_sub_asli = pd.read_csv('data_test.csv')
df_sub = df_sub_asli.copy()

In [19]:
# Remove identifier / free-text columns that are not model inputs.
# Rebinding with errors='ignore' (instead of an in-place drop) keeps this cell
# idempotent: re-running it after the columns are already gone is a no-op
# rather than a KeyError.
df_sub = df_sub.drop(columns=['Name', 'Cabin', 'Ticket', 'PassengerId'], errors='ignore')

In [20]:
# Predict the target for every row of the competition test set with the
# reloaded pipeline.
predictions_sub = loaded_model.predict(df_sub)

In [21]:
# Attach the predictions to the original test frame as a 'Survived' column.
# assign() overwrites an existing 'Survived' column, so re-running this cell
# does not append a duplicate column the way repeated pd.concat would; it also
# raises if the number of predictions does not match the number of rows
# instead of silently padding with NaN.
df_sub_asli = df_sub_asli.reset_index(drop=True).assign(Survived=predictions_sub)
# df_sub_asli.head()

In [22]:
# Keep only the two columns written to the submission file.
df_sub_asli = df_sub_asli.loc[:, ['PassengerId', 'Survived']]
# df_sub_asli.head()

In [23]:
# Write the submission file; index=False leaves the DataFrame index out of the
# CSV so only PassengerId and Survived are written.
df_sub_asli.to_csv('hasil_prediksi_2.csv', index=False)