In [18]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
import streamlit as st
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [19]:
# Load dữ liệu
def load_data():
    url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
    df = pd.read_csv(url)
    return df


In [20]:

def preprocess_data(df):
    df['Age'].fillna(df['Age'].median(), inplace=True)
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
    df['Sex'] = df['Sex'].map({'male':0, 'female':1})
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)
    return df

In [21]:
# Chia dữ liệu
def split_data(df):
    X = df.drop(columns=['Survived'])
    y = df['Survived']
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
    X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
    return X_train, X_valid, X_test, y_train, y_valid, y_test


In [22]:
# def train_and_log_model(X_train, y_train, X_valid, y_valid):
#     with mlflow.start_run():
#         model = RandomForestClassifier(n_estimators=100, random_state=42)
#         model.fit(X_train, y_train)

#         # Cross Validation
#         cv_scores = cross_val_score(model, X_train, y_train, cv=5)
#         valid_acc = accuracy_score(y_valid, model.predict(X_valid))

#         # Ghi log tham số và metric vào MLFlow
#         mlflow.log_param("n_estimators", 100)
#         mlflow.log_metric("cv_mean_accuracy", np.mean(cv_scores))
#         mlflow.log_metric("validation_accuracy", valid_acc)

#         # Lưu mô hình vào MLFlow
#         mlflow.sklearn.log_model(model, "random_forest_model")

#         return model

In [23]:
def main():
    st.title("Titanic Survival Prediction")
    df = load_data()
    df = preprocess_data(df)
    X_train, X_valid, X_test, y_train, y_valid, y_test = split_data(df)
    
    if st.button("Train Model"):
        model = train_and_log_model(X_train, y_train, X_valid, y_valid)
        y_pred = model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred)
        
        st.write(f"Test Accuracy: {acc:.4f}")
        st.text("Classification Report:")
        st.text(report)
        
        st.success("Model training completed and logged with MLFlow!")


In [24]:
if __name__ == "__main__":
    main()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [None]:
!streamlit run test.py