In [None]:
import streamlit as st
import numpy as np
from sklearn.linear_model import LinearRegression
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
import joblib

def _to_numeric_column(column, *tokens):
    """Return *column* as numbers, deleting each literal token first.

    Args:
        column: A pandas Series read from the listings CSV.
        *tokens: Substrings (separators, unit suffixes) removed from every
            text value before conversion.

    Returns:
        A numeric Series; values that still cannot be parsed become NaN.
    """
    # A column pandas already parsed as numeric has no ``.str`` accessor, and
    # stripping "." from it would corrupt decimals, so only clean text columns.
    if not pd.api.types.is_numeric_dtype(column):
        for token in tokens:
            # regex=False: tokens such as "." must be removed literally. The
            # default of ``regex`` differs between pandas 1.x and 2.x, so do
            # not depend on it.
            column = column.str.replace(token, "", regex=False)
    return pd.to_numeric(column, errors="coerce")


def train_model(csv_file_path):
    """Fit a linear-regression price model on a listings CSV and print its fit.

    The CSV must contain the columns "Model", "Brand", "Kilometer Driven",
    "Year of Manufacture", "Price" and "Min_Price". Rows missing any of them
    (or holding an unparseable number) are dropped before fitting.

    Args:
        csv_file_path: Path (or anything ``pandas.read_csv`` accepts) of the
            listings file.

    Returns:
        The fitted scikit-learn ``Pipeline`` (preprocessing + regressor), so
        the caller can predict with it or persist it, e.g. with
        ``joblib.dump``.

    Raises:
        KeyError: If a required column is absent from the CSV.
        ValueError: If no usable row is left after cleaning.
    """
    data = pd.read_csv(csv_file_path)

    data["Kilometer Driven"] = _to_numeric_column(data["Kilometer Driven"])
    # Presumably prices look like "12.000.000 đ" ("." as thousands separator);
    # verify against the data export.
    data["Price"] = _to_numeric_column(data["Price"], ".", " đ")
    data["Year of Manufacture"] = _to_numeric_column(data["Year of Manufacture"])
    # Only the " triệu" suffix is stripped; the value is not rescaled, so
    # Min_Price stays in whatever unit the CSV uses.
    data["Min_Price"] = _to_numeric_column(data["Min_Price"], " triệu")

    # This column is not a model input; tolerate files that do not have it.
    data = data.drop(columns=["Thời gian đăng"], errors="ignore")

    categorical_features = ["Model", "Brand"]
    numeric_features = ["Year of Manufacture", "Min_Price", "Kilometer Driven"]
    features = ["Model", "Brand", "Kilometer Driven", "Year of Manufacture", "Min_Price"]
    target = "Price"

    data = data.dropna(subset=features + [target])
    if data.empty:
        raise ValueError(
            "No usable rows left after cleaning; check the CSV columns and number formats."
        )

    X = data[features]
    y = data[target]

    preprocessor = ColumnTransformer(
        transformers=[
            # handle_unknown='ignore' lets predict() accept categories that
            # were not present when the encoder was fitted.
            ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ("num", StandardScaler(), numeric_features),
        ]
    )

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])
    model.fit(X, y)

    # NOTE: these metrics are computed on the same rows the model was fitted
    # on, so they describe in-sample fit, not performance on unseen listings.
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    mse = mean_squared_error(y, y_pred)
    print("Model Linear Regression")
    print(f"R^2: {r2}")
    print(f"MSE: {mse}")

    return model
    
    

if __name__ == "__main__":
    # Script entry point: train on the CSV resolved relative to the current
    # working directory and print the fit metrics.
    train_model("CHOTOT_motorcycles.csv")





Model Linear Regression
R^2: 0.7299650007048908
MSE: 212628755083038.9
