In [1]:
import pandas as pd
import numpy as np
import sklearn

In [None]:
# Load the car-sales dataset and display it.
data = pd.read_csv("data/car-sales-extended.csv")
data

# All features must be numeric before modelling, so check each column's dtype.
data.dtypes

# Missing rows must be removed or imputed: count the missing (NA) values
# in each column.
data.isna().sum()

#Notes
1. All data should be numerical
2. There should be no missing values
3. Apply the same transformations to the test set as to the training set
4. Never test on data you have trained on
5. Tune hyperparameters on the validation set OR perform cross-validation
6. The best score on a single evaluation metric does not necessarily mean the best model


#Build a pipeline in steps, e.g. fill the missing data, convert the data to numbers, etc.
Pipeline steps
1. Fill missing data
2. Convert data to numbers
3. Build a model on the data

#Pipeline imports

import pandas as pd
from sklearn.compose import ColumnTransformer #to transform to numeric columns
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer #to impute missing numbers
from sklearn.preprocessing import OneHotEncoder

#Modelling

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

#Set random seed to ensure the results are reproducible
import numpy as np
np.random.seed(42) 


# Import the data.
data = pd.read_csv("data/car-sales-extended.csv")

# Drop rows whose target (Price) is missing: a row without a label cannot be
# used for fitting or scoring. Reassigning instead of inplace=True keeps the
# result explicit.
data = data.dropna(subset=["Price"])

# --- Preprocessing: fill missing data and convert data to numbers ---

# Categorical columns: fill gaps with the label "missing", then one-hot encode.
# handle_unknown="ignore" makes categories unseen during fit encode as all
# zeros instead of raising at transform time.
# NOTE(review): "Doors" is listed here AND handled by Door_transformer below,
# so it reaches the model twice (one-hot encoded and as a raw number). Confirm
# whether another categorical column was intended here instead.
categorical_features = ["Doors", "Colour"]
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Door count: fill missing values with the constant 4.
Door_feature = ["Doors"]
Door_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="constant", fill_value=4))]
)

# Numeric columns: fill missing values with the column mean.
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="mean"))]
)

# Route each group of columns through its own transformer. Columns not listed
# in any group are dropped (ColumnTransformer's default remainder="drop").
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("Door", Door_transformer, Door_feature),
        ("num", numeric_transformer, numeric_features),
    ]
)

# Chain preprocessing and the estimator so both are fitted together.
model = Pipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("model", RandomForestRegressor()),
    ]
)

# Split the data into features/target, then into train/test sets.
x = data.drop("Price", axis=1)
y = data["Price"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Fit and score the baseline model on the held-out test set.
model.fit(x_train, y_train)
model.score(x_test, y_test)

# --- Hyperparameter tuning with the pipeline to improve the score ---

# Grid keys address nested parameters as <step>__<sub-step>__<param>, and the
# prefixes must match the step names used above ("preprocessing", "num",
# "imputer", "model").
pipe_grid = {
    "preprocessing__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    # max_features="auto" was removed from recent scikit-learn releases; for
    # regressors it meant "use all features", which 1.0 expresses on both old
    # and new versions.
    "model__max_features": [1.0],
    "model__min_samples_split": [2, 4],
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training set only.
gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)

gs_model.fit(x_train, y_train)

# Score the best estimator found on the held-out test set.
gs_model.score(x_test, y_test)


