## Importing the necessary packages

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the pre-cleaned dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [3]:
# Inspect the column names, dtypes and non-null counts of the loaded frame.
df.info()

## "gasoline" and "diesel" to numerical value

In [4]:
# Encode the fuel type as a binary numeric feature: gasoline -> 1, diesel -> 0.
fuel_type_codes = {'gasoline': 1, 'diesel': 0}

# `Series.map` turns every value that is not a key of the dict into NaN, so
# running this cell a second time (when the column already holds 0/1) would
# wipe the feature. Only map while the column is still non-numeric, which
# keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(df['fuel_type']):
    df['fuel_type'] = df['fuel_type'].map(fuel_type_codes)

In [5]:
# Re-inspect the frame to check the dtype of `fuel_type` after the mapping.
df.info()

## "object" type to "category" type

In [6]:
# Cast each text column to pandas' "category" dtype, one column at a time.
for column_name in ("district", "city", "color", "model"):
    df[column_name] = df[column_name].astype("category")

In [7]:
# Columns that are one-hot encoded by the preprocessing step.
categorical_features = [
    "city",
    "district",
    "color",
    "model",
]

# Columns that are standardised by the preprocessing step.
numerical_features = [
    "fuel_type",
    "year",
    "km",
]

In [8]:
# Preprocessing applied to the feature columns before the regressor:
#   - "num": standardise the numerical features with StandardScaler
#   - "cat": one-hot encode the categorical features with OneHotEncoder
#
# OneHotEncoder `handle_unknown` options:
#   "error" (default): raises an error when a categorical value that was not
#       seen during training appears in the test set or in new data.
#   "ignore": categorical values that were not seen during training are
#       ignored when they appear in the test set or in new data, and no error
#       is raised during encoding.
#
# These notes are written as comments (not a bare string literal) so the cell
# does not echo them back as its output.
column_transformer = ColumnTransformer([
    ("num", StandardScaler(), numerical_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

## Splitting X and y

In [9]:
# Target vector: the `price` column. Feature matrix: every other column.
y = df["price"]
X = df.drop(columns=["price"])

## train_test_split

In [10]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Creating a Pipeline

In [11]:
# Chain preprocessing and the regressor so both are fitted and applied together.
# NOTE: the step names are runtime keys (used by `named_steps` / `set_params`),
# so the existing spelling of the first step is kept unchanged.
pipeline_steps = [
    ("preperation", column_transformer),
    ("model", LinearRegression()),
]
model = Pipeline(steps=pipeline_steps)

In [None]:
'''
[EN]
A pipeline is a method used to organize data processing and 
model training steps one after another and to make them repeatable.

Especially in machine learning projects, pipelines make it possible to perform operations such as 
data cleaning, feature engineering, and model training in a sequential manner.

[TR]
Pipeline, veri işleme ve model eğitimi süreçlerini adim adim organize etmek ve 
tekrarlanabilir hale getirmek için kullanilan bir yöntemdir. 

Pipeline'lar özellikle makine öğrenimi projelerinde veri temizleme, öznitelik mühendisliği ve 
model eğitimi gibi işlemleri sirali bir şekilde gerçekleştirmeyi sağlar.
'''

## Fitting the model

In [13]:
# Fit the whole pipeline (preprocessing step + LinearRegression) on the training split.
model.fit(X_train, y_train)

## Prediction and error calculations

In [14]:
# Predict on the held-out split, then score the predictions.
y_pred = model.predict(X_test)

# Goodness of fit.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Squared error and its root (the root is in the same units as the target).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
root_mse = np.sqrt(mse)

In [15]:
# Report each evaluation metric on its own line, rounded to four decimals.
for metric_name, metric_value in (("MSE", mse), ("RMSE", root_mse), ("R^2", r2)):
    print(f"{metric_name}: {metric_value:.4f}")

## Price prediction for a made-up example listing

In [27]:
# A single hand-written sample row, expressed as one record.
# `fuel_type` uses the numeric encoding applied earlier in the notebook
# (1 = gasoline, 0 = diesel).
sample_record = {
    "district": "Buca",
    "city": "İzmir",
    "color": "Siyah",
    "km": 110000,
    "year": 2016,
    "model": "320i",
    "fuel_type": 0,
}
my_data = pd.DataFrame([sample_record])

In [28]:
# The pipeline returns one prediction per input row; take the only one and
# convert it to a plain Python float for formatting.
prediction = float(model.predict(my_data)[0])
print(f"Predicted price: {prediction:.3f} TL")

In [29]:
# Look up rows of the dataset from the same city and district as the sample,
# within a km and model-year window, to eyeball them against the prediction.
# NOTE(review): this filter keeps fuel_type == 1 (gasoline under the notebook's
# encoding) while `my_data` uses fuel_type 0 — confirm which one is intended.
comparable_listings = df[
    (df["city"] == "İzmir")
    & (df["district"] == "Buca")
    & (df["fuel_type"] == 1)
    & (df["km"] < 180000)
    & (df["year"] > 2015)
    & (df["year"] < 2018)
]

# A bare last expression lets Jupyter render the frame as a rich table instead
# of the plain-text dump produced by print().
comparable_listings