<a href="https://colab.research.google.com/github/headhuncho1234/HW/blob/main/Vehicle_Protect_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd # For data manipulation and analysis
import numpy as np # For numerical operations
import matplotlib.pyplot as plt # For creating visualizations
import seaborn as sns # For creating more aesthetically pleasing visualizations
from sklearn.model_selection import train_test_split # To split data into training and testing sets for machine learning
from sklearn.linear_model import LinearRegression # To implement a linear regression model
from sklearn.tree import DecisionTreeClassifier # To implement a decision tree classifier
from sklearn.ensemble import RandomForestClassifier # To implement a random forest classifier
import dmba
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder

ModuleNotFoundError: No module named 'dmba'

In [None]:
!pip install dmba

In [None]:
# Load the dataset from the specified CSV file into a pandas DataFrame
car_car_df = pd.read_csv("used_cars.csv")

In [None]:
# Display the first 5 rows of the DataFrame to get a quick overview of the data
car_car_df.head()

In [None]:
# Set the pandas option to display all columns when printing the DataFrame head/tail
pd.set_option('display.max_columns', None)

In [None]:
# Generate descriptive statistics of the numerical columns in the DataFrame
car_car_df.describe()

In [None]:
# Print information about the DataFrame, including the index dtype and column dtypes, non-null values and memory usage
car_car_df.info()

In [None]:
# Count the number of missing values in each column of the DataFrame
car_car_df.isnull().sum()

In [None]:
# Remove rows with any missing values and modify the DataFrame in place
car_car_df.dropna(inplace=True)

In [None]:
# Check for duplicate rows and remove them, modifying the DataFrame in place
car_car_df.duplicated().sum() # Count duplicate rows
car_car_df.drop_duplicates(inplace=True) # Drop duplicate rows

In [None]:
# --- Clean milage ---
# Convert 'milage' to string, remove commas and ' mi.', convert to numeric, fill missing with 0, convert to int
car_car_df['milage'] = (
    car_car_df['milage'].astype(str) # Convert to string first
                        .str.replace(',', '', regex=False)   # remove commas
                        .str.replace(' mi.', '', regex=False)  # remove ' mi.'
)
car_car_df['milage'] = pd.to_numeric(car_car_df['milage'], errors='coerce').fillna(0).astype(int)

# --- Clean price ---
# Convert 'price' to string, remove '$' and commas, convert to numeric, fill missing with 0, convert to int
car_car_df['price'] = (
    car_car_df['price'].astype(str) # Convert to string first
                       .str.replace(r'[\$,]', '', regex=True)  # remove $ and commas
)
car_car_df['price'] = pd.to_numeric(car_car_df['price'], errors='coerce').fillna(0).astype(int)


In [None]:
# Feature extraction --- Create hp column
car_car_df['hp'] = car_car_df['engine'].str.extract(r'(\d+\.\d+)HP').astype(float, errors='ignore')

# Create engine displacement
car_car_df['engine displacement'] = car_car_df['engine'].str.extract(r'(\d+\.\d+)\s*L')
car_car_df['engine displacement'] = car_car_df['engine displacement'].fillna(car_car_df['engine'].str.extract(r'(\d+\.\d+)\s*LITER')[0])
car_car_df['engine displacement'] = car_car_df['engine displacement'].astype(float, errors='ignore')

# Is it V type or not
car_car_df['is_v_engine'] = car_car_df['engine'].str.contains(r'V\d+', case=False, na=False)

In [None]:
# Display the data types of each column in the DataFrame
car_car_df.dtypes

In [None]:
#cleaning the fuel_type feature
car_car_df['fuel_type'] = car_car_df['fuel_type'].str.strip().str.upper().replace({'PLUG-IN HYBRID': 'HYBRID', 'NOT SUPPORTED':'OTHER', '–':'OTHER'})

car_car_df['fuel_type'].value_counts()

In [None]:
#cleaning the transmission feature

def classify_transmission(transmission):
    T = str(transmission).upper()

    if 'M/T' in T or 'MT' in T or 'MANUAL' in T:
        return 'M/T'
    elif 'A/T' in T or 'AT' in T or 'AUTOMATIC' in T:
        return 'A/T'
    elif 'CVT' in T or 'VARIABLE' in T or 'SINGLE-SPEED' in T:
        return 'CVT'
    else:
        return 'OTHER'

car_car_df['transmission'] = car_car_df['transmission'].apply(classify_transmission)

car_car_df['transmission'].unique()

In [None]:
#handling missing values

car_car_df['hp'] = car_car_df.groupby('brand')['hp'].transform(lambda x: x.fillna(x.mean()))

car_car_df.dropna(subset=['hp'], inplace=True)

most_common_fuel = car_car_df.groupby('brand')['fuel_type'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)

car_car_df['fuel_type'] = car_car_df.apply(
    lambda row: most_common_fuel[row['brand']] if pd.isna(row['fuel_type']) else row['fuel_type'],
    axis=1
)

car_car_df['fuel_type'] = car_car_df['fuel_type'].fillna('OTHER')

most_common_displacement = car_car_df.groupby('brand')['engine displacement'].agg(lambda x: x.mode()[0] if not x.mode().empty else None)

car_car_df['engine displacement'] = car_car_df.apply(
    lambda row: most_common_displacement[row['brand']] if pd.isna(row['engine displacement']) else row['engine displacement'],
    axis=1
)

car_car_df['engine displacement'] = car_car_df['engine displacement'].fillna(car_car_df['engine displacement'].median())

In [None]:
#encoding catergorical features
car_car_df['accident'] = car_car_df['accident'].apply(lambda x: 1 if x == 'At least 1 accident or damage reported' else 0)
car_car_df['clean_title'] = car_car_df['clean_title'].apply(lambda x: 1 if x == 'Yes' else 0)



In [None]:
#removing outliers

columns = ['hp', 'milage', 'engine displacement']

fig, axes = plt.subplots(1, 3, figsize=(20, 5))
axes = axes.flatten()

for i, col in enumerate(columns):
    sns.boxplot(x=car_car_df[col], ax=axes[i])
    axes[i].set_title(f'Boxplot of {col}')
    axes[i].set_xlabel(f'{col}')

plt.tight_layout()
plt.show()

In [None]:
columns = ['engine displacement', 'hp', 'price', 'milage']
for col in columns:
    Q1 = car_car_df[col].quantile(0.25)
    Q3 = car_car_df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    car_car_df = car_car_df[(car_car_df[col] >= lower_bound) & (car_car_df[col] <= upper_bound)]

In [None]:
car_car_df.shape

In [None]:
vis_df = car_car_df.copy()
vis_df.head()

In [None]:
categorical_columns = ['fuel_type', 'transmission', 'is_v_engine']

for cat_col in categorical_columns:
    encoder = LabelEncoder()
    car_car_df[cat_col] = encoder.fit_transform(car_car_df[cat_col])

In [None]:
# Select top 10 brands based on their frequency for plotting
top_brands = car_car_df['brand'].value_counts().nlargest(20).index
car_car_df_top_brands = car_car_df[car_car_df['brand'].isin(top_brands)]


# --- Scatter plot: Price vs Milage by Brand ---
# Create a scatter plot showing the relationship between mileage and price, colored by brand for the top 10 brands
plt.figure(figsize=(12, 8))
sns.scatterplot(data=car_car_df_top_brands, x='milage', y='price', hue='brand', alpha=0.6)
plt.title("Used Cars: Price vs. Milage by Top 20 Brands")
plt.xlabel("Milage (mi)")
plt.ylabel("Price ($)")
plt.grid(True, linestyle='--', alpha=0.6)
plt.show()

In [None]:
#feature engineering
car_car_df['Vehicle_Age'] = 2025 - car_car_df['model_year']

In [None]:
car_car_df['Milage_per_Year'] = car_car_df.apply(
    lambda row: row['milage'] / row['Vehicle_Age'] if row['Vehicle_Age'] > 0 else row['milage'],
    axis=1
)

In [None]:
car_car_df['Vehicle_Age_Bin'] = pd.qcut(car_car_df['Vehicle_Age'], q=4, labels=['New', 'Mid', 'Old', 'Very Old'])
car_car_df['Milage_Bin'] = pd.qcut(car_car_df['milage'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# One-hot encode
# car_car_df = pd.get_dummies(car_car_df, columns=['Vehicle_Age_Bin', 'Milage_Bin'], prefix=['Age', 'Milage'], drop_first=True, dtype=int)

In [None]:
car_car_df.isnull().sum()

In [None]:
car_car_df.head()

In [None]:
# --- Average price by brand ---
# Calculate the average price for each brand, sort in descending order, and select the top 15
brand_prices = (
    car_car_df.groupby('brand')['price']
              .mean()
              .sort_values(ascending=False)
              .head(20)   # top 20 brands
)

# --- Plot ---
# Create a bar plot to visualize the average price of the top 20 brands
plt.figure(figsize=(12,6))
brand_prices.plot(kind='bar', color='skyblue', edgecolor='black')
plt.title("Average Used Car Price by Brand (Top 20)")
plt.ylabel("Average Price (Thousands $)") # Update y-axis label
plt.xlabel("Brand")
plt.xticks(rotation=45, ha="right") # Rotate x-axis labels for better readability
plt.grid(axis='y', linestyle='--', alpha=0.6) # Add a horizontal grid

# Format y-axis labels to show values in thousands
plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(lambda x, _: f'{x/1000:.0f}'))

plt.tight_layout() # Adjust layout to prevent labels overlapping
plt.show() # Display the plot

In [None]:
# Data Visualization
plt.figure(figsize=(7,4))
sns.histplot(np.log1p(car_car_df['price']), bins=40, kde=True)
plt.title('Price Distribution')
plt.xlabel('Price (log scale)')
plt.ylabel('Count')
plt.show()

In [None]:
num_cols = ['hp','engine displacement','Vehicle_Age','Milage_per_Year']

fig, axes = plt.subplots(2, 2, figsize=(12, 10))
axes = axes.flatten()

for i, col in enumerate(num_cols):
    sns.scatterplot(x=car_car_df[col], y=car_car_df['price'], alpha=0.6, ax=axes[i])
    axes[i].set_title(f'{col} vs Price')

plt.tight_layout()
plt.show()

In [None]:
num_features = ['price','hp','engine displacement','Vehicle_Age','Milage_per_Year']
sns.heatmap(car_car_df[num_features].corr(), annot=True, cmap='coolwarm')

In [None]:
brand_avg_price = (
    vis_df.groupby('brand')['price']
      .mean()
      .sort_values(ascending=False)
)

plt.figure(figsize=(10,5))
sns.barplot(
    x=brand_avg_price.index,
    y=brand_avg_price.values,
    palette=sns.color_palette("blend:midnightblue,lightblue", n_colors=len(brand_avg_price)
))
plt.xticks(rotation=90)
plt.title("Average Used Car Price by Brand")
plt.xlabel("Brand")
plt.ylabel("Average Price")
plt.tight_layout()
plt.show()

In [None]:
order = (
    vis_df.groupby('fuel_type')['price']
          .mean()
          .sort_values(ascending=False)
          .index
)


mean = vis_df.groupby('fuel_type')['price'].mean().loc[order]

# Color Grediant
cmap = plt.colormaps.get_cmap('Blues')
colors = [cmap(x) for x in np.linspace(1, 0.3, len(mean))]

plt.figure(figsize=(8,5))
ax = sns.barplot(
    x='fuel_type',
    y='price',
    data=vis_df,
    order=order,
    estimator=np.mean,
    errorbar=None,
    palette=colors
)

plt.title("Mean Used Car Price by Fuel Type")
plt.xlabel("Fuel Type")
ax.set_ylabel("Mean Price")
ax.set_yticks([])


for p in ax.patches:
    ax.annotate(
        f'{p.get_height():.0f}',
        (p.get_x() + p.get_width()/2, p.get_height()),
        ha='center', va='bottom', fontsize=10, color='black'
    )

plt.show()

In [None]:
plt.figure(figsize=(7,5))

sns.barplot(
    x='accident',
    y='price',
    data=vis_df,
    estimator=np.mean,
    errorbar=None,
    palette= ['lightblue','steelblue'],
    hue='accident',  # Assign the x variable to hue
    legend=False     # Set legend to False
)

plt.title("Mean Price by Accident Status")
plt.xlabel("Accident (0 = No, 1 = Yes)")
plt.ylabel("Mean Price")
plt.show()

In [None]:
sns.lmplot(data=car_car_df, x='hp', y='price', line_kws={'color':'red'}, scatter_kws={'alpha':0.5})
plt.title('Price vs Horsepower with Trend Line')
plt.tight_layout()
plt.show()

In [None]:
top_brands = vis_df['brand'].value_counts().nlargest(10).index

brand_order = (
    vis_df[vis_df['brand'].isin(top_brands)]
    .groupby('brand')['price']
    .median()
    .sort_values(ascending=False)
    .index
)

plt.figure(figsize=(10,5))
ax = sns.barplot(
    data=vis_df[vis_df['brand'].isin(top_brands)],
    x='brand',
    y='price',
    hue='accident',
    estimator=np.median,
    errorbar=None,
    order=brand_order,
    palette=['lightblue','midnightblue']
)

plt.title('Median Price by Brand and Accident Status')
plt.xticks(rotation=45)

ax.legend(title="Accident", labels=["No", "Yes"])

plt.show()


In [None]:
car_car_df.replace([np.inf, -np.inf], np.nan, inplace=True)

n_rows = 3
numeric_column_name = car_car_df.select_dtypes(include='number').columns
n_cols = int(np.ceil(len(numeric_column_name)/n_rows))
colors = ['blue', 'red', 'green']

plt.figure(figsize=(4*n_cols, 4*n_rows))
for idx, column in enumerate(numeric_column_name, 1):
    plt.subplot(n_rows, n_cols, idx)
    sns.kdeplot(
        data=car_car_df,
        x=column,
        fill=True,
        color=colors[(idx-1) % len(colors)]
    )
    plt.title(column)
plt.tight_layout()
plt.show()

Target Encoding for Brand

**Linear Regression**

In [None]:
car_car_df.replace([np.inf, -np.inf], np.nan, inplace=True)


In [None]:

# Define target and predictor variables
target = 'price'
# Exclude 'clean_title' from the initial features list as we will handle it separately
features = [col for col in car_car_df.columns if col not in ['price', 'ext_col', 'int_col', 'model', 'engine', 'good_deal', 'hp', 'engine displacement', 'is_v_engine', 'transmission']]

X = car_car_df[features]
y = car_car_df[target]

# Handle other categorical features in X by creating dummy variables
X = pd.get_dummies(X, drop_first=True)
X = X.drop(columns=['fuel_type_not supported', 'fuel_type_–'], errors='ignore')

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

In [None]:
# Import necessary metrics for evaluating the model
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np # Import numpy for square root

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate and print the evaluation metrics: R², RMSE, and MAE
print("R²:", r2_score(y_test, y_pred)) # R-squared value
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))) # Root Mean Squared Error
print("MAE:", mean_absolute_error(y_test, y_pred)) # Mean Absolute Error

In [None]:
# Display the coefficients of the linear regression model
coefficients = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_
})
display(coefficients.sort_values(by='Coefficient', ascending=False))

In [None]:
# Display the number of observations (rows) in the DataFrame
print("Number of observations:", car_car_df.shape[0])

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Assuming y_test and y_pred are available from the previous Linear Regression model evaluation
# If you have run other models since, you might need to re-run the Linear Regression evaluation cell first (c06bbd5c)

plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.6)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', lw=2) # Add the y=x line
plt.xlabel("Actual Price")
plt.ylabel("Predicted Price")
plt.title("Actual vs. Predicted Prices with Line of Best Fit (Linear Regression)")
plt.grid(True, linestyle='--', alpha=0.6)

# Set limits on the axes to reduce the scale for better visualization
plt.xlim(0, 150000) # Set x-axis limit (adjust the value as needed)
plt.ylim(0, 150000) # Set y-axis limit (adjust the value as needed)

plt.show()

**Decision Tree**

In [None]:
# Re-load the dataset to ensure 'milage' is present
car_car_df = pd.read_csv("used_cars.csv")

# --- Clean mileage ---
# Convert 'milage' to string, remove commas and ' mi.', convert to numeric, fill missing with 0, convert to int
car_car_df['milage'] = (
    car_car_df['milage'].astype(str) # Convert to string first
                        .str.replace(',', '', regex=False)   # remove commas
                        .str.replace(' mi.', '', regex=False)  # remove ' mi.'
)
car_car_df['milage'] = pd.to_numeric(car_car_df['milage'], errors='coerce').fillna(0).astype(int)

# --- Clean price ---
# Convert 'price' to string, remove '$' and commas, convert to numeric, fill missing with 0, convert to int
car_car_df['price'] = (
    car_car_df['price'].astype(str) # Convert to string first
                       .str.replace(r'[\$,]', '', regex=True)  # remove $ and commas
)
car_car_df['price'] = pd.to_numeric(car_car_df['price'], errors='coerce').fillna(0).astype(int)

# --- Feature Extraction: Create hp, engine displacement, and is_v_engine columns ---
car_car_df['hp'] = pd.to_numeric(car_car_df['engine'].str.extract(r'(\d+\.?\d*)HP')[0], errors='coerce')

car_car_df['engine displacement'] = pd.to_numeric(car_car_df['engine'].str.extract(r'(\d+\.?\d+)\s*L')[0], errors='coerce')
car_car_df['engine displacement'] = car_car_df['engine displacement'].fillna(pd.to_numeric(car_car_df['engine'].str.extract(r'(\d+\.?\d+)\s*LITER')[0], errors='coerce'))

car_car_df['is_v_engine'] = car_car_df['engine'].str.contains(r'V\d+', case=False, na=False)


# Create the target variable 'good_deal' based on the price and other criteria
# A 'good_deal' is defined as a car priced below the average price, with a clean title, milage under 100,000, and a model year newer than 2015.
average_price = car_car_df['price'].mean()
car_car_df['good_deal'] = (
    (car_car_df['price'] < average_price) &
    (car_car_df['clean_title'] == 'Yes') &
    (car_car_df['milage'] < 100000) &
    (car_car_df['model_year'] > 2015)
).astype(int)


# Select features and target variable
features = ['model_year', 'milage', 'clean_title', 'accident', 'price']
target = 'good_deal'

# Convert categorical features to dummy variables
X = car_car_df[features]
X = pd.get_dummies(X, columns=['clean_title', 'accident'], drop_first=True)
y = car_car_df[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree Classifier
decision_tree_model = DecisionTreeClassifier(random_state=42) # Renamed model to decision_tree_model
decision_tree_model.fit(X_train, y_train)

# Evaluate the model (optional, but good practice)
from sklearn.metrics import accuracy_score, classification_report

y_pred = decision_tree_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
from dmba import plotDecisionTree

# Access the X_train used for training the model in cell QMneUs-R34ux
X_for_plotting = X_train

plotDecisionTree(model, feature_names=X_for_plotting.columns, class_names=['Not Good Deal', 'Good Deal'])

### **Logisitic Regression**

> Add blockquote



In [None]:
!pip install dmba
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from dmba import classificationSummary # Explicitly import classificationSummary
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
car_df = pd.read_csv('used_cars.csv') #read data
car_df.head()

In [None]:
car_df.dtypes

In [None]:
# Encode categorical variables before splitting
car_df['model_year'] = car_df['model_year'].astype(pd.CategoricalDtype())
car_df['milage'] = car_df['milage'].astype(pd.CategoricalDtype())
car_df['price'] = car_df['price'].astype(pd.CategoricalDtype())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Ensure columns are of 'category' dtype before using .cat accessor
car_df['model_year'] = car_df['model_year'].astype('category')
car_df['milage'] = car_df['milage'].astype('category')
car_df['price'] = car_df['price'].astype('category')

car_df['model_year'] = car_df['model_year'].cat.codes
car_df['milage'] = car_df['milage'].cat.codes
car_df['price'] = car_df['price'].cat.codes

# Redefine X and y
y = car_df['brand']
X = car_df.drop(columns=['price', 'brand'])

# Apply one-hot encoding to categorical columns in X
X = pd.get_dummies(X, drop_first=True)

# Split the data into training and testing sets
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=1)

# Initialize and fit the scaler on the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_X)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=train_X.columns) # Keep column names

# Transform the test data using the fitted scaler
X_test_scaled = scaler.transform(test_X)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=test_X.columns) # Keep column names

In [None]:
scaler = StandardScaler()

In [None]:
# Removed redundant scaling code

In [None]:
# Removed redundant scaling code

In [None]:
logit_reg = LogisticRegression()
logit_reg.fit(X_train_scaled, train_y)

In [None]:
# Evaluate the model on the test set
y_pred = logit_reg.predict(test_X)
classificationSummary(test_y, y_pred)

In [None]:
classificationSummary(train_y, logit_reg.predict(X_train_scaled))

In [None]:
# Evaluate the model on the test set
y_pred = logit_reg.predict(X_test_scaled)
classificationSummary(test_y, y_pred)

**KNN & Random Forest**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)
y_pred_knn = knn_model.predict(X_test_scaled)

print("=== KNN RESULTS ===")
print("KNN R²:", r2_score(y_test, y_pred_knn))
print("KNN RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_knn)))
print("KNN MAE:", mean_absolute_error(y_test, y_pred_knn))

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100, random_state=1, max_depth=10)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("=== RANDOM FOREST RESULTS ===")
print("Random Forest R²:", r2_score(y_test, y_pred_rf))
print("Random Forest RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_rf)))
print("Random Forest MAE:", mean_absolute_error(y_test, y_pred_rf))
