In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# Define column names manually because the data file doesn't ship with a header
column_names = [
    "symboling",
    "normalized_losses",
    "make",
    "fuel_type",
    "aspiration",
    "num_of_doors",
    "body_style",
    "drive_wheels",
    "engine_location",
    "wheel_base",
    "length",
    "width",
    "height",
    "curb_weight",
    "engine_type",
    "num_of_cylinders",
    "engine_size",
    "fuel_system",
    "bore",
    "stroke",
    "compression_ratio",
    "horsepower",
    "peak_rpm",
    "city_mpg",
    "highway_mpg",
    "price",
]

df = pd.read_csv("imports-85.data", names=column_names)

In [3]:
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [7]:
# Select the columns we need
df = df[[
    "body_style",
    "drive_wheels",
    "length",
    "width",
    "height",
    "curb_weight",
    "fuel_system",
    "stroke",
    "horsepower",
    "peak_rpm",
    "price"
]]

# Replace missing values with the mean
df = df.replace("?", np.nan)
df = df.fillna(df.mean().to_dict())

# Encode categorical columns
df = df.apply(
    lambda x: pd.Series(LabelEncoder().fit_transform(x))
    if x.name in ["body_style", "drive_wheels", "fuel_system"]
    else x
)

  df = df.fillna(df.mean().to_dict())


In [14]:
# Drop all other NaN values
df = df.dropna()

In [15]:
df.head()

Unnamed: 0,body_style,drive_wheels,length,width,height,curb_weight,fuel_system,stroke,horsepower,peak_rpm,price
0,0,2,168.8,64.1,48.8,2548,5,2.68,111,5000,13495
1,0,2,168.8,64.1,48.8,2548,5,2.68,111,5000,16500
2,2,2,171.2,65.5,52.4,2823,5,3.47,154,5000,16500
3,3,1,176.6,66.2,54.3,2337,5,3.4,102,5500,13950
4,3,0,176.6,66.4,54.3,2824,5,3.4,115,5500,17450


In [16]:
X = df.drop("price", axis=1).values
y = df["price"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
print(X_train.shape, X_test.shape)

(156, 10) (39, 10)


In [23]:
print(y_train.shape, y_test.shape)

(156,) (39,)


In [17]:
model = LinearRegression()

model.fit(X_train, y_train)

In [25]:
y_pred = model.predict(X_test)

model.score(X_test, y_test)

0.7756224026416316