In [2]:
from scipy.stats import uniform, norm
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

Este conjunto de datos
contiene información sobre varios atributos de automóviles, incluyendo Millas por galón de combustible (MPG) y Caballos de Fuerza (HP).


In [5]:
columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin', 'car_name']

df = pd.read_csv('data/auto-mpg.data', sep=r'\s+', names=columns)
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [7]:
(df == '?').sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

In [8]:
df.replace('?', pd.NA, inplace=True)

In [9]:
df = df.dropna()
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model_year      0
origin          0
car_name        0
dtype: int64

Se pide ajustar modelos lineales simples para relacionar
MPG con cada una de las demás variables de manera individual (o sea, un modelo simple por covariable)
Ordenar las covariables de acuerdo al 𝑅^2 obtenido de la más importante a la menos importante.

In [15]:
def r2_score(y_true, y_pred):
    y_mean = y_true.mean()
    TSS = np.sum((y_true - y_mean)**2)
    RSS = np.sum((y_true - y_pred)**2)
    return 1 - RSS/TSS

In [32]:
def linear_regression(x, y):
    #np.column_stack((np.ones(n), x))
    X = np.column_stack((np.ones(x.shape[0]), x))
    
    beta_hat = np.linalg.inv(X.T @ X) @ X.T @ y
    return X, beta_hat

In [12]:
df.dtypes

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight          float64
acceleration    float64
model_year        int64
origin            int64
car_name         object
dtype: object

In [28]:
df['mpg'].shape

(392,)

In [48]:
## Predecir MPG 
y = pd.to_numeric(df['mpg']).values
y = np.array(y)

resultados = []

In [49]:
# mpg vs horsepower
x = pd.to_numeric(df['horsepower'], downcast='float').values
x = np.array(x)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'horsepower',
    'r2': r2,
})
print('r2 (Horse Power)',r2)


r2 (Horse Power) 0.6059482578894348


In [40]:
# mpg vs weight
x = pd.to_numeric(df['weight']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'weight',
    'r2': r2,
})

print('r2 (Weight)',r2)

r2 (Weight) 0.6926304331206254


In [50]:
# mpg vs acceleration
x = pd.to_numeric(df['acceleration']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'acceleration',
    'r2': r2,
})

print('r2 (acceleration)',r2)

r2 (acceleration) 0.1792070501562546


In [42]:
# mpg vs cylinders
x = pd.to_numeric(df['cylinders']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'cylinders',
    'r2': r2,
})

print('r2 (cylinders)',r2)

r2 (cylinders) 0.6046889889441246


In [51]:
# mpg vs displacement
x = pd.to_numeric(df['displacement']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'displacement',
    'r2': r2,
})

print('r2 (displacement)',r2)

r2 (displacement) 0.6482294003193044


In [52]:
# mpg vs model_year
x = pd.to_numeric(df['model_year']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'model_year',
    'r2': r2,
})

print('r2 (model_year)',r2)

r2 (model_year) 0.33702781330962295


In [53]:
# mpg vs origin
x = pd.to_numeric(df['origin']).values
y = pd.to_numeric(df['mpg']).values

x = np.array(x)
y = np.array(y)

X, beta_hat = linear_regression(x, y)
#X.shape, y.shape, beta_hat.shape
y_pred = X @ beta_hat

r2 = r2_score(y, y_pred)
resultados.append({
    'feature': 'origin',
    'r2': r2,
})

print('r2 (origin)',r2)

r2 (origin) 0.3194609386689674


In [57]:
df_resultados = pd.DataFrame(resultados)
df_resultados = df_resultados.sort_values(by='r2', ascending=False)


df_resultados

Unnamed: 0,feature,r2
2,displacement,0.648229
0,horsepower,0.605948
3,model_year,0.337028
4,origin,0.319461
1,acceleration,0.179207
