In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
print(pd.__version__)

2.2.2


In [13]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

--2025-09-19 08:03:05--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 874188 (854K) [text/plain]
Saving to: ‘car_fuel_efficiency.csv.2’


2025-09-19 08:03:06 (22.4 MB/s) - ‘car_fuel_efficiency.csv.2’ saved [874188/874188]



In [14]:
df = pd.read_csv('car_fuel_efficiency.csv')
display(df.head())

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [15]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [16]:
df.shape

(9704, 11)

In [17]:
df.columns

Index(['engine_displacement', 'num_cylinders', 'horsepower', 'vehicle_weight',
       'acceleration', 'model_year', 'origin', 'fuel_type', 'drivetrain',
       'num_doors', 'fuel_efficiency_mpg'],
      dtype='object')

In [18]:
df['fuel_type'].unique()

array(['Gasoline', 'Diesel'], dtype=object)

In [19]:
df.isnull().sum()

Unnamed: 0,0
engine_displacement,0
num_cylinders,482
horsepower,708
vehicle_weight,0
acceleration,930
model_year,0
origin,0
fuel_type,0
drivetrain,0
num_doors,502


In [20]:
print(df.isnull().sum())


engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [21]:
max_efficiency_asia = df[df['origin'] == 'Asia']['fuel_efficiency_mpg'].max()
print(f"Maximum fuel efficiency of cars from Asia: {max_efficiency_asia:.2f} MPG")

Maximum fuel efficiency of cars from Asia: 23.76 MPG


In [22]:
median_horsepower_before = df['horsepower'].median()
print(f"Median horsepower before filling missing values: {median_horsepower_before:.2f}")

Median horsepower before filling missing values: 149.00


In [23]:
mode_horsepower = df['horsepower'].mode()[0]
print(f"Most frequent horsepower value: {mode_horsepower:.2f}")

Most frequent horsepower value: 152.00


In [24]:
df['horsepower'].fillna(mode_horsepower, inplace=True)
print("Missing values in horsepower column have been filled with the mode.")

Missing values in horsepower column have been filled with the mode.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['horsepower'].fillna(mode_horsepower, inplace=True)


In [25]:
median_horsepower_after = df['horsepower'].median()
print(f"Median horsepower after filling missing values: {median_horsepower_after:.2f}")

if median_horsepower_before == median_horsepower_after:
    print("The median horsepower has not changed after filling missing values.")
else:
    print("The median horsepower has changed after filling missing values.")

Median horsepower after filling missing values: 152.00
The median horsepower has changed after filling missing values.


## Select cars from asia and relevant columns


In [26]:
asia_cars_subset = df[df['origin'] == 'Asia'][['vehicle_weight', 'model_year']]
display(asia_cars_subset.head())

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016



Select the first 7 rows of the `asia_cars_subset`


In [27]:
asia_cars_first_7 = asia_cars_subset.head(7)
display(asia_cars_first_7.head())

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016


## Get the underlying numpy array



Convert the `asia_cars_first_7` DataFrame to a NumPy array.



In [28]:
X = asia_cars_first_7.values
print(X)

[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]


## Compute matrix multiplication (xtx)




Calculate the product of the transpose of X and X.



In [29]:
XTX = X.T @ X
print(XTX)

[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]


## Invert xtx


In [30]:
XTX_inv = np.linalg.inv(XTX)
print(XTX_inv)

[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]


## Create array y


In [31]:
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(y)

[1100 1300  800  900 1000 1100 1200]


## Compute w



Calculate w using the formula w = np.linalg.inv(XTX) @ X.T @ y and print the result.



In [32]:
w = XTX_inv @ X.T @ y
print(w)

[0.01386421 0.5049067 ]



Calculate the sum of the elements in w and print the result.



In [33]:
sum_w = np.sum(w)
print(f"Sum of elements in w: {sum_w}")

Sum of elements in w: 0.5187709081074016



Calculate the sum of all elements in the array `w`.


In [34]:
sum_w = np.sum(w)
print(f"Sum of elements in w: {sum_w}")

Sum of elements in w: 0.5187709081074016


END