# Set up the environment

In [1]:
import sys
print(sys.version)

3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]


In [3]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Q1. Pandas version

What version of Pandas did you install?

In [4]:
PV = pd.__version__

In [6]:
print(f"Pandas Version installed: {PV}")

Pandas Version installed: 2.3.1


# Getting the data

In [5]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [7]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


# Q2. Records count

How many records are in the dataset?

In [8]:
records = len(df)

In [9]:
print(f"the dataset has: {records} records")

the dataset has: 9704 records


# Q3. Fuel Types

How many fuel types are presented in the dataset?

In [10]:
fuelTypes = df.fuel_type.nunique()

In [11]:
print(f"The dataset has {fuelTypes} fuel Types")

The dataset has 2 fuel Types


# Q4. Missing values

How many columns in the dataset have missing values?

In [12]:
ncWMV = df.isnull().any().sum()

In [13]:
print(f"The dataset has {ncWMV} columns with missing values")

The dataset has 4 columns with missing values


# Q5. Max fuel efficiency

What's the maximum fuel efficiency of cars from Asia?

In [14]:
df.origin.unique()

array(['Europe', 'USA', 'Asia'], dtype=object)

In [15]:
df.groupby('origin').fuel_efficiency_mpg.max()

origin
Asia      23.759123
Europe    25.967222
USA       24.971452
Name: fuel_efficiency_mpg, dtype: float64

In [16]:
mask = df['origin'] == "Asia"
vMax = df.loc[mask, 'fuel_efficiency_mpg'].max()
print(f"The maximum fuel efficiency of cars from Asia is {vMax}")

The maximum fuel efficiency of cars from Asia is 23.759122836520497


# Q6. Median value of horsepower

Find the median value of the horsepower column in the dataset

In [17]:
median_value_nulls = df.horsepower.mean()

In [18]:
print(f"The median value of the horsepower with nulls is {median_value_nulls}")

The median value of the horsepower with nulls is 149.65729212983547


Next, calculate the most frequent value of the same horsepower column.

In [19]:
most_freq_hp = df.horsepower.mode()[0]

Use the fillna method to fill the missing values in the horsepower column with the most frequent value from the previous step.

In [20]:
df.horsepower.fillna(most_freq_hp, inplace=True )

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.horsepower.fillna(most_freq_hp, inplace=True )


Now, calculate the median value of horsepower once again.

In [21]:
median_value_without_nulls = df.horsepower.mean()

In [22]:
print(f"The median value of the horsepower without nulls is {median_value_without_nulls}")

The median value of the horsepower without nulls is 149.82821516900248


R/ The median value of the horsepower increased

# Q7. Sum of weights

Select all the cars from Asia

In [23]:
carsAsia = df[df['origin'] == 'Asia']

Select only columns vehicle_weight and model_year

In [24]:
carsAsia_filt = carsAsia[['vehicle_weight','model_year']]

In [25]:
carsAsia_filt

Unnamed: 0,vehicle_weight,model_year
8,2714.219310,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
...,...,...
9688,3948.404625,2018
9692,3680.341381,2016
9693,2545.070139,2012
9698,3107.427820,2005


Select the first 7 values

In [26]:
carsAsia_filt7 = carsAsia_filt.iloc[:7]

In [27]:
carsAsia_filt7

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


Get the underlying NumPy array. Let's call it X.

In [28]:
X = carsAsia_filt7.to_numpy()

In [29]:
X

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.

In [30]:
XTX = X.T @ X

In [31]:
XTX

array([[62248334.33150762, 41431216.5073268 ],
       [41431216.5073268 , 28373339.        ]])

Invert XTX.

In [32]:
I_XTX = np.linalg.inv(XTX)

In [33]:
I_XTX

array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].

In [34]:
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

In [35]:
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.

In [37]:
w = I_XTX @ X.T @ y

In [38]:
w

array([0.01386421, 0.5049067 ])

What's the sum of all the elements of the result?

In [39]:
w_sum = w.sum()

In [40]:
print(f"The sum of all the elements of the result is {w_sum}")

The sum of all the elements of the result is 0.5187709081074016
