In [10]:
import pandas as pd
import numpy as np

## Q1. Pandas Version

In [11]:
# Q1: report the version string of the pandas build imported above.
installed_pandas_version = pd.__version__
installed_pandas_version

'2.3.2'

## Getting the data
```wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv```

In [12]:
# Loading the dataset
# Prefer the local copy (e.g. one fetched with wget). If it has not been
# downloaded yet, read the same CSV directly from URL so that a fresh
# "Restart Kernel -> Run All" does not fail with FileNotFoundError.
URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
FILE = "./car_fuel_efficiency.csv"
try:
    df = pd.read_csv(FILE)
except FileNotFoundError:
    # read_csv accepts an http(s) URL in place of a filesystem path
    df = pd.read_csv(URL)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


## Records Count

In [13]:
# Number of records: df.shape is (rows, columns); the first entry is the record count.
n_records, n_columns = df.shape
(n_records, n_columns)

(9704, 11)

## Fuel Types

In [14]:
# How many fuel types are presented in the dataset?
# nunique() counts distinct non-missing labels, so a NaN in the column is not
# mistaken for an additional fuel type (unique() would include it in the count).
df['fuel_type'].nunique()

2

## Missing values

In [15]:
# How many columns in the dataset have missing values?
# Count the missing cells per column, keep only the columns with at least one,
# and report how many such columns there are.
missing_summary = df.isnull().sum()
columns_with_missing = missing_summary[missing_summary > 0]
len(columns_with_missing)

4

## Max fuel efficiency

In [16]:
# What's the maximum fuel efficiency of cars from Asia?
df_asia = df[df['origin'] == 'Asia']
# Series.max() skips missing values; the builtin max() compares element by
# element and gives an order-dependent answer if a NaN is present.
df_asia['fuel_efficiency_mpg'].max()


23.759122836520497

## Median value of horsepower

In [17]:
# Median of the horsepower column as currently stored in df.
median_hp_raw = df['horsepower'].median()
median_hp_raw

np.float64(149.0)

In [None]:
# Most frequent horsepower value.
# mode() returns a Series (several values can tie); the first entry is used.
horsepower_modes = df['horsepower'].mode()
most_frequent_hp = horsepower_modes.iloc[0]
most_frequent_hp

np.float64(152.0)

In [19]:
# Replace every missing horsepower entry with the most frequent value
# computed in the previous step (same effect as fillna on that column).
hp_is_missing = df['horsepower'].isna()
df.loc[hp_is_missing, 'horsepower'] = most_frequent_hp

In [20]:
# Median of horsepower again, now that the missing entries have been filled.
median_hp_filled = df['horsepower'].median()
median_hp_filled

np.float64(152.0)

## Sum of weights

In [78]:
# Keep only the rows whose origin is 'Asia' and preview the first few.
is_asian_car = df['origin'] == 'Asia'
df_asia = df.loc[is_asian_car]
df_asia.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
8,250,1.0,174.0,2714.21931,10.3,2016,Asia,Diesel,Front-wheel drive,-1.0,16.823554
12,320,5.0,145.0,2783.868974,15.1,2010,Asia,Diesel,All-wheel drive,1.0,16.17582
14,200,6.0,160.0,3582.687368,14.9,2007,Asia,Diesel,All-wheel drive,0.0,11.871091
20,150,3.0,197.0,2231.808142,18.7,2011,Asia,Gasoline,Front-wheel drive,1.0,18.889083
21,160,4.0,133.0,2659.431451,,2016,Asia,Gasoline,Front-wheel drive,-1.0,16.07773


In [79]:
# Narrow the Asian cars down to the two columns used for the matrix below.
selected_columns = ['vehicle_weight', 'model_year']
df_asia_subset = df_asia.loc[:, selected_columns]
df_asia_subset.head()

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016


In [80]:
# Take the first seven rows (positional slice, same rows as head(7)).
df_asia_top_seven = df_asia_subset.iloc[:7]
df_asia_top_seven

Unnamed: 0,vehicle_weight,model_year
8,2714.21931,2016
12,2783.868974,2010
14,3582.687368,2007
20,2231.808142,2011
21,2659.431451,2016
34,2844.227534,2014
38,3761.994038,2019


In [81]:
# Convert the seven-row frame into a plain NumPy array named X
# (one row per car, one column per selected feature).
X = df_asia_top_seven.to_numpy()
X

array([[2714.21930965, 2016.        ],
       [2783.86897424, 2010.        ],
       [3582.68736772, 2007.        ],
       [2231.8081416 , 2011.        ],
       [2659.43145076, 2016.        ],
       [2844.22753389, 2014.        ],
       [3761.99403819, 2019.        ]])

In [82]:
# Gram matrix: the transpose of X multiplied by X.
X_transposed = X.T
XTX = np.matmul(X_transposed, X)
XTX

array([[62248334.33150762, 41431216.50732678],
       [41431216.50732678, 28373339.        ]])

In [83]:
# invert XTX
# np.linalg.pinv computes the Moore-Penrose pseudo-inverse, which coincides
# with the ordinary inverse when XTX is non-singular.
# NOTE(review): pinv will not raise on a singular XTX, whereas np.linalg.inv
# would — confirm the pseudo-inverse is what is intended here.
inverted_XTX = np.linalg.pinv(XTX)
inverted_XTX


array([[ 5.71497081e-07, -8.34509443e-07],
       [-8.34509443e-07,  1.25380877e-06]])

In [84]:
# Target vector y: seven fixed values, one per row of X.
target_values = [1100, 1300, 800, 900, 1000, 1100, 1200]
y = np.array(target_values)
y

array([1100, 1300,  800,  900, 1000, 1100, 1200])

In [85]:
# Product of the inverted XTX with the transpose of X.
z = np.matmul(inverted_XTX, X.T)
z

array([[-1.31202622e-04, -8.63909858e-05,  3.72634923e-04,
        -4.02726650e-04, -1.62513724e-04, -5.52342829e-05,
         4.65094049e-04],
       [ 2.62636846e-04,  1.96990690e-04, -4.73392228e-04,
         6.58944477e-04,  3.08357831e-04,  1.51636137e-04,
        -6.07979633e-04]])

In [86]:
# Apply the matrix from the previous step to y to obtain w.
w = np.matmul(z, y)
w

array([0.01386421, 0.5049067 ])

In [87]:
# Final answer: add up every element of w.
w.sum()

np.float64(0.5187709081074007)