In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
df=pd.read_csv("car-price-data.csv")

### Features

For the rest of the homework, you'll need to use only these columns:

* `Make`,
* `Model`,
* `Year`,
* `Engine HP`,
* `Engine Cylinders`,
* `Transmission Type`,
* `Vehicle Style`,
* `highway MPG`,
* `city mpg`

### Data preparation

* Select only the features from above and transform their names using next line:
  ```
  data.columns = data.columns.str.replace(' ', '_').str.lower()
  ```
* Fill in the missing values of the selected features with 0.
* Rename `MSRP` variable to `price`.


In [3]:
list(df.columns.values)

['Make',
 'Model',
 'Year',
 'Engine Fuel Type',
 'Engine HP',
 'Engine Cylinders',
 'Transmission Type',
 'Driven_Wheels',
 'Number of Doors',
 'Market Category',
 'Vehicle Size',
 'Vehicle Style',
 'highway MPG',
 'city mpg',
 'Popularity',
 'MSRP']

In [4]:
df.columns=df.columns.str.lower().str.replace(" ","_")

In [5]:
list(df.columns.values)

['make',
 'model',
 'year',
 'engine_fuel_type',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'driven_wheels',
 'number_of_doors',
 'market_category',
 'vehicle_size',
 'vehicle_style',
 'highway_mpg',
 'city_mpg',
 'popularity',
 'msrp']

In [6]:
base=['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [7]:
df[base]=df[base].fillna(0)

In [8]:
df[base].isnull().sum()

make                 0
model                0
year                 0
engine_hp            0
engine_cylinders     0
transmission_type    0
vehicle_style        0
highway_mpg          0
city_mpg             0
dtype: int64

In [9]:
df.rename(columns={'msrp':'price'},inplace=True)

In [10]:
df.columns

Index(['make', 'model', 'year', 'engine_fuel_type', 'engine_hp',
       'engine_cylinders', 'transmission_type', 'driven_wheels',
       'number_of_doors', 'market_category', 'vehicle_size', 'vehicle_style',
       'highway_mpg', 'city_mpg', 'popularity', 'price'],
      dtype='object')

### Question 1

What is the most frequent observation (mode) for the column `transmission_type`?

- `AUTOMATIC`
- `MANUAL`
- `AUTOMATED_MANUAL`
- `DIRECT_DRIVE`

In [11]:
df.transmission_type.value_counts()

AUTOMATIC           8266
MANUAL              2935
AUTOMATED_MANUAL     626
DIRECT_DRIVE          68
UNKNOWN               19
Name: transmission_type, dtype: int64

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

What are the two features that have the biggest correlation in this dataset?

- `engine_hp` and `year`
- `engine_hp` and `engine_cylinders`
- `highway_mpg` and `engine_cylinders`
- `highway_mpg` and `city_mpg`

In [12]:
numerical_features = df.select_dtypes(include=['number'])

In [13]:
# Calculate the correlation matrix
correlation_matrix = numerical_features.corr()

In [14]:
correlation_matrix

Unnamed: 0,year,engine_hp,engine_cylinders,number_of_doors,highway_mpg,city_mpg,popularity,price
year,1.0,0.338714,-0.040708,0.263787,0.25824,0.198171,0.073049,0.22759
engine_hp,0.338714,1.0,0.774851,-0.10764,-0.415707,-0.424918,0.031409,0.650095
engine_cylinders,-0.040708,0.774851,1.0,-0.141498,-0.614541,-0.587306,0.045226,0.526274
number_of_doors,0.263787,-0.10764,-0.141498,1.0,0.11857,0.120881,-0.048272,-0.126635
highway_mpg,0.25824,-0.415707,-0.614541,0.11857,1.0,0.886829,-0.020991,-0.160043
city_mpg,0.198171,-0.424918,-0.587306,0.120881,0.886829,1.0,-0.003217,-0.157676
popularity,0.073049,0.031409,0.045226,-0.048272,-0.020991,-0.003217,1.0,-0.048476
price,0.22759,0.650095,0.526274,-0.126635,-0.160043,-0.157676,-0.048476,1.0


In [15]:
largest_correlations = correlation_matrix.abs().unstack().sort_values(ascending=False)

In [16]:
largest_correlations=largest_correlations[largest_correlations<1]

In [17]:
# The first two elements of 'largest_correlations' will be the two features with the biggest correlation
feature1, feature2 = largest_correlations.index[0]

In [18]:
feature1, feature2

('city_mpg', 'highway_mpg')

### Make `price` binary

* Now we need to turn the `price` variable from numeric into a binary format.
* Let's create a variable `above_average` which is `1` if the `price` is above its mean value and `0` otherwise.


In [19]:
df.price.mean()

40594.737032063116

In [20]:
df["above_average"]=(df.price>df.price.mean()).astype("int")

In [21]:
df.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,price,above_average
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135,1
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650,1
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350,0
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450,0
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500,0


### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value (`price`) is not in your dataframe.

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=42)

In [24]:
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=42)

In [25]:
len(df_full_train),len(df_train),len(df_test),len(df_val)

(9531, 7148, 2383, 2383)

In [26]:
df_train=df_train.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_full_train=df_full_train.reset_index(drop=True)

In [27]:
y_train=df_train.above_average.values
y_val=df_val.above_average.values
y_test=df_test.above_average.values
y_full_train=df_full_train.above_average.values


In [28]:
df_full_train

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,price,above_average
0,Cadillac,CT6,2016,premium unleaded (recommended),265.0,4.0,AUTOMATIC,rear wheel drive,4.0,Luxury,Large,Sedan,31,22,1624,53495,1
1,Mercedes-Benz,GLS-Class,2017,premium unleaded (required),449.0,8.0,AUTOMATIC,all wheel drive,4.0,"Crossover,Luxury,Performance",Large,4dr SUV,18,14,617,93850,1
2,Kia,Forte,2016,regular unleaded,173.0,4.0,AUTOMATIC,front wheel drive,2.0,,Compact,Coupe,34,25,1720,19890,0
3,Dodge,RAM 250,1993,regular unleaded,180.0,6.0,MANUAL,rear wheel drive,2.0,,Large,Regular Cab Pickup,16,11,1851,2000,0
4,Hyundai,Tiburon,2008,regular unleaded,172.0,6.0,AUTOMATIC,front wheel drive,2.0,Hatchback,Compact,2dr Hatchback,24,17,1439,21270,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9526,Toyota,Venza,2014,regular unleaded,181.0,4.0,AUTOMATIC,front wheel drive,4.0,Crossover,Midsize,Wagon,26,20,2031,27950,0
9527,Pontiac,G6,2009,flex-fuel (unleaded/E85),219.0,6.0,AUTOMATIC,front wheel drive,4.0,Flex Fuel,Midsize,Sedan,26,17,210,24710,0
9528,Volkswagen,Golf GTI,2016,premium unleaded (recommended),220.0,4.0,AUTOMATED_MANUAL,front wheel drive,2.0,"Hatchback,Performance",Compact,2dr Hatchback,33,25,873,27590,0
9529,Saab,9-5,2009,premium unleaded (recommended),260.0,4.0,AUTOMATIC,front wheel drive,4.0,"Luxury,Performance",Midsize,Wagon,27,17,376,43270,1


### Question 3

* Calculate the mutual information score between `above_average` and other categorical variables in our dataset. 
  Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the lowest mutual information score?
  
- `make`
- `model`
- `transmission_type`
- `vehicle_style`

In [29]:
df_train.head()

Unnamed: 0,make,model,year,engine_fuel_type,engine_hp,engine_cylinders,transmission_type,driven_wheels,number_of_doors,market_category,vehicle_size,vehicle_style,highway_mpg,city_mpg,popularity,price,above_average
0,Mitsubishi,Endeavor,2011,premium unleaded (recommended),225.0,6.0,AUTOMATIC,all wheel drive,4.0,Crossover,Midsize,4dr SUV,19,15,436,33599,0
1,Kia,Borrego,2009,regular unleaded,276.0,6.0,AUTOMATIC,rear wheel drive,4.0,,Midsize,4dr SUV,21,17,1720,26245,0
2,Lamborghini,Gallardo,2012,premium unleaded (required),570.0,10.0,MANUAL,all wheel drive,2.0,"Exotic,Factory Tuner,High-Performance",Compact,Convertible,20,12,1158,248000,1
3,Chevrolet,Colorado,2016,regular unleaded,200.0,4.0,AUTOMATIC,rear wheel drive,4.0,Diesel,Compact,Crew Cab Pickup,27,20,1385,24990,0
4,Pontiac,Vibe,2009,regular unleaded,158.0,4.0,AUTOMATIC,all wheel drive,4.0,Hatchback,Compact,4dr Hatchback,26,20,210,20475,0


In [30]:
from sklearn.metrics import mutual_info_score

In [31]:
def mutual_info_above_average_score(series):
    return mutual_info_score(df_full_train.above_average,series)

In [63]:
mi=df_full_train[base].apply(mutual_info_above_average_score)
round(mi.sort_values(ascending=True),2)

transmission_type    0.02
highway_mpg          0.04
city_mpg             0.06
year                 0.07
vehicle_style        0.08
engine_cylinders     0.12
make                 0.24
engine_hp            0.36
model                0.46
dtype: float64

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.60
- 0.72
- 0.84
- 0.95

In [64]:
df_train.dtypes

make                  object
model                 object
year                   int64
engine_fuel_type      object
engine_hp            float64
engine_cylinders     float64
transmission_type     object
driven_wheels         object
number_of_doors      float64
market_category       object
vehicle_size          object
vehicle_style         object
highway_mpg            int64
city_mpg               int64
popularity             int64
price                  int64
above_average          int64
dtype: object

In [65]:
df_train.nunique()

make                   48
model                 869
year                   28
engine_fuel_type       10
engine_hp             339
engine_cylinders        9
transmission_type       5
driven_wheels           4
number_of_doors         3
market_category        66
vehicle_size            3
vehicle_style          16
highway_mpg            55
city_mpg               62
popularity             48
price                4349
above_average           2
dtype: int64

In [66]:
base=['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [67]:
train_dict=df_train[base].to_dict(orient="records")

In [68]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_train=dv.fit_transform(train_dict)
val_dict=df_val[base].to_dict(orient="records")
X_val = dv.transform(val_dict)

In [69]:
X_val

array([[2.300e+01, 4.000e+00, 2.100e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.700e+01, 6.000e+00, 3.540e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [2.200e+01, 4.000e+00, 1.400e+02, ..., 1.000e+00, 0.000e+00,
        2.005e+03],
       ...,
       [1.200e+01, 6.000e+00, 1.900e+02, ..., 0.000e+00, 0.000e+00,
        2.003e+03],
       [1.400e+01, 8.000e+00, 4.300e+02, ..., 0.000e+00, 0.000e+00,
        2.015e+03],
       [1.800e+01, 6.000e+00, 3.210e+02, ..., 1.000e+00, 0.000e+00,
        2.015e+03]])

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)

In [72]:
model.fit(X_train,y_train)

In [73]:
# Bias 
model.intercept_[0]

-0.3499031192168502

In [74]:
model.coef_.round(3)

array([[ 7.600e-02, -1.280e-01,  3.700e-02, -2.000e-03,  1.315e+00,
         1.774e+00,  5.850e-01,  2.762e+00,  2.142e+00,  1.760e-01,
         0.000e+00, -4.770e-01,  2.101e+00, -1.269e+00, -1.335e+00,
        -3.856e+00, -5.040e-01,  3.660e-01, -1.815e+00, -7.610e-01,
         6.050e-01, -2.380e-01, -1.248e+00, -2.693e+00,  3.750e-01,
        -1.490e+00,  1.000e-02,  1.928e+00,  1.248e+00,  1.197e+00,
         4.014e+00,  1.038e+00,  5.000e-03, -1.658e+00,  0.000e+00,
         8.740e-01, -1.991e+00, -8.740e-01, -1.418e+00, -3.450e-01,
        -3.168e+00,  2.021e+00,  9.580e-01,  8.920e-01, -2.320e-01,
         3.240e-01, -2.701e+00, -1.910e+00,  3.225e+00, -9.930e-01,
        -5.730e-01,  1.262e+00, -1.346e+00, -3.940e-01, -4.700e-02,
        -9.000e-03, -2.000e-03, -1.176e+00, -9.030e-01, -3.000e-03,
        -4.000e-03, -3.000e-03, -1.880e-01, -2.300e-02,  1.647e+00,
         4.320e-01, -4.520e-01, -6.500e-02, -1.640e-01, -1.610e-01,
        -0.000e+00, -1.600e-02, -9.940e-01,  1.5

In [75]:
model.predict(X_train)

array([0, 0, 1, ..., 0, 0, 0])

In [76]:
y_pred=model.predict_proba(X_val)[:,1]

In [77]:
y_pred

array([1.59213958e-03, 9.95013070e-01, 2.14143302e-04, ...,
       6.31038001e-04, 9.87567368e-01, 9.84569331e-01])

In [78]:
y_train

array([0, 0, 1, ..., 0, 0, 0])

In [79]:
above_average_decision=(y_pred>=0.5)

In [80]:
above_average_decision

array([False,  True, False, ..., False,  True,  True])

In [81]:
(y_val==above_average_decision).mean()

0.9454469156525388

In [82]:
df_pred=pd.DataFrame()

In [83]:
df_pred["probability"]=y_pred

In [84]:
df_pred["prediction"]=above_average_decision.astype("int")

In [85]:
df_pred["actual"]=y_val

In [86]:
df_pred["correct"]=df_pred.prediction == df_pred.actual

In [87]:
x=df_pred.correct.mean()

In [88]:
acc_orj=df_pred.correct.mean()

In [89]:
df_pred.head(50)

Unnamed: 0,probability,prediction,actual,correct
0,0.001592,0,0,True
1,0.995013,1,1,True
2,0.000214,0,0,True
3,0.206066,0,0,True
4,0.001812,0,0,True
5,0.000339,0,0,True
6,0.000947,0,0,True
7,0.137551,0,0,True
8,0.424989,0,0,True
9,0.000917,0,0,True


In [90]:
x-acc_orj

0.0

In [91]:
acc_orj

0.9454469156525388

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `year`
- `engine_hp`
- `transmission_type`
- `city_mpg`

> **Note**: the difference doesn't have to be positive

In [92]:
round(mi.sort_values(ascending=True),2)

transmission_type    0.02
highway_mpg          0.04
city_mpg             0.06
year                 0.07
vehicle_style        0.08
engine_cylinders     0.12
make                 0.24
engine_hp            0.36
model                0.46
dtype: float64

In [107]:
base=['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg',
 'city_mpg']

In [108]:
acc={}

n=len(base)

for i in range(len(base)):
    
    a=base.pop(i)
    df_pred=pd.DataFrame()
    dv = DictVectorizer(sparse=False)
    
    train_dict=df_train[base].to_dict(orient="records")
    X_train=dv.fit_transform(train_dict)
    val_dict=df_val[base].to_dict(orient="records")
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver='liblinear', C=10, max_iter=1000, random_state=42)
    model.fit(X_train,y_train)
    y_pred=model.predict_proba(X_val)[:,1]
    above_average_decision=(y_pred>=0.5)

    df_pred["probability"]=y_pred
    df_pred["prediction"]=above_average_decision.astype("int")
    df_pred["actual"]=y_val
    df_pred["correct"]=df_pred.prediction == df_pred.actual
    acc[a]=acc_orj - df_pred.correct.mean()
    
    base=['make',
 'model',
 'year',
 'engine_hp',
 'engine_cylinders',
 'transmission_type',
 'vehicle_style',
 'highway_mpg','city_mpg']

    

In [109]:
acc

{'make': -0.0012589173310952884,
 'model': 0.026017624842635367,
 'year': -0.0016785564414603105,
 'engine_hp': 0.022660511959714635,
 'engine_cylinders': 0.0,
 'transmission_type': 0.005035669324381042,
 'vehicle_style': 0.013428451531682706,
 'highway_mpg': -0.0012589173310952884,
 'city_mpg': -0.00041963911036513313}

### Question 6

* For this question, we'll see how to use a linear regression model from Scikit-Learn.
* We'll need to use the original column `price`. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data with a solver `'sag'`. Set the seed to `42`.
* This model also has a parameter `alpha`. Let's try the following values: `[0, 0.01, 0.1, 1, 10]`.
* Round your RMSE scores to 3 decimal digits.

Which of these alphas leads to the best RMSE on the validation set?

- 0
- 0.01
- 0.1
- 1
- 10

> **Note**: If there are multiple options, select the smallest `alpha`.

In [119]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt

In [115]:
df_train["price"]=np.log1p(df_train["price"])
df_val["price"]=np.log1p(df_val["price"])
df_test["price"]=np.log1p(df_test["price"])

In [122]:
y_train=df_train["price"].values
y_val=df_val["price"].values
y_test=df_test["price"].values

In [123]:
alpha_values = [0, 0.01, 0.1, 1, 10]

In [124]:
rmse_scores = {}
for alpha in alpha_values:
    model = Ridge(alpha=alpha, solver='sag', random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = round(sqrt(mean_squared_error(y_val, y_pred)), 3)
    
    rmse_scores[alpha] = rmse
    
for alpha, rmse in rmse_scores.items():
    print(f"Alpha: {alpha}, RMSE: {rmse}")
    



Alpha: 0, RMSE: 7.766
Alpha: 0.01, RMSE: 7.766
Alpha: 0.1, RMSE: 7.766
Alpha: 1, RMSE: 7.766
Alpha: 10, RMSE: 7.766


