# Import the `auto-mpg dataset`
- Download the `auto-mpg dataset`

In [34]:
import pandas as pd

# convert to dataframe object df
df = pd.read_csv('auto-mpg.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,CAR Number
0,18,8,307.0,130,3504,12.0,70.0,1,chevrolet chevelle malibu,L6V 043
1,15,8,350.0,165,3693,11.5,70.0,1,buick skylark 320,RTY079
2,18,8,318.0,150,3436,11.0,70.0,1,plymouth satellite,MTP600
3,16,8,304.0,150,3433,12.0,70.0,1,amc rebel sst,MNJ000
4,17,8,302.0,140,3449,10.5,70.0,1,ford torino,JEETMEET


# Drop column `Origin` from the dataset
```python
DataFrame.drop(labels=None, *, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise')
```
## axis {0 or ‘index’, 1 or ‘columns’}, default 0
- Whether to drop labels from the index (0 or ‘index’) or columns (1 or ‘columns’).
### Reference
- https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop.html

In [35]:
df = df.drop('origin',axis=1)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,car name,CAR Number
0,18,8,307.0,130,3504,12.0,70.0,chevrolet chevelle malibu,L6V 043
1,15,8,350.0,165,3693,11.5,70.0,buick skylark 320,RTY079
2,18,8,318.0,150,3436,11.0,70.0,plymouth satellite,MTP600
3,16,8,304.0,150,3433,12.0,70.0,amc rebel sst,MNJ000
4,17,8,302.0,140,3449,10.5,70.0,ford torino,JEETMEET


# Check data type whether it's appropriate and then transform data

In [58]:
# check datatypes
# df.info()

# noted datatypes for model year is float64 and change it to int64 because year doesn't have decimal
df['model year'] = df['model year'].astype('int',errors='ignore')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           98 non-null     int64  
 1   cylinders     98 non-null     int64  
 2   displacement  98 non-null     float64
 3   horsepower    98 non-null     int64  
 4   weight        98 non-null     int64  
 5   acceleration  96 non-null     float64
 6   model year    96 non-null     float64
 7   car name      98 non-null     object 
 8   CAR Number    91 non-null     object 
dtypes: float64(3), int64(4), object(2)
memory usage: 7.0+ KB


# Find out all the missing values in the dataset and replace it with its most appropriate replacement. 
Finding out missing values: We can use `.sum()` method after applying `.isnull()` this will return sum of missing values within each columns in the dataframe
Note: `isna()` is the updated function
```python
import pandas as pd
df = pd.read_csv('IMDB-Movie-Data.csv')
df.isnull().sum()
```

In [62]:
# find the sum value of null found in the dataset
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    2
model year      2
car name        0
CAR Number      7
dtype: int64

# remove null values and replace with most appropriate replacement
- References: https://sparkbyexamples.com/pandas/pandas-get-column-average-mean/
- References: https://www.tutorialspoint.com/how-to-display-most-frequent-value-in-a-pandas-series
- Since car number are unique ID therefore I will leave it as is

In [66]:
# Mean value for each column for acceleration
mean_acceleration = round(df['acceleration'].mean(),1)

# Use mode for the highest frequency the year the car was made
mode_year = df['model year'].mode()[0]

# Fillna() on multiple columns with different values
df =  df.fillna(value={'acceleration':mean_acceleration,'model year':mode_year})
df.isna().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
model year      0
car name        0
CAR Number      7
dtype: int64