## download and read the csv file and make ID the index column

In [58]:
import pandas as pd

# read_csv already returns a DataFrame indexed by ID, so wrapping the result in
# pd.DataFrame(...) adds nothing.
data = pd.read_csv('../data/auto.csv', index_col='ID')
# Work on an explicit copy so the cleaning steps that mutate `df` cannot touch
# the raw frame kept in `data`.
df = data.copy()
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
4,E34877152RUS,Ford Focus,2.0,6100.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


### count the number of observations using method count()

In [59]:
# count() returns a Series labelled by column name; take the first entry by
# position with .iloc, because an integer key on a label-indexed Series relies
# on a deprecated positional fallback.
df.count().iloc[0]

931

## drop the duplicates taking into account only the following columns: CarNumber, Make_n_model, Fines
### of any two identical observations, keep the last one

In [60]:
# Rows are considered duplicates when they agree on all of these columns.
dedup_columns = ['CarNumber', 'Make_n_model', 'Fines']
# Keep the last occurrence of each duplicated group and modify df in place.
df.drop_duplicates(
    subset=dedup_columns,
    keep='last',
    inplace=True,
)
df

Unnamed: 0_level_0,CarNumber,Make_n_model,Refund,Fines,History
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,Ford Focus,2.0,3200.0,
1,E432XX77RUS,Toyota Camry,1.0,6500.0,
2,7184TT36RUS,Ford Focus,1.0,2100.0,
3,X582HE161RUS,Ford Focus,2.0,2000.0,
5,92918M178RUS,Ford Focus,1.0,5700.0,
...,...,...,...,...,...
926,Y163O8161RUS,Ford Focus,2.0,1600.0,
927,M0309X197RUS,Ford Focus,1.0,22300.0,
928,O673E8197RUS,Ford Focus,2.0,600.0,
929,8610T8154RUS,Ford Focus,1.0,2000.0,


### check again the number of observations

In [61]:
# count() returns a Series labelled by column name; take the first entry by
# position with .iloc, because an integer key on a label-indexed Series relies
# on a deprecated positional fallback.
df.count().iloc[0]

725

## **work with missing values**

### check how many missing values are in each column

In [62]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

CarNumber         0
Make_n_model      0
Refund           12
Fines            60
History         660
dtype: int64

### drop all the columns that have more than 500 missing values using the argument thresh, then check how many missing values are in each column

In [63]:
# `thresh` is the minimum number of NON-missing values a column needs in order
# to be kept. To drop columns with MORE than 500 missing values, a column must
# therefore have at least len(df) - 500 present values. Passing thresh=500
# directly would instead drop every column with fewer than 500 present values.
MAX_MISSING = 500
min_present = max(len(df) - MAX_MISSING, 0)
df.dropna(axis='columns', thresh=min_present, inplace=True)
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund          12
Fines           60
dtype: int64

### replace all the missing values in the Refund column by the previous value in this column for that cell, use the argument method, check how many missing values are in each column


In [64]:
# Forward-fill: each missing Refund takes the previous value in the column.
# Series.ffill() is the direct equivalent of fillna(method='ffill'); the
# `method` argument of fillna is deprecated in recent pandas releases.
df['Refund'] = df['Refund'].ffill()
df.isnull().sum()

CarNumber        0
Make_n_model     0
Refund           0
Fines           60
dtype: int64

### replace all the missing values in the Fines column by the mean value of this column (exclude NA/null values when computing the mean value), check how many missing values are in each column

In [65]:
# Mean of the known fines; missing entries are skipped when averaging.
fines_mean = df['Fines'].mean(skipna=True)
# Substitute that mean for every missing fine.
df['Fines'] = df['Fines'].fillna(fines_mean)
df.isnull().sum()

CarNumber       0
Make_n_model    0
Refund          0
Fines           0
dtype: int64

## **split and parse the make and model**
 ### use the method apply both for splitting and for extracting the values to the new columns Make and Model
 ### drop the column Make_n_model

In [66]:
# Split each "Make Model ..." string once into a plain list of at most three
# parts. Building a pd.Series per row inside apply is much slower and yields a
# frame whose column 1 does not exist when no row has a second word.
name_parts = df['Make_n_model'].apply(lambda full_name: full_name.split(' ', 2))

# First word is the make; second word (when present) is the model, otherwise NaN.
df['Make'] = name_parts.apply(lambda parts: parts[0])
df['Model'] = name_parts.apply(
    lambda parts: parts[1] if len(parts) > 1 else float('nan')
)
df.drop(['Make_n_model'], axis='columns', inplace=True)
df

Unnamed: 0_level_0,CarNumber,Refund,Fines,Make,Model
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,Y163O8161RUS,2.0,3200.000000,Ford,Focus
1,E432XX77RUS,1.0,6500.000000,Toyota,Camry
2,7184TT36RUS,1.0,2100.000000,Ford,Focus
3,X582HE161RUS,2.0,2000.000000,Ford,Focus
5,92918M178RUS,1.0,5700.000000,Ford,Focus
...,...,...,...,...,...
926,Y163O8161RUS,2.0,1600.000000,Ford,Focus
927,M0309X197RUS,1.0,22300.000000,Ford,Focus
928,O673E8197RUS,2.0,600.000000,Ford,Focus
929,8610T8154RUS,1.0,2000.000000,Ford,Focus


save the dataframe in the JSON file auto.json in the format below:

In [67]:
# Write the cleaned frame as a JSON array with one object per row.
json_path = '../data/auto.json'
df.to_json(json_path, orient='records')