## read the JSON file that you saved in ex02
### one of the columns has the float type, so let us define the format of it in pandas using pd.options.display.float_format: floats should be displayed with two decimals
### there are missing values in Model, do not do anything with them

In [79]:
import pandas as pd
import numpy as np
import requests

data = pd.read_json('../data/auto.json')
pd.options.display.float_format = '{:,.2f}'.format
df = pd.DataFrame(data)

df

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
720,Y163O8161RUS,2,1600.00,Ford,Focus
721,M0309X197RUS,1,22300.00,Ford,Focus
722,O673E8197RUS,2,600.00,Ford,Focus
723,8610T8154RUS,1,2000.00,Ford,Focus


## enrich the dataframe using a sample from that dataframe

### create a sample with 200 new observations with the random_state = 21
* the sample should not have new combinations of the car number, make
and model – so the whole dataset will be consistent in these terms
* the refund and fines do not have any restrictions, you can randomly take
any value from these columns and use it towards any car number


In [80]:
samples = df.sample(200, random_state=21)

np.random.seed(21)
samples['Refund'] = np.random.randint(0, high=3, size=200)
samples['Fines'] = np.random.randint(100, high=30_000, size=200)
samples


Unnamed: 0,CarNumber,Refund,Fines,Make,Model
445,M0299X197RUS,1,7527,Ford,Focus
22,83298C154RUS,0,13178,Ford,Focus
93,H957HY161RUS,0,21487,Ford,Focus
173,T941CC96RUS,0,26103,Ford,Focus
697,H966HY161RUS,0,583,Ford,Focus
...,...,...,...,...,...
14,8182XX154RUS,2,29000,Ford,Focus
623,X796TH96RUS,2,17899,Ford,Focus
498,T011MY163RUS,2,9666,Ford,Focus
536,T341CC96RUS,1,572,Volkswagen,Passat


### concatenate the sample with the initial dataframe to a new dataframe concat_rows

In [81]:
concat_rows = pd.concat([df, samples])
concat_rows

Unnamed: 0,CarNumber,Refund,Fines,Make,Model
0,Y163O8161RUS,2,3200.00,Ford,Focus
1,E432XX77RUS,1,6500.00,Toyota,Camry
2,7184TT36RUS,1,2100.00,Ford,Focus
3,X582HE161RUS,2,2000.00,Ford,Focus
4,92918M178RUS,1,5700.00,Ford,Focus
...,...,...,...,...,...
14,8182XX154RUS,2,29000.00,Ford,Focus
623,X796TH96RUS,2,17899.00,Ford,Focus
498,T011MY163RUS,2,9666.00,Ford,Focus
536,T341CC96RUS,1,572.00,Volkswagen,Passat


##  enrich the dataframe concat_rows by a new column with generated data
### create a series with the name Year using random integers from 1980 to 2019

In [82]:
total_length = len(concat_rows)

concat_rows['Year'] = np.random.randint(low=1980, high=2019, size=total_length)
fines = concat_rows
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,2005
1,E432XX77RUS,1,6500.00,Toyota,Camry,1996
2,7184TT36RUS,1,2100.00,Ford,Focus,1998
3,X582HE161RUS,2,2000.00,Ford,Focus,1992
4,92918M178RUS,1,5700.00,Ford,Focus,2004
...,...,...,...,...,...,...
14,8182XX154RUS,2,29000.00,Ford,Focus,2015
623,X796TH96RUS,2,17899.00,Ford,Focus,1997
498,T011MY163RUS,2,9666.00,Ford,Focus,2003
536,T341CC96RUS,1,572.00,Volkswagen,Passat,2015


## enrich the dataframe by the data from another dataframe

*  create a new dataframe with the car numbers and their owners
* get the most popular surnames(you can find the file surname.json in
the attachments) in the US
* create a new series with the surnames from the data you gathered, the count should be equal to the number of the unique car numbers using sample
(use random_state = 21)
* create the dataframe owners with 2 columns: CarNumber and SURNAME

In [83]:
surnames = pd.DataFrame(pd.read_json('../data/surname.json', orient='records'))
surnames.columns = surnames.iloc[0]
surnames = surnames.iloc[1:]
uniq_car_numbers = list(fines.CarNumber.unique())
surnames = surnames['NAME'].sample(len(uniq_car_numbers), random_state=21, replace=True)

owners = pd.DataFrame({ 'CarNumber': uniq_car_numbers, 'SURNAME': surnames})
owners

Unnamed: 0,CarNumber,SURNAME
74,Y163O8161RUS,RICHARDSON
80,E432XX77RUS,ROSS
57,7184TT36RUS,MORGAN
5,X582HE161RUS,BAILEY
49,92918M178RUS,LOPEZ
...,...,...
10,O136HO197RUS,CAMPBELL
32,O22097197RUS,HALL
6,M0309X197RUS,BAKER
21,O673E8197RUS,DIAZ


### append to the fines dataframe 5 more observations (come up with your own ideas of CarNumber, etc.)


In [84]:
fines = fines.append(pd.DataFrame(
        [['FFF123RUS', 1, 10000, 'BMW', 'X5', 2019],
        ['UB4153RUS', 2, 1000, 'Lada', 'Granta', 2016],
        ['CNF363RUS', 1, 2600, 'Liaz', 'Pandas', 2012],
        ['FFF123RUS', 2, 2350, 'BMW', 'X5', 2019],
        ['UFF865RUS', 0, 2360, 'UAZ', 'Patriot', 2017],]
        , columns=['CarNumber', 'Refund', 'Fines', 'Make', 'Model', 'Year']
    ),  ignore_index=True
)
fines

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year
0,Y163O8161RUS,2,3200.00,Ford,Focus,2005
1,E432XX77RUS,1,6500.00,Toyota,Camry,1996
2,7184TT36RUS,1,2100.00,Ford,Focus,1998
3,X582HE161RUS,2,2000.00,Ford,Focus,1992
4,92918M178RUS,1,5700.00,Ford,Focus,2004
...,...,...,...,...,...,...
925,FFF123RUS,1,10000.00,BMW,X5,2019
926,UB4153RUS,2,1000.00,Lada,Granta,2016
927,CNF363RUS,1,2600.00,Liaz,Pandas,2012
928,FFF123RUS,2,2350.00,BMW,X5,2019


### delete from the owners dataframe last 20 observations and add 3 new observations (they are not the same as those you add to the fines dataframe)


In [85]:
owners = owners[:-20]
owners = owners.append([
    { 'CarNumber': 'FQW675RUS', 'SURNAME': 'GRIFFIN'},
    { 'CarNumber': 'AWF325RUS', 'SURNAME': 'JEFFERSON'},
    { 'CarNumber': 'FAW421RUS', 'SURNAME': 'ADAMS'}
    ], ignore_index=True)
owners

Unnamed: 0,CarNumber,SURNAME
0,Y163O8161RUS,RICHARDSON
1,E432XX77RUS,ROSS
2,7184TT36RUS,MORGAN
3,X582HE161RUS,BAILEY
4,92918M178RUS,LOPEZ
...,...,...
509,O50197197RUS,WRIGHT
510,7608EE777RUS,HILL
511,FQW675RUS,GRIFFIN
512,AWF325RUS,JEFFERSON


### join both dataframes:
* the new dataframe should have only the car numbers that exist in both dataframes

In [86]:
fines.merge(owners,how='inner',on='CarNumber')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2005,RICHARDSON
1,Y163O8161RUS,2,1600.00,Ford,Focus,2008,RICHARDSON
2,E432XX77RUS,1,6500.00,Toyota,Camry,1996,ROSS
3,E432XX77RUS,2,13000.00,Toyota,Camry,2018,ROSS
4,7184TT36RUS,1,2100.00,Ford,Focus,1998,MORGAN
...,...,...,...,...,...,...,...
894,E41977152RUS,2,2400.00,Ford,Focus,2013,BAKER
895,9464EX178RUS,2,2100.00,Ford,Focus,2007,MARTIN
896,O50197197RUS,2,7800.00,Ford,Focus,1990,WRIGHT
897,7608EE777RUS,1,4000.00,Skoda,Octavia,2018,HILL


### the new dataframe should have all the car numbers that exist in both dataframes


In [87]:
fines.merge(owners,how='outer',on='CarNumber')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2005.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,2008.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1996.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1998.00,MORGAN
...,...,...,...,...,...,...,...
928,CNF363RUS,1.00,2600.00,Liaz,Pandas,2012.00,
929,UFF865RUS,0.00,2360.00,UAZ,Patriot,2017.00,
930,FQW675RUS,,,,,,GRIFFIN
931,AWF325RUS,,,,,,JEFFERSON


### the new dataframe should have only the car numbers from the fines dataframe

In [88]:
fines.merge(owners,how='left',on='CarNumber')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2,3200.00,Ford,Focus,2005,RICHARDSON
1,E432XX77RUS,1,6500.00,Toyota,Camry,1996,ROSS
2,7184TT36RUS,1,2100.00,Ford,Focus,1998,MORGAN
3,X582HE161RUS,2,2000.00,Ford,Focus,1992,BAILEY
4,92918M178RUS,1,5700.00,Ford,Focus,2004,LOPEZ
...,...,...,...,...,...,...,...
925,FFF123RUS,1,10000.00,BMW,X5,2019,
926,UB4153RUS,2,1000.00,Lada,Granta,2016,
927,CNF363RUS,1,2600.00,Liaz,Pandas,2012,
928,FFF123RUS,2,2350.00,BMW,X5,2019,


### the new dataframe should have only the car numbers from the owners dataframe


In [89]:
fines.merge(owners,how='right',on='CarNumber')

Unnamed: 0,CarNumber,Refund,Fines,Make,Model,Year,SURNAME
0,Y163O8161RUS,2.00,3200.00,Ford,Focus,2005.00,RICHARDSON
1,Y163O8161RUS,2.00,1600.00,Ford,Focus,2008.00,RICHARDSON
2,E432XX77RUS,1.00,6500.00,Toyota,Camry,1996.00,ROSS
3,E432XX77RUS,2.00,13000.00,Toyota,Camry,2018.00,ROSS
4,7184TT36RUS,1.00,2100.00,Ford,Focus,1998.00,MORGAN
...,...,...,...,...,...,...,...
897,7608EE777RUS,1.00,4000.00,Skoda,Octavia,2018.00,HILL
898,7608EE777RUS,2.00,7765.00,Skoda,Octavia,1996.00,HILL
899,FQW675RUS,,,,,,GRIFFIN
900,AWF325RUS,,,,,,JEFFERSON


## create a pivot table from the fines dataframe, it should look like this (the values are the sums of the fines), but only with all the years (the values can be different for you):

In [103]:
fines.pivot_table(columns='Year', index=['Make','Model'], values='Fines', aggfunc=sum)

Unnamed: 0_level_0,Year,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
Make,Model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
BMW,X5,,,,,,,,,,,...,,,,,,,,,,12350.0
Ford,Focus,148018.59,112362.59,257239.0,96863.17,105726.0,236836.0,144476.17,201930.59,166592.17,130577.76,...,172272.0,368431.17,229330.0,203406.59,129548.0,169735.76,97176.76,171449.59,85242.0,
Ford,Mondeo,,,,34400.0,,,7800.0,,,,...,,8600.0,,,,,,,2200.0,
Lada,Granta,,,,,,,,,,,...,,,,,,,1000.0,,,
Liaz,Pandas,,,,,,,,,,,...,,,2600.0,,,,,,,
Skoda,Octavia,,1588.0,73000.0,300.0,600.0,31775.0,2400.0,28494.59,,8200.0,...,16120.0,36829.0,4400.0,9800.0,28040.0,,,,9100.0,
Toyota,Camry,,,,,1844.0,,8594.59,,7500.0,1000.0,...,,,11600.0,,,12000.0,19089.0,4150.0,13000.0,
Toyota,Corolla,,,,,12154.0,,,,,9600.0,...,4400.0,,,574.0,,39500.0,,12700.0,15004.0,
UAZ,Patriot,,,,,,,,,,,...,,,,,,,,2360.0,,
Volkswagen,Golf,4600.0,,,,,10600.0,,7841.0,,12547.0,...,,,,44119.0,,,17508.0,4873.0,,


 ## save both dataframes fines and owners to csv files without index

In [105]:
fines.to_csv('../data/fines.csv', index=False)
owners.to_csv('../data/owners.csv', index=False)