# MPG Cars

### Introduction:

The following exercise uses data from the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Auto+MPG).

### Step 1. Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np

### Step 2. Import the two datasets, [cars1](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv) and [cars2](https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv).

### Step 3. Assign them to variables called cars1 and cars2, respectively

In [7]:
# Source CSVs: the Auto MPG data is split across two files in the same repository folder.
url_1 = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars1.csv'
url_2 = 'https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/05_Merge/Auto_MPG/cars2.csv'

# Read each comma-delimited file into its own DataFrame.
cars1, cars2 = pd.read_csv(url_1, sep=','), pd.read_csv(url_2, sep=',')

# Preview the first rows of both frames, one after the other.
print(cars1.head(), cars2.head(), sep='\n')

    mpg  cylinders  displacement horsepower  weight  acceleration  model  \
0  18.0          8           307        130    3504          12.0     70   
1  15.0          8           350        165    3693          11.5     70   
2  18.0          8           318        150    3436          11.0     70   
3  16.0          8           304        150    3433          12.0     70   
4  17.0          8           302        140    3449          10.5     70   

   origin                        car  Unnamed: 9  Unnamed: 10  Unnamed: 11  \
0       1  chevrolet chevelle malibu         NaN          NaN          NaN   
1       1          buick skylark 320         NaN          NaN          NaN   
2       1         plymouth satellite         NaN          NaN          NaN   
3       1              amc rebel sst         NaN          NaN          NaN   
4       1                ford torino         NaN          NaN          NaN   

   Unnamed: 12  Unnamed: 13  
0          NaN          NaN  
1          NaN

### Step 4. Oops, it seems our first dataset has some unnamed blank columns, fix cars1

In [8]:
# Show the column labels of both frames, one per line, so they can be compared.
print(cars1.columns, cars2.columns, sep='\n')

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model', 'origin', 'car', 'Unnamed: 9', 'Unnamed: 10',
       'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13'],
      dtype='object')
Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model', 'origin', 'car'],
      dtype='object')


In [9]:
# Keep only the labelled span of columns from 'mpg' through 'car' (both inclusive);
# the 'Unnamed: N' columns come after 'car' and are therefore left out.
cars1 = cars1.loc[:, slice('mpg', 'car')]
cars1.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302,140,3449,10.5,70,1,ford torino


### Step 5. What is the number of observations in each dataset?

In [10]:
# Report (rows, columns) for each frame, one tuple per line.
print(cars1.shape, cars2.shape, sep='\n')

(198, 9)
(200, 9)


### Step 6. Join cars1 and cars2 into a single DataFrame called cars

In [13]:
# Stack cars2 underneath cars1 to form the combined frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0. ignore_index is deliberately not set, so each frame keeps
# its own row labels exactly as append did.
cars = pd.concat([cars1, cars2])
cars

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
195,27.0,4,140,86,2790,15.6,82,1,ford mustang gl
196,44.0,4,97,52,2130,24.6,82,2,vw pickup
197,32.0,4,135,84,2295,11.6,82,1,dodge rampage
198,28.0,4,120,79,2625,18.6,82,1,ford ranger


In [14]:
# Summarize the combined frame: entry count, index range, and each column's
# non-null count and dtype.
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 199
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    int64  
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model         398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car           398 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 31.1+ KB


### Step 7. Oops, there is a column missing, called owners. Create a random number Series from 15,000 to 73,000.
- `np.random.randint(low, high=None, size=None, dtype=int)`
- `size` may be an int or a tuple of ints.

In [19]:
# Seed NumPy's global generator so a fresh Restart & Run All reproduces the same values.
np.random.seed(42)

# Draw one random integer per row of cars instead of hard-coding the row count.
# randint's upper bound is exclusive, so high=73001 makes 73,000 itself a possible value.
owners = np.random.randint(15000, high=73001, size=len(cars))
owners

array([67790, 59469, 40627, 69782, 46013, 33695, 36987, 36706, 29771,
       64489, 34369, 21592, 64119, 46564, 46552, 37075, 37827, 44545,
       52524, 39961, 71157, 46356, 37688, 53593, 52569, 55607, 67324,
       15937, 55082, 72972, 40782, 36037, 63760, 37776, 15216, 24876,
       56851, 61964, 41291, 21208, 36088, 46038, 65554, 54225, 68395,
       58046, 45402, 38828, 24445, 31899, 38802, 29983, 29795, 38088,
       64915, 30809, 65914, 53548, 48305, 22462, 31666, 36510, 48605,
       44804, 32123, 65298, 26983, 52411, 29977, 62014, 15781, 49464,
       15250, 35186, 63704, 21466, 56136, 23327, 56245, 17680, 59361,
       57806, 38290, 28796, 56816, 30171, 66149, 40750, 70439, 52338,
       51903, 34859, 38148, 69165, 19824, 54942, 15284, 63972, 35585,
       20806, 46075, 56723, 50635, 42443, 70856, 47536, 45313, 56746,
       38777, 31725, 71157, 47779, 25224, 41362, 15301, 42901, 55397,
       72116, 21375, 15720, 29541, 56858, 44831, 23419, 61580, 28205,
       70514, 65017,

### Step 8. Add the column owners to cars

In [20]:
# Store the generated owners values in a new 'owners' column of cars,
# then preview the first rows to confirm the column was added.
cars['owners'] = owners
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car,owners
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu,67790
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320,59469
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite,40627
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst,69782
4,17.0,8,302,140,3449,10.5,70,1,ford torino,46013
