# Joining Data with Pandas

***

## Data Merging Basics

### Importing Libraries

In [1]:
# Import libraries
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Load the 1st dataset: taxi_owners.
# The context manager closes the file handle as soon as unpickling finishes;
# the previous bare open(...) call left it open until garbage collection.
# NOTE: pickle can execute arbitrary code while loading - only load trusted files.

with open(
    "D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/taxi_owners.p", "rb"
) as owners_file:
    taxi_owners = pickle.load(owners_file)

In [4]:
# Load the 2nd dataset: taxi_veh (taxi vehicles).
# The context manager closes the file handle as soon as unpickling finishes;
# the previous bare open(...) call left it open until garbage collection.
# NOTE: pickle can execute arbitrary code while loading - only load trusted files.

with open(
    "D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/taxi_veh.p", "rb"
) as vehicles_file:
    taxi_veh = pickle.load(vehicles_file)

---

### Showing Datasets

In [6]:
# Preview the first five rows of the taxi owners table
taxi_owners.head(n=5)

Unnamed: 0,rid,vid,owner,address,zip
0,T6285,6285,AGEAN TAXI LLC,4536 N. ELSTON AVE.,60630
1,T4862,4862,MANGIB CORP.,5717 N. WASHTENAW AVE.,60659
2,T1495,1495,"FUNRIDE, INC.",3351 W. ADDISON ST.,60618
3,T4231,4231,ALQUSH CORP.,6611 N. CAMPBELL AVE.,60645
4,T5971,5971,EUNIFFORD INC.,3351 W. ADDISON ST.,60618


In [7]:
# Preview the first five rows of the taxi vehicles table
taxi_veh.head(n=5)

Unnamed: 0,vid,make,model,year,fuel_type,owner
0,2767,TOYOTA,CAMRY,2013,HYBRID,SEYED M. BADRI
1,1411,TOYOTA,RAV4,2017,HYBRID,DESZY CORP.
2,6500,NISSAN,SENTRA,2019,GASOLINE,AGAPH CAB CORP
3,2746,TOYOTA,CAMRY,2013,HYBRID,"MIDWEST CAB CO, INC"
4,5922,TOYOTA,CAMRY,2013,HYBRID,SUMETTI CAB CO


#### Total Columns and Rows in these datasets

In [8]:
# Dimensions of the taxi owners table as a (rows, columns) tuple
taxi_owners.shape

(3519, 5)

The taxi owners dataset has 5 columns and 3,519 rows.

In [9]:
# Dimensions of the taxi vehicles table as a (rows, columns) tuple
taxi_veh.shape

(3519, 6)

The taxi vehicles dataset has 6 columns and 3,519 rows.

---

### Inner Join

**Merge `taxi_owners` with `taxi_veh` on the column `vid`, and save the result to `taxi_owners_veh`.**

In [10]:
# Inner-join the owners table (left) with the vehicles table (right).
# Rows are matched on the shared key column 'vid'; pandas' default join
# type for merge is 'inner'.

taxi_owners_veh = pd.merge(taxi_owners, taxi_veh, on='vid')

In [12]:
# Print the column labels of the merged dataset. 'owner' exists in both
# inputs, so it shows up twice with the default '_x' / '_y' suffixes.
print(taxi_owners_veh.columns)

Index(['rid', 'vid', 'owner_x', 'address', 'zip', 'make', 'model', 'year',
       'fuel_type', 'owner_y'],
      dtype='object')


In [13]:
# Preview the first five rows of the merged owners/vehicles table
taxi_owners_veh.head(n=5)

Unnamed: 0,rid,vid,owner_x,address,zip,make,model,year,fuel_type,owner_y
0,T6285,6285,AGEAN TAXI LLC,4536 N. ELSTON AVE.,60630,NISSAN,ALTIMA,2011,HYBRID,AGEAN TAXI LLC
1,T4862,4862,MANGIB CORP.,5717 N. WASHTENAW AVE.,60659,HONDA,CRV,2014,GASOLINE,MANGIB CORP.
2,T1495,1495,"FUNRIDE, INC.",3351 W. ADDISON ST.,60618,TOYOTA,SIENNA,2015,GASOLINE,"FUNRIDE, INC."
3,T4231,4231,ALQUSH CORP.,6611 N. CAMPBELL AVE.,60645,TOYOTA,CAMRY,2014,HYBRID,ALQUSH CORP.
4,T5971,5971,EUNIFFORD INC.,3351 W. ADDISON ST.,60618,TOYOTA,SIENNA,2015,GASOLINE,EUNIFFORD INC.


**Set the left and right table suffixes for overlapping columns of the merge to _own and _veh, respectively.**

In [17]:
# Redo the merge on 'vid', labelling the overlapping 'owner' column with
# '_own' for the left (owners) table and '_veh' for the right (vehicles) table.
taxi_owners_veh = pd.merge(
    taxi_owners, taxi_veh, on='vid', suffixes=('_own', '_veh')
)

In [18]:
# Preview the first five rows to confirm the '_own' / '_veh' suffixes
taxi_owners_veh.head(n=5)

Unnamed: 0,rid,vid,owner_own,address,zip,make,model,year,fuel_type,owner_veh
0,T6285,6285,AGEAN TAXI LLC,4536 N. ELSTON AVE.,60630,NISSAN,ALTIMA,2011,HYBRID,AGEAN TAXI LLC
1,T4862,4862,MANGIB CORP.,5717 N. WASHTENAW AVE.,60659,HONDA,CRV,2014,GASOLINE,MANGIB CORP.
2,T1495,1495,"FUNRIDE, INC.",3351 W. ADDISON ST.,60618,TOYOTA,SIENNA,2015,GASOLINE,"FUNRIDE, INC."
3,T4231,4231,ALQUSH CORP.,6611 N. CAMPBELL AVE.,60645,TOYOTA,CAMRY,2014,HYBRID,ALQUSH CORP.
4,T5971,5971,EUNIFFORD INC.,3351 W. ADDISON ST.,60618,TOYOTA,SIENNA,2015,GASOLINE,EUNIFFORD INC.


In [19]:
# Count how many merged rows fall under each fuel type
(
    taxi_owners_veh
    .loc[:, 'fuel_type']
    .value_counts()
)

HYBRID                    2792
GASOLINE                   611
FLEX FUEL                   89
COMPRESSED NATURAL GAS      27
Name: fuel_type, dtype: int64

---

### New Datasets

**Chicago Wards and Chicago census dataset**

In [20]:
# Chicago wards dataset
wards = pickle.load(
    open("D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/ward.p", "rb")
)

In [21]:
# Print the dimensions of the wards dataset as a (rows, columns) tuple
print("Total rows and columns of wards are: ", wards.shape)

Total rows and columns of wards are:  (50, 4)


In [22]:
# Preview the first five rows of the wards table
wards.head(n=5)

Unnamed: 0,ward,alderman,address,zip
0,1,"Proco ""Joe"" Moreno",2058 NORTH WESTERN AVENUE,60647
1,2,Brian Hopkins,1400 NORTH ASHLAND AVENUE,60622
2,3,Pat Dowell,5046 SOUTH STATE STREET,60609
3,4,William D. Burns,"435 EAST 35TH STREET, 1ST FLOOR",60616
4,5,Leslie A. Hairston,2325 EAST 71ST STREET,60649


In [23]:
# Chicago census dataset

census = pickle.load(
    open("D:/git_repositories/Datacamp-Joining_data_with_pandas/Datasets/census.p", "rb")
)

In [24]:
# Print the dimensions of the census dataset as a (rows, columns) tuple
print("Total rows and columns of census dataset is: ", census.shape)

Total rows and columns of census dataset is:  (50, 6)


In [25]:
# Preview the first five rows of the census table
census.head(n=5)

Unnamed: 0,ward,pop_2000,pop_2010,change,address,zip
0,1,52951,56149,6%,2765 WEST SAINT MARY STREET,60647
1,2,54361,55805,3%,WM WASTE MANAGEMENT 1500,60622
2,3,40385,53039,31%,17 EAST 38TH STREET,60653
3,4,51953,54589,5%,31ST ST HARBOR BUILDING LAKEFRONT TRAIL,60653
4,5,55302,51455,-7%,JACKSON PARK LAGOON SOUTH CORNELL DRIVE,60637


#### Merging wards and census dataset