# Joining Data with pandas - 1

In [1]:
import pandas as pd
import numpy as np

# Load the nb_black IPython extension into this kernel session
# (NOTE(review): presumably used to auto-format cells with Black — the
# extension must be installed in the kernel environment for this line to work).
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Load the pickled input tables from the local data/ directory.
# NOTE: unpickling can execute arbitrary code, so only load files from a
# trusted source.
taxi_owners, taxi_veh = (
    pd.read_pickle("data/taxi_owners.p"),
    pd.read_pickle("data/taxi_vehicles.p"),
)
wards, census = (
    pd.read_pickle("data/ward.p"),
    pd.read_pickle("data/census.p"),
)

<IPython.core.display.Javascript object>

## Inner Join

In [3]:
# Join owners to vehicles on the shared "vid" key (merge defaults to an inner
# join). Any other column name present in both tables gets an "_own" or
# "_veh" suffix so the two versions stay distinguishable.
taxi_own_veh = pd.merge(
    taxi_owners,
    taxi_veh,
    on="vid",
    suffixes=("_own", "_veh"),
)

# Count rows per fuel_type; value_counts lists the most frequent value first
print(taxi_own_veh["fuel_type"].value_counts())

HYBRID                    2792
GASOLINE                   611
FLEX FUEL                   89
COMPRESSED NATURAL GAS      27
Name: fuel_type, dtype: int64


<IPython.core.display.Javascript object>

In [4]:
# Replace the ward label "1" with None in the census table.
# This modifies `census` itself, so every later use of `census` in the
# notebook sees the altered value.
census.loc[census["ward"].eq("1"), "ward"] = None

# Join wards to census on "ward" (default inner join: only wards present in
# both tables are kept)
wards_census = pd.merge(wards, census, on="ward")

# Show (row count, column count) of the joined table
print(wards_census.shape)

(49, 9)


<IPython.core.display.Javascript object>

## One to Many Relationships

In [5]:
# Load the business-owner and license tables from their pickle files
biz_owners, licenses = (
    pd.read_pickle("data/business_owners.p"),
    pd.read_pickle("data/licenses.p"),
)

<IPython.core.display.Javascript object>

In [6]:
# Join licenses to business owners on the shared "account" key
licenses_owners = licenses.merge(biz_owners, on="account")

# For every title, count the non-null account values
counted_df = licenses_owners.groupby(["title"])[["account"]].count()

# Order the titles from the highest to the lowest account count
sorted_df = counted_df.sort_values("account", ascending=False)

# Show the first rows of the sorted counts
print(sorted_df.head())

                 account
title                   
PRESIDENT           6259
SECRETARY           5205
SOLE PROPRIETOR     1658
OTHER               1200
VICE PRESIDENT       970


<IPython.core.display.Javascript object>

In [7]:
# Load the pickled land_use table from the local data/ directory
land_use = pd.read_pickle("data/land_use.p")

<IPython.core.display.Javascript object>

In [8]:
# Chain two joins on "ward": land_use with census first, then that result
# with licenses. In the second join, column names found on both sides get a
# "_cen" (left) or "_lic" (right) suffix.
land_cen_lic = pd.merge(
    pd.merge(land_use, census, on="ward"),
    licenses,
    on="ward",
    suffixes=("_cen", "_lic"),
)

# One row per (ward, pop_2010, vacant) combination with its count of accounts;
# as_index=False keeps the grouping keys as ordinary columns.
pop_vac_lic = (
    land_cen_lic
    .groupby(["ward", "pop_2010", "vacant"], as_index=False)
    .agg({"account": "count"})
)

# Order by vacant (largest first), then account and pop_2010 (smallest first)
sorted_pop_vac_lic = pop_vac_lic.sort_values(
    ["vacant", "account", "pop_2010"],
    ascending=[False, True, True],
)

# Show the first rows of the sorted result
print(sorted_pop_vac_lic.head())

   ward  pop_2010  vacant  account
46    7     51581      19       80
11   20     52372      15      123
0    10     51535      14      130
15   24     54909      13       98
6    16     51954      13      156


<IPython.core.display.Javascript object>

## Merging on indexes

In [9]:
# Load the movies and ratings tables from their pickle files
movies, ratings = (
    pd.read_pickle("data/movies.p"),
    pd.read_pickle("data/ratings.p"),
)

<IPython.core.display.Javascript object>

In [10]:
# Move the "id" column into the index of each table so that they can be
# joined on their indexes. Running this cell a second time fails, because
# "id" is then no longer a column.
movies = movies.set_index(["id"])
ratings = ratings.set_index(["id"])

<IPython.core.display.Javascript object>

In [11]:
# Join the ratings table onto the movies table. "id" here names the index
# that both frames were given earlier in the notebook; merge accepts index
# level names in `on`.
movies_ratings = movies.merge(ratings, on="id")

# Show the first rows of the joined table
print(movies_ratings.head())

                      title  popularity release_date  vote_average  vote_count
id                                                                            
257            Oliver Twist   20.415572   2005-09-23           6.7       274.0
14290  Better Luck Tomorrow    3.877036   2002-01-12           6.5        27.0
38365             Grown Ups   38.864027   2010-06-24           6.0      1705.0
9672               Infamous    3.680896   2006-11-16           6.4        60.0
12819       Alpha and Omega   12.300789   2010-09-17           5.3       124.0


<IPython.core.display.Javascript object>

In [12]:
# Load the sequels and financials tables from their pickle files
sequels, financials = (
    pd.read_pickle("data/sequels.p"),
    pd.read_pickle("data/financials.p"),
)

<IPython.core.display.Javascript object>

In [13]:
# Move the "id" column into the index of each table so that they can be
# joined on their indexes. Running this cell a second time fails, because
# "id" is then no longer a column.
sequels = sequels.set_index(["id"])
financials = financials.set_index(["id"])

<IPython.core.display.Javascript object>

In [14]:
# Left-join financials onto sequels by "id" (the index name of both frames),
# then drop every row that contains any missing value. The result is bound to
# a new frame rather than mutated in place, so the step is a single expression
# with clear lineage.
sequels_fin = sequels.merge(financials, on="id", how="left").dropna()

# Self-join: match each row's "sequel" value (left side) against the "id"
# index of the same table (right side). The right-hand key is given only via
# right_index=True; also passing right_on would specify the right key twice
# in contradictory ways. Columns from the left copy get "_org" and columns
# from the right copy get "_seq".
orig_seq = sequels_fin.merge(
    sequels_fin,
    how="inner",
    left_on="sequel",
    right_index=True,
    suffixes=("_org", "_seq"),
)

# Revenue difference between the sequel and the original (sequel minus original)
orig_seq["diff"] = orig_seq["revenue_seq"] - orig_seq["revenue_org"]

# Keep only the two titles and the revenue difference
titles_diff = orig_seq[["title_org", "title_seq", "diff"]]

# Show the rows with the largest revenue difference first
print(titles_diff.sort_values(by="diff", ascending=False).head())

                                         title_org  \
id                                                   
1771            Captain America: The First Avenger   
816    Austin Powers: International Man of Mystery   
87            Indiana Jones and the Temple of Doom   
862                                      Toy Story   
36657                                        X-Men   

                                   title_seq         diff  
id                                                         
1771     Captain America: The Winter Soldier  344196798.0  
816    Austin Powers: The Spy Who Shagged Me  243256097.0  
87        Indiana Jones and the Last Crusade  141171806.0  
862                              Toy Story 2  123812836.0  
36657                                     X2  111372022.0  


<IPython.core.display.Javascript object>