In [1]:
import numpy as np
import pandas as pd
print('NumPy version: ',np.__version__)
print('Pandas version:',pd.__version__)

NumPy version:  1.26.4
Pandas version: 2.2.1


In [2]:
# For convenience
def make_df(cols, ind):
    """Quickly make a DataFrame"""
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, ind)

# Example
df = make_df('ABC', range(3))

In [3]:
df

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [7]:
# NumPy concatenate
x = [1, 2, 3]
y = [4, 5, 6]
z = [7, 8, 9]
np.concatenate([x, y, z])

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [8]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=1)

array([[1, 2, 1, 2],
       [3, 4, 3, 4]])

In [9]:
x = [[1, 2],
     [3, 4]]
np.concatenate([x, x], axis=0)

array([[1, 2],
       [3, 4],
       [1, 2],
       [3, 4]])

### Pandas concatenation:
1. `pd.concat()` 

In [10]:
ser1 = pd.Series(['A', 'B', 'C'], index=[1, 2, 3])
ser2 = pd.Series(['D', 'E', 'F'], index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [12]:
df1 = make_df('AB',[1,2])
df2 = make_df('AB',[3,4])
print(df1)
print(df2)
print(pd.concat([df1,df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [16]:
df3 = make_df('CD',[1,2])
print(df3)
print(pd.concat([df1,df3]))
print(pd.concat([df1,df3], axis=1))

    C   D
1  C1  D1
2  C2  D2
     A    B    C    D
1   A1   B1  NaN  NaN
2   A2   B2  NaN  NaN
1  NaN  NaN   C1   D1
2  NaN  NaN   C2   D2
    A   B   C   D
1  A1  B1  C1  D1
2  A2  B2  C2  D2


In [20]:
df4 = make_df('ABC',[1,2])
df5 = make_df('BCD',[3,4])
print(df4)
print(df5)
print(pd.concat([df4,df5]))  #join='outer' - union

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [21]:
print(pd.concat([df4,df5],join='inner'))  # intersection

    B   C
1  B1  C1
2  B2  C2
3  B3  C3
4  B4  C4


## Pandas merge function

`pd.merge()` implements a subset of *relational algebra*.

It implements 3 types of joins: the *one-to-one*, the *many-to-one* and the *many-to-many* joins.

`pd.merge(df1,df2, on='col_name', how='type_of_join', suffixes=['_x','_y'])`

### Example of US States data

Task: rank the US states and territories by their 2010 population density

In [45]:
pop = pd.read_csv("D:/Python/Project/Machine-Learning/Basic Programming/data/state-population.csv")
areas = pd.read_csv('D:/Python/Project/Machine-Learning/Basic Programming/data/state-areas.csv')
abbrevs = pd.read_csv('D:/Python/Project/Machine-Learning/Basic Programming/data/state-abbrevs.csv')

In [46]:
print(pop.head())
print(areas.head())
print(abbrevs.head())

  state/region     ages  year  population
0           AL  under18  2012   1117489.0
1           AL    total  2012   4817528.0
2           AL  under18  2010   1130966.0
3           AL    total  2010   4785570.0
4           AL  under18  2011   1125763.0
        state  area (sq. mi)
0     Alabama          52423
1      Alaska         656425
2     Arizona         114006
3    Arkansas          53182
4  California         163707
        state abbreviation
0     Alabama           AL
1      Alaska           AK
2     Arizona           AZ
3    Arkansas           AR
4  California           CA


In [47]:
state_pop = pd.merge(pop,abbrevs,
                    left_on='state/region', right_on='abbreviation')

# Drop duplicate attribute
state_pop = state_pop.drop('abbreviation',axis=1)
state_pop.head()

Unnamed: 0,state/region,ages,year,population,state
0,AL,under18,2012,1117489.0,Alabama
1,AL,total,2012,4817528.0,Alabama
2,AL,under18,2010,1130966.0,Alabama
3,AL,total,2010,4785570.0,Alabama
4,AL,under18,2011,1125763.0,Alabama


In [48]:
state_pop.isnull().any()  # To check for any mismatches

state/region    False
ages            False
year            False
population      False
state           False
dtype: bool

In [49]:
state_pop.shape

(2448, 5)

In [50]:
pop_area = pd.merge(state_pop, areas, on='state')
pop_area.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
0,AL,under18,2012,1117489.0,Alabama,52423
1,AL,total,2012,4817528.0,Alabama,52423
2,AL,under18,2010,1130966.0,Alabama,52423
3,AL,total,2010,4785570.0,Alabama,52423
4,AL,under18,2011,1125763.0,Alabama,52423


In [51]:
# Data for 2010
data_2010 = pop_area[(pop_area['year']==2010) & (pop_area['ages']=='total')]
data_2010.head()

Unnamed: 0,state/region,ages,year,population,state,area (sq. mi)
3,AL,total,2010,4785570.0,Alabama,52423
91,AK,total,2010,713868.0,Alaska,656425
101,AZ,total,2010,6408790.0,Arizona,114006
189,AR,total,2010,2922280.0,Arkansas,53182
197,CA,total,2010,37333601.0,California,163707


In [55]:
data_2010.shape

(51, 7)

In [58]:
# Calculating population density
data_2010.set_index('state',inplace=True)
data_2010.head()

Unnamed: 0_level_0,state/region,ages,year,population,area (sq. mi),density
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Alabama,AL,total,2010,4785570.0,52423,91.287603
Alaska,AK,total,2010,713868.0,656425,1.087509
Arizona,AZ,total,2010,6408790.0,114006,56.214497
Arkansas,AR,total,2010,2922280.0,53182,54.948667
California,CA,total,2010,37333601.0,163707,228.051342


In [59]:
density = data_2010['population'] / data_2010['area (sq. mi)']
density.sort_values(ascending=False, inplace=True)
density.head()

state
District of Columbia    8898.897059
New Jersey              1009.253268
Rhode Island             681.339159
Connecticut              645.600649
Massachusetts            621.815538
dtype: float64