# Tidy Data

In [4]:
from pathlib import Path

import pandas as pd

In [None]:
# Tidy data is a framework to structure data sets so that they can be easily analyzed

# Each row is an observation 
# Each column is a variable 
# Each type of observational unit forms a table 

# Concatenation 

In [None]:
# concatenation can be thought of as appending rows or columns
# this approach is useful if your data was split into parts, or if
# you want to append the result of a calculation to your existing dataset

In [5]:
# Directory that holds the example CSV files for the concatenation section.
# The location is written once here so it can be changed in a single place
# when the notebook is run on another machine.
DATA_DIR = Path(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\c4")

# Three small example tables used throughout the concatenation examples.
data1 = pd.read_csv(DATA_DIR / "concat1.csv")
data2 = pd.read_csv(DATA_DIR / "concat2.csv")
data3 = pd.read_csv(DATA_DIR / "concat3.csv")

In [None]:
# Quick look at data1 before anything is concatenated.
print(data1)

In [None]:
# Row-wise concatenation simply stacks the dataframes on top of each other.
# All the dataframes to be combined are passed together in one list.
row_concat = pd.concat(objs=[data1, data2, data3], axis=0)
print(row_concat)
# Notice that the row names (row indices) are stacked as well, unchanged.

In [None]:
row_concat.iloc[3] # iloc is position-based: this returns the row at position 3 (the 4th row), whatever its index label is

In [None]:
row_concat.iloc[7] # iloc is position-based: this returns the row at position 7 (the 8th row), not a row with index label 7

In [None]:
row_concat.loc[1] # loc is label-based: this returns every row whose index label is 1

# Create a new series to append to a dataframe

In [None]:
# A standalone Series of five string values; it has a default integer index
# and no column labels.
new_row_series = pd.Series(
    data=["new1a", "new2b", "new3c", "new4d", "new5e"],
)
print(new_row_series)

In [None]:
# we now attempt to add the data series created above to data1

In [None]:
# Concatenating a DataFrame with a bare Series does NOT add one new row.
x = pd.concat(objs=[data1, new_row_series], axis=0)
print(x)

# The result gains a new column instead,
# and each item of new_row_series ends up on its own row.

In [None]:
# To fix the problem above, the values go into a DataFrame rather than a Series.
# The DataFrame holds a single row of items plus the column names to bind the items to.

new_rows_dataframe = pd.DataFrame(
    data=[["new1a", "new2b", "new3c", "new4d", "new5e"]],
    columns=["A", "B", "C", "D", "E"],
)
print(new_rows_dataframe)

In [None]:
pd.concat([data2, new_rows_dataframe]) # stacks the one-row new_rows_dataframe underneath data2, matching values by column name

# Append

In [None]:
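# concat is a general function that can concatenate multiple things at once
# if you need to append a single object to an existing dataframe, the DataFrame.append method was the shortcut for that task
# note: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0; on current pandas use pd.concat instead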

In [None]:
# Display data1 for reference.
data1

In [None]:
# Display data2 for reference.
data2

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat is used here; it gives the same result on old and new pandas.
# The rows of data2 are added underneath the rows of data1, and each
# dataframe keeps its own index labels.
to_append = pd.concat([data1, data2])
print(to_append)

In [None]:
# DataFrame.append no longer exists in pandas 2.0+, so pd.concat is used instead.
# The single row held in new_rows_dataframe is added underneath the rows of data2.
to_append_new = pd.concat([data2, new_rows_dataframe])
print(to_append_new)

In [None]:
# Adding one row described by a python dictionary (column name -> value).

data_dict = { "A" : "new1a" ,
              "B" : "new2b" ,
              "C" : "new3c" ,
              "D" : "new4d"
            }

# DataFrame.append (removed in pandas 2.0) used to accept a dict directly.
# The supported equivalent is to wrap the dict in a one-row DataFrame and
# concatenate it; ignore_index=True renumbers the rows of the result.
to_append_using_dict = pd.concat([data1, pd.DataFrame([data_dict])], ignore_index = True)
print(to_append_using_dict)

# Notice that the new row receives the next running index label
# (the index grows by 1) rather than a repeated label.

# Ignoring the index

In [None]:
# ignore_index=True drops the original row labels and renumbers the stacked rows.
data_ignore_index = pd.concat(
    objs=[data1, data2, data3],
    axis=0,
    ignore_index=True,
)
print(data_ignore_index)

# Notice that the index labels no longer repeat; they form one running sequence from 0 to 12.

# Adding Columns 

In [None]:
# axis defaults to 0, which concatenates the data in a row-wise fashion.
# With axis 1 (spelled "columns" below) the data is concatenated column-wise instead.

data_col_concat = pd.concat(objs=[data1, data2, data3], axis="columns")
print(data_col_concat)

In [None]:
# Selecting by column name retrieves every column that carries that name,
# so after a column-wise concat a single name can return several columns. For example:

print(data_col_concat["A"])

In [None]:
# Adding a new column needs no dedicated pandas function: assign a list to a new column name.
# NOTE(review): this modifies data_col_concat in place.

data_col_concat["new_columns"] = ["new1", "new2", "new3", "new4", "new5"]
print(data_col_concat)

# Note that the list must hold the same number of data points as the dataframe has rows.
# The commented-out line below supplies only four values and is an example of an assignment that does not fit:
# data_col_concat["new_columns1"] = ["new1", "new2", "new3", "new4"]

# Concatenation with different indices
Concatenate rows with different columns

In [None]:
# the above examples assume that we are performing simple row and column concatenation where:
## new rows have the same column names, or
## new columns have the same row indices

In [6]:
# Relabel the columns so that the three tables only partly share column names.
# Each list must have exactly one name per existing column of that dataframe.
# NOTE(review): this mutates data1/data2/data3 in place, so earlier cells give
# different results if they are re-run after this one.
data1.columns = ["A", "B", "C", "D"]
data2.columns = ["E", "F", "G", "H", "I"]
data3.columns = ["A", "C", "F", "H"]

In [7]:
# join="outer" is the default of pd.concat; it is written out here to make the behaviour explicit.
new_table = pd.concat(objs=[data1, data2, data3], axis=0, join="outer")
print(new_table)

# Row items across the data files that share a column name are grouped together under that column name in the new dataframe object.
# The concat function has a parameter called join, whose default value is "outer": every column is kept and missing cells become NaN.

A    B    C    D    E    F    G    H    I
0   a0   b0   c0   d0  NaN  NaN  NaN  NaN  NaN
1   a1   b1   c1   d1  NaN  NaN  NaN  NaN  NaN
2   a2   b2   c2   d2  NaN  NaN  NaN  NaN  NaN
3   a3   b3   c3   d3  NaN  NaN  NaN  NaN  NaN
0  NaN  NaN  NaN  NaN   a4   b4   c4   d4   e4
1  NaN  NaN  NaN  NaN   a5   b5   c5   d5   e5
2  NaN  NaN  NaN  NaN   a6   b6   c6   d6  NaN
3  NaN  NaN  NaN  NaN   a7   b7   c7   d7  NaN
0   a8  NaN   b8  NaN  NaN   c8  NaN   d8  NaN
1   a9  NaN   b9  NaN  NaN   c9  NaN   d9  NaN
2  a10  NaN  b10  NaN  NaN  c10  NaN  d10  NaN
3  a11  NaN  b11  NaN  NaN  c11  NaN  d11  NaN
4  a12  NaN  b12  NaN  NaN  c12  NaN  d12  NaN


In [8]:
# To keep only the columns that are shared among the data sets, use join="inner".

table_with_common_col = pd.concat(
    objs=[data1, data2, data3],
    axis=0,
    join="inner",
)
print(table_with_common_col)

# The inner join looks for column names that are common to data1, data2 and data3.
# If no column is common to all of them -- even when data1 and data3 share some -- the result is an empty dataframe.

Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 4]


In [9]:
# Restricting the inner join to data1 and data3 keeps the columns those two share;
# ignore_index=True renumbers the rows of the result.
table_with_common_col_revised = pd.concat(
    objs=[data1, data3],
    join="inner",
    ignore_index=True,
)
print(table_with_common_col_revised)

A    C
0   a0   c0
1   a1   c1
2   a2   c2
3   a3   c3
4   a8   b8
5   a9   b9
6  a10  b10
7  a11  b11
8  a12  b12


# Concatenation with different indices
Concatenate columns with different rows

In [13]:
# Give the three tables row labels that only partly overlap.
# Each list must have exactly one label per existing row of that dataframe.
# NOTE(review): this mutates data1/data2/data3 in place, so earlier cells give
# different results if they are re-run after this one.
data1.index = [0,1,2,3]
data2.index = [4,5,6,7]
data3.index = [0,2,4,6,8]

In [14]:
# Concatenating along axis 1 ("columns") adds the dataframes side by side,
# matching their rows against their respective row indices.
table = pd.concat(objs=[data1, data2, data3], axis="columns")

print(table)

A    B    C    D    E    F    G    H    I    A    C    F    H
0   a0   b0   c0   d0  NaN  NaN  NaN  NaN  NaN   a8   b8   c8   d8
1   a1   b1   c1   d1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
2   a2   b2   c2   d2  NaN  NaN  NaN  NaN  NaN   a9   b9   c9   d9
3   a3   b3   c3   d3  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN
4  NaN  NaN  NaN  NaN   a4   b4   c4   d4   e4  a10  b10  c10  d10
5  NaN  NaN  NaN  NaN   a5   b5   c5   d5   e5  NaN  NaN  NaN  NaN
6  NaN  NaN  NaN  NaN   a6   b6   c6   d6  NaN  a11  b11  c11  d11
7  NaN  NaN  NaN  NaN   a7   b7   c7   d7  NaN  NaN  NaN  NaN  NaN
8  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  a12  b12  c12  d12


In [15]:
# To keep only the row indices that are shared among the data files, use join="inner".

table_with_common_rol = pd.concat(
    objs=[data1, data2, data3],
    axis="columns",
    join="inner",
)
print(table_with_common_rol)

# The inner join looks for row labels that are common to data1, data2 and data3.
# If no row label is common to all of them -- even when data1 and data3 share some -- the result is an empty dataframe.

Empty DataFrame
Columns: [A, B, C, D, E, F, G, H, I, A, C, F, H]
Index: []


In [16]:
# Restricting the inner join to data1 and data3 keeps only the row labels those two share.
table_with_common_rol_revised = pd.concat(
    objs=[data1, data3],
    axis="columns",
    join="inner",
)
print(table_with_common_rol_revised)

A   B   C   D   A   C   F   H
0  a0  b0  c0  d0  a8  b8  c8  d8
2  a2  b2  c2  d2  a9  b9  c9  d9


# Merging Multiple Data Sets

In [None]:
# pandas DataFrames have a .join method that uses merge under the hood; join merges dataframe objects based on their index, whereas merge is much more explicit and flexible
# note that merge is available both as a DataFrame method (df.merge) and as a top-level function (pd.merge)

In [22]:
# Directory that holds the survey CSV files for the merge section.
# The location is written once here so it can be changed in a single place
# when the notebook is run on another machine.
SURVEY_DIR = Path(r"C:\Users\tanzh\Documents\Python\Pandas for Everyone\datasets\c4")

# The four survey tables used by the merge examples.
person = pd.read_csv(SURVEY_DIR / "survey_person.csv")
site = pd.read_csv(SURVEY_DIR / "survey_site.csv")
survey = pd.read_csv(SURVEY_DIR / "survey_survey.csv")
visited = pd.read_csv(SURVEY_DIR / "survey_visited.csv")

# One-to-One Merge 

In [None]:
# in the simplest type of merge, we have two dataframes where we want to join one column to another column, and where the columns we want to join on do not contain any duplicate values
# the number of rows in the two dataframes is the same

In [23]:
# Extract the rows with index labels 0, 2 and 6 (all columns) into visited_subset.
visited_subset = visited.loc[[0, 2, 6], :]
print(visited_subset)

ident   site       dated
0    619   DR-1  1927-02-08
2    734   DR-3  1939-01-07
6    837  MSK-4  1932-01-14


In [27]:
# Display the site table for reference.
site

Unnamed: 0,name,lat,long
0,DR-1,-49.85,-128.57
1,DR-3,-47.15,-126.72
2,MSK-4,-48.87,-123.4


In [None]:
# A one-to-one merge.
#
# In the merge method:
#   1. the dataframe whose .merge is called is the one on the 'left' (here: site)
#   2. the dataframe passed in is the one on the 'right' (here: visited_subset)
#
# How rows are matched:
#   the key column is called 'name' in site and 'site' in visited_subset,
#   so the two dataframes do not share a key column name and we have to
#   define both left_on and right_on.
#   how="inner" is the default and is written out only for clarity.

o2o_merge_table = site.merge(
    right=visited_subset,
    how="inner",
    left_on="name",
    right_on="site",
)
print(o2o_merge_table)

# Many-to-One Merge

In [None]:
# in this kind of merge, one of the dataframes has key values that repeat
# the rows of the dataframe that contains the single observations will then be duplicated in the merged result

In [28]:
# Merge site (left) with the full visited table (right) on site.name == visited.site.
# how="inner" is the default and is written out only for clarity.
m2o_merge_table = site.merge(
    right=visited,
    how="inner",
    left_on="name",
    right_on="site",
)
print(m2o_merge_table)

name    lat    long  ident   site       dated
0   DR-1 -49.85 -128.57    619   DR-1  1927-02-08
1   DR-1 -49.85 -128.57    622   DR-1  1927-02-10
2   DR-1 -49.85 -128.57    844   DR-1  1932-03-22
3   DR-3 -47.15 -126.72    734   DR-3  1939-01-07
4   DR-3 -47.15 -126.72    735   DR-3  1930-01-12
5   DR-3 -47.15 -126.72    751   DR-3  1930-02-26
6   DR-3 -47.15 -126.72    752   DR-3         NaN
7  MSK-4 -48.87 -123.40    837  MSK-4  1932-01-14


# Many-to-many Merge 

In [None]:
# a many-to-many merge arises when the key values repeat in both dataframes; below we also perform a match based on multiple columns
# suppose we have the following dataframes: person, survey, visited

In [31]:
# Attach survey readings to each person: person.ident is matched against survey.person.
# how="inner" is the default and is written out only for clarity.
ps = person.merge(
    right=survey,
    how="inner",
    left_on="ident",
    right_on="person",
)
ps

Unnamed: 0,ident,personal,family,taken,person,quant,reading
0,dyer,William,Dyer,619,dyer,rad,9.82
1,dyer,William,Dyer,619,dyer,sal,0.13
2,dyer,William,Dyer,622,dyer,rad,7.8
3,dyer,William,Dyer,622,dyer,sal,0.09
4,pb,Frank,Pabodie,734,pb,rad,8.41
5,pb,Frank,Pabodie,734,pb,temp,-21.5
6,pb,Frank,Pabodie,735,pb,rad,7.22
7,pb,Frank,Pabodie,751,pb,rad,4.35
8,pb,Frank,Pabodie,751,pb,temp,-18.5
9,lake,Anderson,Lake,734,lake,sal,0.05


In [30]:
# Attach survey readings to each visit: visited.ident is matched against survey.taken.
# how="inner" is the default and is written out only for clarity.
vs = visited.merge(
    right=survey,
    how="inner",
    left_on="ident",
    right_on="taken",
)
print(vs)

ident   site       dated  taken person quant  reading
0     619   DR-1  1927-02-08    619   dyer   rad     9.82
1     619   DR-1  1927-02-08    619   dyer   sal     0.13
2     622   DR-1  1927-02-10    622   dyer   rad     7.80
3     622   DR-1  1927-02-10    622   dyer   sal     0.09
4     734   DR-3  1939-01-07    734     pb   rad     8.41
5     734   DR-3  1939-01-07    734   lake   sal     0.05
6     734   DR-3  1939-01-07    734     pb  temp   -21.50
7     735   DR-3  1930-01-12    735     pb   rad     7.22
8     735   DR-3  1930-01-12    735    NaN   sal     0.06
9     735   DR-3  1930-01-12    735    NaN  temp   -26.00
10    751   DR-3  1930-02-26    751     pb   rad     4.35
11    751   DR-3  1930-02-26    751     pb  temp   -18.50
12    751   DR-3  1930-02-26    751   lake   sal     0.10
13    752   DR-3         NaN    752   lake   rad     2.19
14    752   DR-3         NaN    752   lake   sal     0.09
15    752   DR-3         NaN    752   lake  temp   -16.00
16    752   DR-3  

In [None]:
# Match ps against vs on four columns at once: the i-th name in left_on
# is paired with the i-th name in right_on.
# how="inner" and suffixes=("_x", "_y") are the defaults, written out for clarity.
ps_vs = ps.merge(
    right=vs,
    how="inner",
    left_on=["ident", "taken", "quant", "reading"],
    right_on=["person", "ident", "quant", "reading"],
    suffixes=("_x", "_y"),
)

print(ps_vs)

# pandas automatically adds a suffix to a column name when there is a collision in the names
# in the output, the _x suffix refers to values from the left dataframe and the _y suffix refers to values from the right dataframe