In [1]:
import pandas as pd
import numpy as np

#### Let's start by creating some data frames to work with.

How do you write the following df? 

List of cities: "Frankfurt","Munich","Hamburg", "Cologne", "Berlin", "Dusseldorf", "Leipzig","Stuttgart","Bonn"

Second column (word length)

In [2]:
cities= ["Frankfurt","Munich","Hamburg",\
         "Cologne","Berlin","Dusseldorf",\
         "Leipzig","Stuttgart","Bonn"]


In [3]:
city_name_length = pd.DataFrame(cities,columns=["Cities"])
city_name_length["Length"]=city_name_length["Cities"].str.len()
city_name_length

Unnamed: 0,Cities,Length
0,Frankfurt,9
1,Munich,6
2,Hamburg,7
3,Cologne,7
4,Berlin,6
5,Dusseldorf,10
6,Leipzig,7
7,Stuttgart,9
8,Bonn,4


#### This time a df of even and odd numbers from 0 to 12 (6 rows)

Try np.arange and boolean selection.

In [4]:
np.arange(11)[np.arange(11)%2!=0]

array([1, 3, 5, 7, 9])

In [5]:
df_even = pd.DataFrame({'even':np.arange(11)[np.arange(11)%2==0],
       'odd':np.arange(12)[np.arange(12)%2!=0]})


In [7]:
df_even

Unnamed: 0,even,odd
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11


In [8]:
city_name_length

Unnamed: 0,Cities,Length
0,Frankfurt,9
1,Munich,6
2,Hamburg,7
3,Cologne,7
4,Berlin,6
5,Dusseldorf,10
6,Leipzig,7
7,Stuttgart,9
8,Bonn,4


### Join, Merge, Concatanate

These are all methods to combine data frames together (append also exists, although it's simpler).

- Merge has its default settings at an inner join, only the matching rows will be taken from both data frames.

- Concatanate has its default settings at an outer join. If you have a lot of data it might not be ideal. But it ensures not losing data. It doesn't know about the content.

- Append is a special case of concat for axis=0 and how="outer".

- Join is similar to merge but is only doing a left join.

In [9]:
# Inner join city_name_length and df_even using 
# merge comparing the 'Length' and 'odd' columns.
df_even_odd = city_name_length.merge(df_even,left_on="Length",\
                                    right_on="odd", how="inner")
df_even_odd
#df_even_odd["odd"].value_counts()

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,8,9
1,Stuttgart,9,8,9
2,Hamburg,7,6,7
3,Cologne,7,6,7
4,Leipzig,7,6,7


In [105]:
# try it with pd.DataFrame.merge
pd.DataFrame.merge(city_name_length, df_even,left_on="Length",\
                                     right_on="odd")

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,8,9
1,Stuttgart,9,8,9
2,Hamburg,7,6,7
3,Cologne,7,6,7
4,Leipzig,7,6,7


#### Or merging on index

In [106]:
# Default mode on both indices.
pd.DataFrame.merge(city_name_length,df_even,left_index=True,\
                   right_index=True)

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,0,1
1,Munich,6,2,3
2,Hamburg,7,4,5
3,Cologne,7,6,7
4,Berlin,6,8,9
5,Dusseldorf,10,10,11


So the following command searches for the left indices which are up to 8 in right_key (odd = 1,3,5,7,9,11), which should give 1, 3, 5, 7.

In [107]:
pd.DataFrame.merge(city_name_length,df_even, \
                   right_on="odd", \
                   left_index=True)

Unnamed: 0,Cities,Length,even,odd
0,Munich,6,0,1
1,Cologne,7,2,3
2,Dusseldorf,10,4,5
3,Stuttgart,9,6,7


While this one searches for the right indices (which are up to 5) in left_key (length). A single value has this. The index of the resulting dataframe is that of the left df. 

In [108]:
pd.DataFrame.merge(city_name_length,df_even, left_on='Length',\
                   right_index=True)

Unnamed: 0,Cities,Length,even,odd
8,Bonn,4,8,9


#### Doing some of these with join

It works for index based matching using the column values. 

In [109]:
# simple inner join
city_name_length.join(df_even)

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,0.0,1.0
1,Munich,6,2.0,3.0
2,Hamburg,7,4.0,5.0
3,Cologne,7,6.0,7.0
4,Berlin,6,8.0,9.0
5,Dusseldorf,10,10.0,11.0
6,Leipzig,7,,
7,Stuttgart,9,,
8,Bonn,4,,


In [110]:
#outer join
city_name_length.join(df_even,how="inner")

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,0,1
1,Munich,6,2,3
2,Hamburg,7,4,5
3,Cologne,7,6,7
4,Berlin,6,8,9
5,Dusseldorf,10,10,11


Extend the dataframe df_even with an extra row (12,13). 

In [111]:
extra_row = pd.DataFrame({"even":[12],"odd":[13]})
df_even_extended = df_even.append(extra_row)

df_even_extended


Unnamed: 0,even,odd
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
5,10,11
0,12,13


In [112]:
#right join
city_name_length.join(df_even_extended,how="left")

Unnamed: 0,Cities,Length,even,odd
0,Frankfurt,9,0.0,1.0
0,Frankfurt,9,12.0,13.0
1,Munich,6,2.0,3.0
2,Hamburg,7,4.0,5.0
3,Cologne,7,6.0,7.0
4,Berlin,6,8.0,9.0
5,Dusseldorf,10,10.0,11.0
6,Leipzig,7,,
7,Stuttgart,9,,
8,Bonn,4,,


Why did we end up with two Frankfurts since we just appended two rows to the end? We didn't need to reset the index when we appended.

In [113]:
# Check what would have happened if we didn't reset the index.
# df_even_extended.reset_index(inplace=True)
# You can also append with ignore_index=True

### CONCAT

Concat is similar to merge. It's maybe more natural to use with both axis but it's also different in the sense that it won't try to match values. It will rather extend an existing dataframe with another one. It is like an outer join ob both axes.


In [114]:
city_name_length_corr = city_name_length.rename(columns={"Length":"odd"})
city_name_length_corr

Unnamed: 0,Cities,odd
0,Frankfurt,9
1,Munich,6
2,Hamburg,7
3,Cologne,7
4,Berlin,6
5,Dusseldorf,10
6,Leipzig,7
7,Stuttgart,9
8,Bonn,4


In [115]:
# Create a new df where the column name "Length" is changed to "odd"
city_name_length_corr = city_name_length.rename(columns={"Length":"odd"})

# No common columns
df_even_concat_1 = pd.concat([city_name_length,df_even],sort=False)

# Common column
df_even_concat_2 = pd.concat([city_name_length_corr,df_even],sort=False)


In [116]:
print(df_even_concat_1,'\n\n',df_even_concat_2)

       Cities  Length  even   odd
0   Frankfurt     9.0   NaN   NaN
1      Munich     6.0   NaN   NaN
2     Hamburg     7.0   NaN   NaN
3     Cologne     7.0   NaN   NaN
4      Berlin     6.0   NaN   NaN
5  Dusseldorf    10.0   NaN   NaN
6     Leipzig     7.0   NaN   NaN
7   Stuttgart     9.0   NaN   NaN
8        Bonn     4.0   NaN   NaN
0         NaN     NaN   0.0   1.0
1         NaN     NaN   2.0   3.0
2         NaN     NaN   4.0   5.0
3         NaN     NaN   6.0   7.0
4         NaN     NaN   8.0   9.0
5         NaN     NaN  10.0  11.0 

        Cities  odd  even
0   Frankfurt    9   NaN
1      Munich    6   NaN
2     Hamburg    7   NaN
3     Cologne    7   NaN
4      Berlin    6   NaN
5  Dusseldorf   10   NaN
6     Leipzig    7   NaN
7   Stuttgart    9   NaN
8        Bonn    4   NaN
0         NaN    1   0.0
1         NaN    3   2.0
2         NaN    5   4.0
3         NaN    7   6.0
4         NaN    9   8.0
5         NaN   11  10.0


Concat is different than merge. It just added them based on the index here instead of the values.

#### Concatanate along the axis 1

In [117]:
df_even_concat_3 = pd.concat([city_name_length,df_even],axis=1)
print(df_even_concat_3)

       Cities  Length  even   odd
0   Frankfurt       9   0.0   1.0
1      Munich       6   2.0   3.0
2     Hamburg       7   4.0   5.0
3     Cologne       7   6.0   7.0
4      Berlin       6   8.0   9.0
5  Dusseldorf      10  10.0  11.0
6     Leipzig       7   NaN   NaN
7   Stuttgart       9   NaN   NaN
8        Bonn       4   NaN   NaN


### Iterrows

print the second row of df_even_concat_2

In [118]:
for row_index, row in df_even_concat_2.iterrows():
   if row_index==2:
    print(row_index,row)

2 Cities    Hamburg
odd             7
even          NaN
Name: 2, dtype: object
2 Cities    NaN
odd         5
even        4
Name: 2, dtype: object


### Dealing with missing values

Let's create a random number dataframe.

    1. normal with mean=250, st.dev=150 and length of column df_even_concat_3

In [119]:
np.random.seed(72)
random_numbers = np.random.normal(250,150,df_even_concat_3.shape[0])
random_numbers

array([ 2.97781825e+02,  1.48066085e+02, -8.12582704e+01,  3.38536024e+02,
        5.01214907e+02,  1.06604028e-01,  1.26349189e+02,  3.31789051e+02,
        2.50177929e+02])

2. Create the data frame and convert the variables to integers.

In [120]:
random_df = pd.DataFrame({"random":random_numbers})

In [121]:
random_df["random"] = random_df.random.astype('int64')

3. Create the column even with booleans for even.

In [122]:
random_df["even"]=random_df[random_df["random"]%2==0]
random_df["even"]

0      NaN
1    148.0
2      NaN
3    338.0
4      NaN
5      0.0
6    126.0
7      NaN
8    250.0
Name: even, dtype: float64

4. Copy the data frame to start working on a clean copy.

In [123]:
random_copy = random_df.copy()
random_copy.head(3)

Unnamed: 0,random,even
0,297,
1,148,148.0
2,-81,


#### Select rows with random < 150 and replace with na's

In [124]:
random_copy.replace(random_copy[random_copy["random"] < 150],np.NaN,inplace=True)

In [125]:
print(random_copy)

   random   even
0   297.0    NaN
1     NaN    NaN
2     NaN    NaN
3   338.0  338.0
4   501.0    NaN
5     NaN    NaN
6     NaN    NaN
7   331.0    NaN
8   250.0  250.0


Why did it not change? 
- don't forget to use inplace=True

#### Drop Na's

In [126]:
random_copy.dropna(how='any')
random_copy=random_copy.append(pd.DataFrame({"random":[12],"even":[13]}))

#random_copy.replace({12:np.NaN,13:np.NaN}, inplace=True)
#random_copy

In [127]:
random_copy.dropna(how='any')
random_copy

Unnamed: 0,random,even
0,297.0,
1,,
2,,
3,338.0,338.0
4,501.0,
5,,
6,,
7,331.0,
8,250.0,250.0
0,12.0,13.0


#### Fill NA's with different options when you don't want to lose data and you can make assumptions

1. Broadcasting 150
2. Mean of random
3. Mean of random under 150

**Note:**  You can also fill it with the most frequent value. 

In [128]:
random_copy.fillna(150)

Unnamed: 0,random,even
0,297.0,150.0
1,150.0,150.0
2,150.0,150.0
3,338.0,338.0
4,501.0,150.0
5,150.0,150.0
6,150.0,150.0
7,331.0,150.0
8,250.0,250.0
0,12.0,13.0


In [129]:
random_copy.fillna(random_copy["even"].mean())
random_copy

Unnamed: 0,random,even
0,297.0,
1,,
2,,
3,338.0,338.0
4,501.0,
5,,
6,,
7,331.0,
8,250.0,250.0
0,12.0,13.0


In [130]:
#random_copy.loc[
#random_copy.loc[random_copy["even"]>150,:]
random_copy.fillna(random_copy.loc[random_copy["even"]<150,\
                                   "even"].mean())

Unnamed: 0,random,even
0,297.0,13.0
1,13.0,13.0
2,13.0,13.0
3,338.0,338.0
4,501.0,13.0
5,13.0,13.0
6,13.0,13.0
7,331.0,13.0
8,250.0,250.0
0,12.0,13.0


#### REPLACE

Above, we already used replace, it can be used both for the dataframe and the series.

In [131]:
random_copy.replace(random_copy[random_copy["random"] < 150],\
                              800)

Unnamed: 0,random,even
0,297.0,
1,,
2,,
3,338.0,338.0
4,501.0,
5,,
6,,
7,331.0,
8,250.0,250.0
0,800.0,800.0


In [132]:
random_copy["random"].replace(random_copy\
                              [random_copy["random"] < 150],\
                              300)

0    297.0
1      NaN
2      NaN
3    338.0
4    501.0
5      NaN
6      NaN
7    331.0
8    250.0
0     12.0
Name: random, dtype: float64

In [133]:
x=list(random_copy.loc[random_copy["random"]<150,\
                       "random"])
y=list(random_copy.loc[random_copy["random"]>=150,\
                         "random"])


print(x,y)
print(random_copy)
print(random_copy.replace({"random":x,"even":y},300))

[12.0] [297.0, 338.0, 501.0, 331.0, 250.0]
   random   even
0   297.0    NaN
1     NaN    NaN
2     NaN    NaN
3   338.0  338.0
4   501.0    NaN
5     NaN    NaN
6     NaN    NaN
7   331.0    NaN
8   250.0  250.0
0    12.0   13.0
   random   even
0   297.0    NaN
1     NaN    NaN
2     NaN    NaN
3   338.0  300.0
4   501.0    NaN
5     NaN    NaN
6     NaN    NaN
7   331.0    NaN
8   250.0  300.0
0   300.0   13.0
