# Pandas Tips & Review
### As a data scientist, you will be utilizing the pandas library in various ways to solve some of the world's most pressing issues

  
### Here are some useful pandas methods to manipulate dataframes to do and show us whatever we want!

![Alt Text](https://media.giphy.com/media/FYnYcQoFmlPeU/giphy.gif)

In [1]:
import pandas as pd
import numpy as np

#### Let's use the merge function from pandas to combine two dataframes together

In [2]:
df1 = pd.read_csv('heart.csv')
df1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [28]:
df2 = pd.DataFrame(np.random.randint(0,100,size=(303,3)), columns=['B','C','D'])
# 1025 rows and 3 columns
# Names of columns: A, B, C, D
df2

Unnamed: 0,B,C,D
0,81,5,51
1,92,44,39
2,7,96,45
3,32,43,53
4,74,82,66
...,...,...,...
298,45,50,55
299,81,45,99
300,47,17,13
301,25,3,94


In [20]:
df2.head()

Unnamed: 0,B,C,D
0,3,78,33
1,71,76,65
2,66,18,26
3,16,54,46
4,30,47,0


In [21]:
df1.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [30]:
pd.merge(df1, df2, on=df1.index) # Merging by using the index

Unnamed: 0,key_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,B,C,D
0,0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,81,5,51
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,92,44,39
2,2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,7,96,45
3,3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,32,43,53
4,4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,74,82,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,45,50,55
299,299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,81,45,99
300,300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,47,17,13
301,301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,25,3,94


Answer Below

In [32]:
df3 = pd.merge(df1,df2, on=df1.index)

In [33]:
df3

Unnamed: 0,key_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,B,C,D
0,0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,81,5,51
1,1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,92,44,39
2,2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,7,96,45
3,3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,32,43,53
4,4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,74,82,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,45,50,55
299,299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,81,45,99
300,300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,47,17,13
301,301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,25,3,94


#### What if we wanted to simply add on a few relevant columns with data to our dataframe? (Hint: Use Concat)

In [34]:
lab_df = pd.DataFrame(np.random.randint(0,100,size=(303,3)), columns=['Chemical_A','Chemical_B','Chemical_C'])

In [35]:
lab_df

Unnamed: 0,Chemical_A,Chemical_B,Chemical_C
0,60,96,43
1,82,30,44
2,28,84,98
3,87,71,81
4,93,80,78
...,...,...,...
298,25,75,59
299,77,70,7
300,82,4,83
301,21,72,10


In [43]:
# concat lab_df with df2
pd.concat([df2, lab_df], axis=1)

Unnamed: 0,B,C,D,Chemical_A,Chemical_B,Chemical_C
0,81,5,51,60,96,43
1,92,44,39,82,30,44
2,7,96,45,28,84,98
3,32,43,53,87,71,81
4,74,82,66,93,80,78
...,...,...,...,...,...,...
298,45,50,55,25,75,59
299,81,45,99,77,70,7
300,47,17,13,82,4,83
301,25,3,94,21,72,10


Answer Below

In [44]:
concated_df = pd.concat([df2,lab_df],axis=1)

In [45]:
concated_df

Unnamed: 0,B,C,D,Chemical_A,Chemical_B,Chemical_C
0,81,5,51,60,96,43
1,92,44,39,82,30,44
2,7,96,45,28,84,98
3,32,43,53,87,71,81
4,74,82,66,93,80,78
...,...,...,...,...,...,...
298,45,50,55,25,75,59
299,81,45,99,77,70,7
300,47,17,13,82,4,83
301,25,3,94,21,72,10


### So what exactly does the join method do? Use the join method utilizing the 'left join'

In [58]:
# Join df1 and df2 using .join
df1.join(df2, how="right")

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,B,C,D
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,81,5,51
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,92,44,39
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,7,96,45
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,32,43,53
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,74,82,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,45,50,55
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,81,45,99
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,47,17,13
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,25,3,94


In [50]:
# Using the .join() method, join df1 to df2 on the left side
df2.join(df1)

Unnamed: 0,B,C,D,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,81,5,51,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,92,44,39,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,7,96,45,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,32,43,53,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,74,82,66,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,50,55,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,81,45,99,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,47,17,13,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,25,3,94,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


Answer Below

In [59]:
joined_df = df1.join(df2, on=df1.index,how='left')

In [60]:
joined_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,B,C,D
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,81,5,51
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,92,44,39
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,7,96,45
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,32,43,53
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,74,82,66
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0,45,50,55
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0,81,45,99
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0,47,17,13
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0,25,3,94


## Now let's talk about .loc vs .iloc 

The Pandas loc indexer can be used with DataFrames for two different use cases: 

a.) Selecting rows by label/index  
b.) Selecting rows with a boolean / conditional lookup

a.) Selecting rows by label/index

In [68]:
# Select rows with index values '0' through '100', with all columns between 'age' and 'chol'
joined_df.loc[0:100, 'age':'chol']

Unnamed: 0,age,sex,cp,trestbps,chol
0,63,1,3,145,233
1,37,1,2,130,250
2,41,0,1,130,204
3,56,1,1,120,236
4,57,0,0,120,354
...,...,...,...,...,...
96,62,0,0,140,394
97,52,1,0,108,233
98,43,1,2,130,315
99,53,1,2,130,246


>Notice below .loc[300] does not grab row 300, but rather the index of 300

In [78]:
joined_df.shape  # Shows the # rows and cols

(303, 17)

In [77]:
joined_df.loc[300]

age          68.0
sex           1.0
cp            0.0
trestbps    144.0
chol        193.0
fbs           1.0
restecg       1.0
thalach     141.0
exang         0.0
oldpeak       3.4
slope         1.0
ca            2.0
thal          3.0
target        0.0
B            47.0
C            17.0
D            13.0
Name: 300, dtype: float64

b.) Selecting rows with a boolean / conditional lookup

In [79]:
# select rows with indexes between 100 and 200, and just return 'age' and 'cp' columns
joined_df.loc[(joined_df.index > 100) & (joined_df.index <= 200), ['age', 'cp']] 

Unnamed: 0,age,cp
101,59,3
102,63,1
103,42,2
104,50,2
105,68,2
...,...,...
196,46,2
197,67,0
198,62,0
199,65,0


The iloc indexer for Pandas Dataframe is used for integer-location based indexing / selection by position.

##### Use .iloc to grab specific rows

First 4 rows

In [80]:
joined_df.iloc[0:4]

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,B,C,D
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,81,5,51
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,92,44,39
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,7,96,45
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,32,43,53


First 3 columns

In [81]:
joined_df.iloc[:,0:3] # exclusivly leaves out the 4th cols because of [0:3]

Unnamed: 0,age,sex,cp
0,63,1,3
1,37,1,2
2,41,0,1
3,56,1,1
4,57,0,0
...,...,...,...
298,57,0,0
299,45,1,3
300,68,1,0
301,57,1,0


Or grab rows (0,3,6,24) from columns (0,5,6)

In [82]:
joined_df.iloc[[0,3,6,24], [0,5,6]]  # the first list reps the rows, the second list represents the cols

Unnamed: 0,age,fbs,restecg
0,63,1,0
3,56,0,1
6,56,0,0
24,40,0,1


# Ways to utilize lambda functions
#### .map(), .apply(), .applymap()

The map() method only works on pandas series

The apply () method works on panda series and data frames

The applymap() method works on the entire pandas data frame where the input function is applied to every element individually. In other words, applymap() is appy() + map()!

In [83]:
df = pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [90]:
new_df = df.age.map(lambda x: x * 10) 

In [88]:
super_df = df.age.apply(lambda x: x * 10)

#### How else could we manipulate the 'age' column?

In [92]:
my_df = df.age.map(lambda x: x * 5)  # multiplying age by 5

Do these two dataframes equal the same thing?

In [93]:
(new_df == super_df).value_counts()

True    303
Name: age, dtype: int64

In [94]:
apply_df = df.applymap(lambda x: x*10)

In [95]:
apply_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,630,10,30,1450,2330,10,0,1500,0,23.0,0,0,10,10
1,370,10,20,1300,2500,0,10,1870,0,35.0,0,0,20,10
2,410,0,10,1300,2040,0,0,1720,0,14.0,20,0,20,10
3,560,10,10,1200,2360,0,10,1780,0,8.0,20,0,20,10
4,570,0,0,1200,3540,0,10,1630,10,6.0,20,0,20,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,570,0,0,1400,2410,0,10,1230,10,2.0,10,0,30,0
299,450,10,30,1100,2640,0,10,1320,0,12.0,10,0,30,0
300,680,10,0,1440,1930,10,10,1410,0,34.0,10,20,30,0
301,570,10,0,1300,1310,0,10,1150,10,12.0,10,10,30,0


In [100]:
pip install faker

Collecting faker
[?25l  Downloading https://files.pythonhosted.org/packages/b0/fa/cc588102bbc75983141265d0013dd42cb4e6223961ee03f1bdbcf6e9d4e2/Faker-4.0.0-py3-none-any.whl (963kB)
[K     |████████████████████████████████| 972kB 604kB/s eta 0:00:01
[?25hCollecting text-unidecode==1.3
  Using cached https://files.pythonhosted.org/packages/a6/a5/c0b6468d3824fe3fde30dbb5e1f687b291608f9473681bbf7dabbf5a87d7/text_unidecode-1.3-py2.py3-none-any.whl
Installing collected packages: text-unidecode, faker
Successfully installed faker-4.0.0 text-unidecode-1.3
Note: you may need to restart the kernel to use updated packages.


In [135]:
from faker import Faker # Uses fake data to play around with
fake = Faker() # Made it an object
fake.name()

'Mary Jones'

Let's generate a column of names

In [136]:
df['name'] = [fake.name() for x in range(1025)]
df.head()

ValueError: Length of values does not match length of index

In [137]:
df.name = df.name.map(lambda x: x.replace(" ",'_'))

AttributeError: 'DataFrame' object has no attribute 'name'

## BONUS:
##### Can you grab the row with info on Brent Jones?

In [138]:
brent_jones = None

Answer Below

In [None]:
brent_jones = df.loc[df['name'] == 'Brent_Jones']