# Intro to Pandas 

In [3]:
import numpy as np
import pandas as pd
import rdatasets

In [11]:
# Load the iris dataset by name and wrap it in a DataFrame used by the cells below.
dataset = rdatasets.data("iris")
df = pd.DataFrame(data=dataset)

In [31]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


Ref 1: https://www.kaggle.com/learn/pandas<br>
Ref 2: https://github.com/ageron/handson-ml3/blob/main/tools_pandas.ipynb

## Creating, reading

In [12]:
# Build a one-row frame from a mapping of column name -> list of values.
new_df = pd.DataFrame(data=dict(Apples=[31], Bananas=[20]))

In [17]:
# Same frame, built from a list of rows plus explicit column names.
new_df = pd.DataFrame(
    data=[[31, 20]],
    columns=['Apples', 'Bananas'],
)

In [18]:
# Same frame again via the from_dict constructor; the dict keys become the columns.
new_df = pd.DataFrame.from_dict(
    {'Apples': [31], 'Bananas': [20]},
    orient='columns',
)

Adding Index:

In [20]:
# Two-row frame whose rows are labelled through a custom index.
new_df = pd.DataFrame(
    data={'Apples': [35, 41], 'Bananas': [21, 34]},
    index=['2017 Sales', '2018 Sales'],
)

In [21]:
# Show the frame built with the custom row index as this cell's output.
new_df

Unnamed: 0,Apples,Bananas
2017 Sales,35,21
2018 Sales,41,34


In [22]:
# A Series is a single labelled column: values plus a matching index.
new_series = pd.Series(
    data=['4 cups', '1 cup', '2 large', '1 can'],
    index=['Flour', 'Milk', 'Eggs', 'Spam'],
)

In [23]:
# Show the labelled Series as this cell's output.
new_series

Flour     4 cups
Milk       1 cup
Eggs     2 large
Spam       1 can
dtype: object

Use `pd.read_x` (where x is `csv`, `excel`, etc.) to read data from a file into a DataFrame.

## Indexing, Selecting 

In [26]:
# Attribute access picks the column; .iloc then slices its first 100 rows by position.
df.Species.iloc[:100]

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
95    versicolor
96    versicolor
97    versicolor
98    versicolor
99    versicolor
Name: Species, Length: 100, dtype: object

In [27]:
# Column-first indexing: select the column by label, then slice the
# resulting Series down to its first 100 entries.
df['Species'][:100]

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
95    versicolor
96    versicolor
97    versicolor
98    versicolor
99    versicolor
Name: Species, Length: 100, dtype: object

In [30]:
# .loc is label-based (row labels first, then column label) and its slices
# include the end label, so :99 returns the 100 rows labelled 0 through 99.
df.loc[:99,'Species']

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
95    versicolor
96    versicolor
97    versicolor
98    versicolor
99    versicolor
Name: Species, Length: 100, dtype: object

In [33]:
# .iloc is purely positional (rows first, then columns): the first 100 rows
# of the column at position 4, which is Species in this frame.
df.iloc[:100,4]

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
95    versicolor
96    versicolor
97    versicolor
98    versicolor
99    versicolor
Name: Species, Length: 100, dtype: object

### Note:
Plain Python-style indexing, i.e. `df['Species'][0]`, selects the column first and the row second, whereas `loc` and `iloc` take the row first and the column second.

In [34]:
# Boolean-mask selection: keep only the rows whose Species equals 'setosa'.
df.loc[df['Species'].eq('setosa')]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


In [35]:
# Combine two boolean masks with & : setosa rows with a sepal length of at least 5.3.
df.loc[df['Species'].eq('setosa') & df['Sepal.Length'].ge(5.3)]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
5,5.4,3.9,1.7,0.4,setosa
10,5.4,3.7,1.5,0.2,setosa
14,5.8,4.0,1.2,0.2,setosa
15,5.7,4.4,1.5,0.4,setosa
16,5.4,3.9,1.3,0.4,setosa
18,5.7,3.8,1.7,0.3,setosa
20,5.4,3.4,1.7,0.2,setosa
31,5.4,3.4,1.5,0.4,setosa
33,5.5,4.2,1.4,0.2,setosa
36,5.5,3.5,1.3,0.2,setosa


In [37]:
# isin keeps rows whose Species matches any value in the given list;
# the trailing ':' selects every column explicitly.
df.loc[df['Species'].isin(['setosa']), :]

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,5.4,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa
8,4.4,2.9,1.4,0.2,setosa
9,4.9,3.1,1.5,0.1,setosa


## Summary functions, map

In [38]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the non-numeric Species column does not appear in the result.
df.describe()

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [40]:
# Number of rows for each distinct species.
df['Species'].value_counts()

virginica     50
setosa        50
versicolor    50
Name: Species, dtype: int64

In [41]:
# The distinct species values, returned as an array.
df['Species'].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [42]:
# How many distinct species there are.
df['Species'].nunique()

3

In [43]:
# Sepal length-to-width ratio for every row, stored as a new column.
# Dividing the two columns directly is vectorised, so pandas computes it
# element-wise without calling a Python lambda once per row via apply(axis=1).
df['sepal_ratio'] = df['Sepal.Length'] / df['Sepal.Width']

In [44]:
# Inspect the newly added ratio column.
df['sepal_ratio']

0      1.457143
1      1.633333
2      1.468750
3      1.483871
4      1.388889
         ...   
145    2.233333
146    2.520000
147    2.166667
148    1.823529
149    1.966667
Name: sepal_ratio, Length: 150, dtype: float64