# Pandas

## Series

In [1]:
import numpy as np
import pandas as pd

In [7]:
lbl = ['a','b','c']
lst = [10,20,30]

arr1 = np.array([10, 20, 30])

d = {'a':10,'b':20,'c':30}

In [5]:
## With list

pd.Series(lst)

0    10
1    20
2    30
dtype: int64

In [9]:
## With list and custom label

pd.Series(data=lst,index=lbl)

a    10
b    20
c    30
dtype: int64

In [10]:
## Dictionary

pd.Series(d)

a    10
b    20
c    30
dtype: int64

In [13]:
d1 = {'US': 50, 'China': 70, 'IND': 100}
d2 = {'US': 30, 'NZ': 45, 'IND': 55}

a = pd.Series(d1)
b = pd.Series(d2)

In [14]:
a+b

China      NaN
IND      155.0
NZ         NaN
US        80.0
dtype: float64

## DataFrames

In [26]:
df = pd.DataFrame(data = np.random.randn(5,3), index = 'a b c d e'.split(' '), columns = ['US', 'AUS', 'UK'])

In [27]:
df

Unnamed: 0,US,AUS,UK
a,-0.640911,0.28593,-0.549811
b,-0.83796,-2.193583,-0.609038
c,-0.130206,0.573355,0.733247
d,2.060353,0.425544,-0.757493
e,-0.37055,1.674514,-1.619657


In [28]:
df['US']

a   -0.640911
b   -0.837960
c   -0.130206
d    2.060353
e   -0.370550
Name: US, dtype: float64

In [29]:
df.US

a   -0.640911
b   -0.837960
c   -0.130206
d    2.060353
e   -0.370550
Name: US, dtype: float64

In [30]:
df.columns

Index(['US', 'AUS', 'UK'], dtype='object')

In [31]:
df[['US','AUS']]

Unnamed: 0,US,AUS
a,-0.640911,0.28593
b,-0.83796,-2.193583
c,-0.130206,0.573355
d,2.060353,0.425544
e,-0.37055,1.674514


In [32]:
df['New'] = df['US'] + df['AUS']

In [33]:
df

Unnamed: 0,US,AUS,UK,New
a,-0.640911,0.28593,-0.549811,-0.354981
b,-0.83796,-2.193583,-0.609038,-3.031544
c,-0.130206,0.573355,0.733247,0.443148
d,2.060353,0.425544,-0.757493,2.485897
e,-0.37055,1.674514,-1.619657,1.303964


In [35]:
df.drop('New',axis=1)
df

Unnamed: 0,US,AUS,UK,New
a,-0.640911,0.28593,-0.549811,-0.354981
b,-0.83796,-2.193583,-0.609038,-3.031544
c,-0.130206,0.573355,0.733247,0.443148
d,2.060353,0.425544,-0.757493,2.485897
e,-0.37055,1.674514,-1.619657,1.303964


In [36]:
df['sum'] = df['US']+df['AUS']+df['UK']

In [37]:
df = df.drop('New',axis = 1)

In [38]:
df

Unnamed: 0,US,AUS,UK,sum
a,-0.640911,0.28593,-0.549811,-0.904792
b,-0.83796,-2.193583,-0.609038,-3.640582
c,-0.130206,0.573355,0.733247,1.176395
d,2.060353,0.425544,-0.757493,1.728404
e,-0.37055,1.674514,-1.619657,-0.315694


In [39]:
df.drop('sum',axis=1,inplace=True)
df

Unnamed: 0,US,AUS,UK
a,-0.640911,0.28593,-0.549811
b,-0.83796,-2.193583,-0.609038
c,-0.130206,0.573355,0.733247
d,2.060353,0.425544,-0.757493
e,-0.37055,1.674514,-1.619657


### Selection by label index using .loc

In [41]:
df.loc['a']

US    -0.640911
AUS    0.285930
UK    -0.549811
Name: a, dtype: float64

In [43]:
df.iloc[0]

US    -0.640911
AUS    0.285930
UK    -0.549811
Name: a, dtype: float64

In [44]:
df.loc['b','UK']

-0.6090383622981055

In [46]:
df[df['US'] > 0.05]

Unnamed: 0,US,AUS,UK
d,2.060353,0.425544,-0.757493


In [47]:
df['US'] > 0.05

a    False
b    False
c    False
d     True
e    False
Name: US, dtype: bool

In [48]:
df[df['US'] > 0.05]['UK']

d   -0.757493
Name: UK, dtype: float64

In [49]:
df

Unnamed: 0,US,AUS,UK
a,-0.640911,0.28593,-0.549811
b,-0.83796,-2.193583,-0.609038
c,-0.130206,0.573355,0.733247
d,2.060353,0.425544,-0.757493
e,-0.37055,1.674514,-1.619657


### We can reset the index

In [50]:
df.reset_index()

Unnamed: 0,index,US,AUS,UK
0,a,-0.640911,0.28593,-0.549811
1,b,-0.83796,-2.193583,-0.609038
2,c,-0.130206,0.573355,0.733247
3,d,2.060353,0.425544,-0.757493
4,e,-0.37055,1.674514,-1.619657


In [51]:
df

Unnamed: 0,US,AUS,UK
a,-0.640911,0.28593,-0.549811
b,-0.83796,-2.193583,-0.609038
c,-0.130206,0.573355,0.733247
d,2.060353,0.425544,-0.757493
e,-0.37055,1.674514,-1.619657


In [53]:
new_idx = '1 2 3 4 5'.split()

In [55]:
df['idx'] = new_idx

df.set_index('idx')

Unnamed: 0_level_0,US,AUS,UK
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,-0.640911,0.28593,-0.549811
2,-0.83796,-2.193583,-0.609038
3,-0.130206,0.573355,0.733247
4,2.060353,0.425544,-0.757493
5,-0.37055,1.674514,-1.619657


In [56]:
df

Unnamed: 0,US,AUS,UK,idx
a,-0.640911,0.28593,-0.549811,1
b,-0.83796,-2.193583,-0.609038,2
c,-0.130206,0.573355,0.733247,3
d,2.060353,0.425544,-0.757493,4
e,-0.37055,1.674514,-1.619657,5


## Handling Missing Data 

In [58]:
df = pd.DataFrame({'a':[23,43,12],
'b':[25,np.nan,55],
'c':[np.nan,np.nan,92]}
)

df

Unnamed: 0,a,b,c
0,23,25.0,
1,43,,
2,12,55.0,92.0


In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   a       3 non-null      int64  
 1   b       2 non-null      float64
 2   c       1 non-null      float64
dtypes: float64(2), int64(1)
memory usage: 200.0 bytes


In [60]:
df.describe()

Unnamed: 0,a,b,c
count,3.0,2.0,1.0
mean,26.0,40.0,92.0
std,15.716234,21.213203,
min,12.0,25.0,92.0
25%,17.5,32.5,92.0
50%,23.0,40.0,92.0
75%,33.0,47.5,92.0
max,43.0,55.0,92.0


### Either you fill the missing value or drop

In [63]:
df.dropna(axis=1)

Unnamed: 0,a
0,23
1,43
2,12


In [64]:
df

Unnamed: 0,a,b,c
0,23,25.0,
1,43,,
2,12,55.0,92.0


In [65]:
df.fillna(value='Value to fill')

Unnamed: 0,a,b,c
0,23,25.0,Value to fill
1,43,Value to fill,Value to fill
2,12,55.0,92.0


In [66]:
df

Unnamed: 0,a,b,c
0,23,25.0,
1,43,,
2,12,55.0,92.0


In [67]:
df['b'] = df.fillna(value=df['b'].mean())

In [68]:
df

Unnamed: 0,a,b,c
0,23,23.0,
1,43,43.0,
2,12,12.0,92.0


In [69]:
df['c'] = df.fillna(value = df['c'].mean())

In [70]:
cmp = 'GOOG GOOG GOOG AAPL AAPL AAPL MSFT MSFT'.split()
sp = 'KIRAN VANDANA NAGA VIPUL OSCAR PUSPA SAIMA YESH'.split()
sales = [1000,2100,92121,1281,312818,1292,1821,281921]

In [71]:
df = pd.DataFrame(sales)
df['Company'] = cmp
df['Salesperson'] = sp

In [72]:
df

Unnamed: 0,0,Company,Salesperson
0,1000,GOOG,KIRAN
1,2100,GOOG,VANDANA
2,92121,GOOG,NAGA
3,1281,AAPL,VIPUL
4,312818,AAPL,OSCAR
5,1292,AAPL,PUSPA
6,1821,MSFT,SAIMA
7,281921,MSFT,YESH


In [73]:
df.groupby('Company')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11754fd90>

In [74]:
df.groupby('Company').sum()

Unnamed: 0_level_0,0
Company,Unnamed: 1_level_1
AAPL,315391
GOOG,95221
MSFT,283742


In [75]:
df.groupby('Company').describe()

Unnamed: 0_level_0,0,0,0,0,0,0,0,0
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AAPL,3.0,105130.333333,179862.79547,1281.0,1286.5,1292.0,157055.0,312818.0
GOOG,3.0,31740.333333,52294.083607,1000.0,1550.0,2100.0,47110.5,92121.0
MSFT,2.0,141871.0,198060.60941,1821.0,71846.0,141871.0,211896.0,281921.0


In [76]:
df.groupby('Company').describe().transpose()

Unnamed: 0,Company,AAPL,GOOG,MSFT
0,count,3.0,3.0,2.0
0,mean,105130.333333,31740.333333,141871.0
0,std,179862.79547,52294.083607,198060.60941
0,min,1281.0,1000.0,1821.0
0,25%,1286.5,1550.0,71846.0
0,50%,1292.0,2100.0,141871.0
0,75%,157055.0,47110.5,211896.0
0,max,312818.0,92121.0,281921.0


In [85]:
df

Unnamed: 0,0,Company,Salesperson
0,1000,GOOG,KIRAN
1,2100,GOOG,VANDANA
2,92121,GOOG,NAGA
3,1281,AAPL,VIPUL
4,312818,AAPL,OSCAR
5,1292,AAPL,PUSPA
6,1821,MSFT,SAIMA
7,281921,MSFT,YESH


In [86]:
df.applymap(lambda x:len(str(x)))

Unnamed: 0,0,Company,Salesperson
0,4,4,5
1,4,4,7
2,5,4,4
3,4,4,5
4,6,4,5
5,4,4,5
6,4,4,5
7,6,4,4


In [90]:
df.columns

Index([0, 'Company', 'Salesperson'], dtype='object')

In [91]:
df.dtypes

0               int64
Company        object
Salesperson    object
dtype: object

In [96]:
df.isin({'Company':['GOOG','MSFT']})['Company']

0     True
1     True
2     True
3    False
4    False
5    False
6     True
7     True
Name: Company, dtype: bool

## Pivot Tables

In [99]:
url = "https://raw.githubusercontent.com/resbaz/r-novice-gapminder-files/master/data/gapminder-FiveYearData.csv"

df = pd.read_csv(url)

df.head()

Unnamed: 0,country,year,pop,continent,lifeExp,gdpPercap
0,Afghanistan,1952,8425333.0,Asia,28.801,779.445314
1,Afghanistan,1957,9240934.0,Asia,30.332,820.85303
2,Afghanistan,1962,10267083.0,Asia,31.997,853.10071
3,Afghanistan,1967,11537966.0,Asia,34.02,836.197138
4,Afghanistan,1972,13079460.0,Asia,36.088,739.981106


In [100]:
df.shape

(1704, 6)

In [102]:
df.describe()

Unnamed: 0,year,pop,lifeExp,gdpPercap
count,1704.0,1704.0,1704.0,1704.0
mean,1979.5,29601210.0,59.474439,7215.327081
std,17.26533,106157900.0,12.917107,9857.454543
min,1952.0,60011.0,23.599,241.165876
25%,1965.75,2793664.0,48.198,1202.060309
50%,1979.5,7023596.0,60.7125,3531.846988
75%,1993.25,19585220.0,70.8455,9325.462346
max,2007.0,1318683000.0,82.603,113523.1329


In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   country    1704 non-null   object 
 1   year       1704 non-null   int64  
 2   pop        1704 non-null   float64
 3   continent  1704 non-null   object 
 4   lifeExp    1704 non-null   float64
 5   gdpPercap  1704 non-null   float64
dtypes: float64(3), int64(1), object(2)
memory usage: 80.0+ KB


In [104]:
df1 = df[['continent','lifeExp']]

In [110]:
pd.pivot_table(df1,values="lifeExp",columns="continent")

continent,Africa,Americas,Asia,Europe,Oceania
lifeExp,48.86533,64.658737,60.064903,71.903686,74.326208


In [112]:
df['continent'].value_counts()

Africa      624
Asia        396
Europe      360
Americas    300
Oceania      24
Name: continent, dtype: int64