## Reindex - How To Reindex Pandas Objects

In [1]:
# pd.Series.reindex?
# pd.DataFrame.reindex?

In [2]:
import pandas as pd
import numpy as np
ob = pd.Series([1, 2, 3, 6], index=['d', 'b', 'a', 'c'])

print(ob)
print(ob.reindex(index=['a', 'b', 'c', 'd']))      #int
print(ob.reindex(index=['a', 'b', 'c', 'd', 'e'])) #NaN -> float

d    1
b    2
a    3
c    6
dtype: int64
a    3
b    2
c    6
d    1
dtype: int64
a    3.0
b    2.0
c    6.0
d    1.0
e    NaN
dtype: float64


In [3]:
ob2 = pd.Series([1, 2, 3], index = [0, 1, 2])

print(ob2)
print(ob2.reindex(index=np.arange(6)))
print(ob2.reindex(index=np.arange(6), method = 'ffill'))

0    1
1    2
2    3
dtype: int64
0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    NaN
dtype: float64
0    1
1    2
2    3
3    3
4    3
5    3
dtype: int64


In [4]:
ob3 = pd.DataFrame(np.arange(9).reshape((3, 3)), 
                  index=['a', 'c', 'd'], columns=['Andhra', 'Tamilnadu', 'Kerala'])
print(ob3)
print(ob3.reindex(index=['a', 'b', 'c', 'd']))
print(ob3.reindex(columns=['Andhra', 'Telangana', 'Tamilnadu', 'Kerala']))

   Andhra  Tamilnadu  Kerala
a       0          1       2
c       3          4       5
d       6          7       8
   Andhra  Tamilnadu  Kerala
a     0.0        1.0     2.0
b     NaN        NaN     NaN
c     3.0        4.0     5.0
d     6.0        7.0     8.0
   Andhra  Telangana  Tamilnadu  Kerala
a       0        NaN          1       2
c       3        NaN          4       5
d       6        NaN          7       8


## Drop - Droping Entries From an Axis

In [5]:
# pd.Series.drop?
# pd.DataFrame.drop?

In [6]:
import pandas as pd
import numpy as np

data = pd.Series(np.arange(6), index=['a', 'b', 'c', 'd', 'e', 'f'])

print(data)
print(data.drop('a'))
print(data.drop(['a', 'd']))

a    0
b    1
c    2
d    3
e    4
f    5
dtype: int64
b    1
c    2
d    3
e    4
f    5
dtype: int64
b    1
c    2
e    4
f    5
dtype: int64


In [7]:
dataframe = pd.DataFrame(np.arange(16).reshape((4, 4)), 
                  index=['a', 'b', 'd', 'e'], columns=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
print(dataframe)
print(dataframe.drop(['a', 'e']))
print(dataframe.drop('Kerala', axis=1))
print(dataframe.drop(['Kerala', 'Andhra'], axis=1))
print(dataframe.drop(['Kerala', 'Tamilnadu'], axis='columns'))
print(dataframe.drop(['Kerala', 'Andhra'], axis=1, inplace=True))

   Karnataka  Andhra  Tamilnadu  Kerala
a          0       1          2       3
b          4       5          6       7
d          8       9         10      11
e         12      13         14      15
   Karnataka  Andhra  Tamilnadu  Kerala
b          4       5          6       7
d          8       9         10      11
   Karnataka  Andhra  Tamilnadu
a          0       1          2
b          4       5          6
d          8       9         10
e         12      13         14
   Karnataka  Tamilnadu
a          0          2
b          4          6
d          8         10
e         12         14
   Karnataka  Andhra
a          0       1
b          4       5
d          8       9
e         12      13
None


## Aggregation - Arithmetic and Data Alignment

```Exponentially Weighted Windows Function```: Provide exponential weighted (EW) functions.

**Available EW functions: mean(), var(), std(), corr(), cov()**

* **com**: float, optional
Specify decay in terms of center of mass, α=1/(1+com), for com≥0.

* **span**: float, optional
Specify decay in terms of span, α=2/(span+1), for span≥1.

* **halflife**: float, str, timedelta, optional
Specify decay in terms of half-life, α=1−exp(−ln(2)/halflife), for halflife>0.

* **alpha**: float, optional
Specify smoothing factor α directly, 0<α≤1.

In [8]:
# pd.DataFrame.add?

In [9]:
import pandas as pd
import numpy as np
df = pd.DataFrame({"A": [0, 1, 2, np.nan, 4, 5, 8, np.nan]})
df

Unnamed: 0,A
0,0.0
1,1.0
2,2.0
3,
4,4.0
5,5.0
6,8.0
7,


In [10]:
df.expanding(2).sum()  # lets Try an Expanding Sum Function
df.expanding(3).corr() # Expanding Function with Correlation Function
df.expanding(3).kurt() # Expanding Function with Kurtosis values

Unnamed: 0,A
0,
1,
2,
3,
4,0.342857
5,-1.963223
6,-0.3
7,-0.3


In [11]:
# Expanding Function with Aggregate Functions
df.expanding(3).agg('mean') # Expanding Function with Aggregate Functions
df.expanding(3).agg(['mean', 'median','max'])

Unnamed: 0_level_0,A,A,A
Unnamed: 0_level_1,mean,median,max
0,,,
1,,,
2,1.0,1.0,2.0
3,1.0,1.0,2.0
4,1.75,1.5,4.0
5,2.4,2.0,5.0
6,3.333333,3.0,8.0
7,3.333333,3.0,8.0


In [12]:
# lets Create a DataFrame
df = pd.DataFrame({'B': [78, 85, 56, 2, np.nan, 4, 85, 8, np.nan]})
df

Unnamed: 0,B
0,78.0
1,85.0
2,56.0
3,2.0
4,
5,4.0
6,85.0
7,8.0
8,


In [13]:
df.ewm(com = 0.5).mean()
df.ewm(span = 5).std()
df.ewm(span = 5).corr()
df.ewm(span = 5).var()

Unnamed: 0,B
0,
1,24.5
2,280.763158
3,1738.489879
4,1738.489879
5,1432.635824
6,2079.547342
7,1774.119282
8,1774.119282


In [14]:
import pandas as pd 
import numpy as np

ser1 = pd.Series([7, 5, 4, 1], index=['a', 'c', 'd', 'e'])
ser2 = pd.Series([7, 5, 4, 1, 3], index=['a', 'c', 'e', 'f', 'g'])
ser1 + ser2

a    14.0
c    10.0
d     NaN
e     5.0
f     NaN
g     NaN
dtype: float64

In [15]:
df1 = pd.DataFrame(np.arange(9).reshape((3, 3)), columns=['a', 'c', 'd'], index=['Andhra', 'Tamilnadu', 'Kerala'])
df2 = pd.DataFrame(np.arange(16).reshape((4, 4)), columns=['a', 'b', 'd', 'e'], index=['Karnataka', 'Andhra', 'Tamilnadu', 'Kerala'])
df1 + df2

Unnamed: 0,a,b,c,d,e
Andhra,4.0,,,8.0,
Karnataka,,,,,
Kerala,18.0,,,22.0,
Tamilnadu,11.0,,,15.0,


In [16]:
df3 = pd.DataFrame({'A': [1, 2]})
df4 = pd.DataFrame({'B': [3, 4]})
df3 + df4

Unnamed: 0,A,B
0,,
1,,


In [17]:
import pandas as pd
import numpy as np

df5 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=list('abcd'))
df6 = pd.DataFrame(np.arange(20).reshape((4, 5)), columns=list('abcde'))
df5 + df6

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


In [18]:
df5.add(df6, fill_value=0)
# df5.add(df6, fill_value=10)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,11.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [19]:
df5 + 10
# df5 - 3
# df5 * 2
# 1/df5

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


In [20]:
df5.radd(10)
# df5.rdiv(1)
# df5.rmul(2)
# df5.rpow(2)

Unnamed: 0,a,b,c,d
0,10,11,12,13
1,14,15,16,17
2,18,19,20,21


## Operations Between DataFrame and Series

In [21]:
import pandas as pd
import numpy as np
df7 = pd.DataFrame(np.arange(12.).reshape((4, 3)),  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
ds1 = df7.iloc[0]
df7 - ds1

Unnamed: 0,b,d,e
One,0.0,0.0,0.0
Two,3.0,3.0,3.0
Three,6.0,6.0,6.0
Four,9.0,9.0,9.0


In [22]:
ds2 = pd.Series(range(3), index=['b', 'e', 'f'])
df7 + ser2

Unnamed: 0,a,b,c,d,e,f,g
One,,,,,6.0,,
Two,,,,,9.0,,
Three,,,,,12.0,,
Four,,,,,15.0,,


In [23]:
df7.sub(df7, axis='index')

Unnamed: 0,b,d,e
One,0.0,0.0,0.0
Two,0.0,0.0,0.0
Three,0.0,0.0,0.0
Four,0.0,0.0,0.0


## lambda - Function Application and Mapping

In [24]:
# pd.DataFrame.apply?
# pd.DataFrame.applymap?

In [25]:
import pandas as pd
import numpy as np
df8 = pd.DataFrame(np.random.randn(4, 3),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df8

Unnamed: 0,b,d,e
One,0.606333,-0.586117,-0.134542
Two,-1.200897,0.765115,0.507807
Three,-0.33249,1.258358,1.760344
Four,2.119337,-0.410443,0.040586


In [26]:
abs(df8)

Unnamed: 0,b,d,e
One,0.606333,0.586117,0.134542
Two,1.200897,0.765115,0.507807
Three,0.33249,1.258358,1.760344
Four,2.119337,0.410443,0.040586


In [27]:
f = lambda x: x.max()
#f = lambda x: x.min()
#f = lambda x: x.max() - x.min()
df8.apply(f)

b    2.119337
d    1.258358
e    1.760344
dtype: float64

In [28]:
f = lambda x: x.max()
#f = lambda x: x.max() - x.min()
df8.apply(f, axis='columns')
df8.apply(f, axis='columns')

One      0.606333
Two      0.765115
Three    1.760344
Four     2.119337
dtype: float64

In [29]:
def f(x):  return pd.Series([x.max(), x.min(), x.mean()], index=['max', 'min', 'mean'])
df8.apply(f)

Unnamed: 0,b,d,e
max,2.119337,1.258358,1.760344
min,-1.200897,-0.586117,-0.134542
mean,0.298071,0.256728,0.543549


In [30]:
f = lambda x: '%.3f' %x
df8.applymap(f)

Unnamed: 0,b,d,e
One,0.606,-0.586,-0.135
Two,-1.201,0.765,0.508
Three,-0.332,1.258,1.76
Four,2.119,-0.41,0.041


## sort - Sorting and Ranking

In [31]:
# pd.DataFrame.sort_index?
# pd.DataFrame.rank?
# pd.DataFrame.sort_values?

In [32]:
import pandas as pd
import numpy as np

series = pd.Series(range(6), index=['d', 'a', 'b', 'c', 'f', 'g'])
series.sort_index(axis=0, level=None, ascending=True)

a    1
b    2
c    3
d    0
f    4
g    5
dtype: int64

In [33]:
df8 = pd.DataFrame(np.random.randn(4, 3),  
                  columns=list('bde'), index=['One', 'Two', 'Three', 'Four'])
df8.rank()
# df8.rank(axis='columns')
# df8.loc[:, 'b']

Unnamed: 0,b,d,e
One,3.0,2.0,3.0
Two,2.0,1.0,4.0
Three,4.0,3.0,1.0
Four,1.0,4.0,2.0


In [34]:
df9 = pd.DataFrame(np.random.randn(4, 5),  
                   columns=list('bdeac'), index=['1', '3', '2', '4'])
df9.sort_index(axis=1, level=None, ascending=True)
# df9.sort_index(axis=1, level=None, ascending=False)
# df9.sort_index(axis=0, level=None, ascending=True)
# df9.sort_values(by=['b'])
# df9.sort_values(by=['d'])

Unnamed: 0,a,b,c,d,e
1,0.458762,0.226919,-0.533558,0.992763,0.383123
3,-0.875593,-0.599772,1.223011,2.207204,-1.18564
2,-0.169795,0.224202,0.414512,-0.091752,-1.507841
4,-0.535319,-0.58772,0.969841,-0.404804,-1.791777


In [35]:
df10 =pd.DataFrame(np.arange(12).reshape((3, 4)), 
                   index=['1', '3', '2'], columns=['d', 'a', 'b', 'c'])
df10.sort_index(axis='index', level=None, ascending=True)

Unnamed: 0,d,a,b,c
1,0,1,2,3
2,8,9,10,11
3,4,5,6,7


## describe - How to Summarise and compute Descriptive Statistics?

In [36]:
# pd.Series.describe?

In [37]:
import pandas as pd
import numpy as np
df12 = pd.DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]], 
                    index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
df12

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [38]:
df12.describe()

Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [39]:
df12.sum()
# df12.sum(axis='columns')
# df12.mean(axis='columns', skipna=False)
# df12.idxmax()
# df12.cumsum()

one    9.25
two   -5.80
dtype: float64

## The Melt Function
* This function is useful to massage a DataFrame into a format where one or more columns are identifier variables (id_vars), while all other columns, considered measured variables (value_vars), are “unpivoted” to the row axis, leaving just two non-identifier columns, ‘variable’ and ‘value’.
![image.png](attachment:image.png)

In [40]:
import pandas as pd
df = pd.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
                   'B': {0: 1, 1: 3, 2: 5},
                   'C': {0: 2, 1: 4, 2: 6}})

df

Unnamed: 0,A,B,C
0,a,1,2
1,b,3,4
2,c,5,6


In [41]:
# lets melt the data 
df.melt(id_vars=['A'], value_vars=['B'])

Unnamed: 0,A,variable,value
0,a,B,1
1,b,B,3
2,c,B,5


In [42]:
#create a dataframe
df = pd.DataFrame({'Name': {0:'Ritika',1:'shyam',2:'neil'},
                  'Course': {0:'Masters',1:'Graduate',2:'Masters'},
                  'Age': {0:22,1:20,2:24}})
df

Unnamed: 0,Name,Course,Age
0,Ritika,Masters,22
1,shyam,Graduate,20
2,neil,Masters,24


In [43]:
df.melt(id_vars=['Name'], value_vars=['Course','Age'])

Unnamed: 0,Name,variable,value
0,Ritika,Course,Masters
1,shyam,Course,Graduate
2,neil,Course,Masters
3,Ritika,Age,22
4,shyam,Age,20
5,neil,Age,24


## The Explode Function 

* Transform each element of a list-like to a row, replicating index values. 將類似列表的每個元素轉換為一行，複製索引值
![image.png](attachment:image.png)

In [44]:
# lets Create a Data Frame
df = pd.DataFrame({'A': [[1, 2, 3], 'foo', [], [3, 4]], 'B': 1})
df

Unnamed: 0,A,B
0,"[1, 2, 3]",1
1,foo,1
2,[],1
3,"[3, 4]",1


In [45]:
# lets Explode the Column A
df.explode('A')

Unnamed: 0,A,B
0,1,1
0,2,1
0,3,1
1,foo,1
2,,1
3,3,1
3,4,1


## Squeeze Function
* Series or DataFrames with a single element are squeezed to a scalar. DataFrames with a single column or a single row are squeezed to a Series. Otherwise the object is unchanged.

* This method is most useful when you don’t know if your object is a Series or DataFrame, but you do know it has just a single column. In that case you can safely call squeeze to ensure you have a Series.

![image.png](attachment:image.png)

In [46]:
# lets create a DataFrame
df = pd.DataFrame([[1, 2], [3, 4]], columns=['a', 'b'])
df

Unnamed: 0,a,b
0,1,2
1,3,4


In [47]:
# Slicing a single column will produce a DataFrame with the columns having only one value:
df_a = df[['a']]
df_a

Unnamed: 0,a
0,1
1,3


In [48]:
#lets Squeeze df_a, to get scalar values
df_a.squeeze()

0    1
1    3
Name: a, dtype: int64

## Melt() function
Melt function is used to change the dataframe format from wide to long. It is used to create a specific format of the dataframe object where one or more columns work as identifiers.

![image.png](attachment:image.png)

In [49]:
import pandas as pd
df = pd.read_csv('input/life_expectancy_years.csv')
df.head()

Unnamed: 0,country,1800,1801,1802,1803,1804,1805,1806,1807,1808,...,2091,2092,2093,2094,2095,2096,2097,2098,2099,2100
0,Afghanistan,28.2,28.2,28.2,28.2,28.2,28.2,28.1,28.1,28.1,...,76.5,76.6,76.7,76.9,77.0,77.1,77.3,77.4,77.5,77.7
1,Albania,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,35.4,...,87.4,87.5,87.6,87.7,87.8,87.9,88.0,88.1,88.2,88.3
2,Algeria,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.8,28.8,...,88.3,88.4,88.5,88.6,88.7,88.8,88.9,89.0,89.1,89.2
3,Andorra,,,,,,,,,,...,,,,,,,,,,
4,Angola,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,27.0,...,78.7,78.9,79.0,79.1,79.3,79.4,79.5,79.7,79.8,79.9


In [50]:
df_melted = pd.melt(df, id_vars = 'country', var_name = 'Year', value_name = 'Life Expectancy')
df_melted

Unnamed: 0,country,Year,Life Expectancy
0,Afghanistan,1800,28.2
1,Albania,1800,35.4
2,Algeria,1800,28.8
3,Andorra,1800,
4,Angola,1800,27.0
...,...,...,...
56282,Venezuela,2100,87.3
56283,Vietnam,2100,85.3
56284,Yemen,2100,78.4
56285,Zambia,2100,78.1


In [51]:
df_melted.sort_values(by = 'Life Expectancy', ascending=False)

Unnamed: 0,country,Year,Life Expectancy
56202,Maldives,2100,94.8
56015,Maldives,2099,94.7
56247,Singapore,2100,94.7
56060,Singapore,2099,94.6
55873,Singapore,2098,94.5
...,...,...,...
55961,Dominica,2099,
56018,Marshall Islands,2099,
56103,Andorra,2100,
56148,Dominica,2100,


In [52]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,177,178,179,180,181,182,183,184,185,186
country,Afghanistan,Albania,Algeria,Andorra,Angola,Antigua and Barbuda,Argentina,Armenia,Australia,Austria,...,United Kingdom,United States,Uruguay,Uzbekistan,Vanuatu,Venezuela,Vietnam,Yemen,Zambia,Zimbabwe
1800,28.2,35.4,28.8,,27.0,33.5,33.2,34.0,34.0,34.4,...,38.6,39.4,32.9,26.9,24.3,32.2,32.0,23.4,32.6,33.7
1801,28.2,35.4,28.8,,27.0,33.5,33.2,34.0,34.0,34.4,...,37.4,39.4,32.9,26.9,24.3,32.2,32.0,23.4,32.6,33.7
1802,28.2,35.4,28.8,,27.0,33.5,33.2,34.0,34.0,34.4,...,38.6,39.4,32.9,26.9,24.3,32.2,32.0,23.4,32.6,33.7
1803,28.2,35.4,28.8,,27.0,33.5,33.2,34.0,34.0,34.4,...,37.3,39.4,32.9,26.9,24.3,32.2,32.0,23.4,32.6,33.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2096,77.1,87.9,88.8,,79.4,86.6,87.2,85.9,91.6,91.3,...,90.2,88.1,87.1,79.5,74.0,86.9,84.8,77.9,77.6,75.1
2097,77.3,88.0,88.9,,79.5,86.7,87.3,86.0,91.7,91.5,...,90.3,88.2,87.2,79.6,74.2,87.0,84.9,78.0,77.7,75.3
2098,77.4,88.1,89.0,,79.7,86.8,87.4,86.2,91.8,91.6,...,90.4,88.3,87.3,79.8,74.3,87.1,85.0,78.2,77.8,75.4
2099,77.5,88.2,89.1,,79.8,86.9,87.5,86.3,91.9,91.7,...,90.5,88.4,87.4,79.9,74.4,87.2,85.2,78.3,78.0,75.5
