# Pandas DataFrame overview

## Arithmetic and Data Alignment

In [1]:
import pandas as pd
import numpy as np
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), 
                   columns=list('bcd'), 
                   index=['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), 
                   columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print(df1)
print(df2)
print()
# applying plus operation between two data frames
df3 = df1 + df2

print(df3)
# your work is to fill all Nan values of  this df3 with a number, 
# choice of number is yours

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0

            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


## Arithmetic methods with fill values

In [2]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
                   columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
                    columns=list('abcde'))
print(df1)
#print(df2)

df2.loc[1, 'b'] = np.nan
print(df2)
df3 = df1 + df2
print()
print("direct + operation without fill_value")
#print(df3)
print("--------")
print()
# We can use add method for filling NaN cells with a value
# Nan will be replaced by 0 and then addition operation will apply
print("addition using a method with replacing Nan with 0")
df3 = df1.add(df2, fill_value=0)
print(df3)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0

direct + operation without fill_value
--------

addition using a method with replacing Nan with 0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


## Operations between DataFrame and Series

In [3]:
import numpy as np
import pandas as pd
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
            columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
# iloc syntax
# iloc[start_row_pos:end_row_pos, start_column_pos: end_col_positin ]
series = frame.iloc[0]
print(frame)
print(series)
print(series.values)
print(series.index)
print(frame - series)
# step 1: run above commands after uncomment
# step 2: comment above prints except the line contains print(frame)
# write following lines
# and run the cell again
print("---- using new series")
series = pd.Series([1,2,3], index =list('bde') )
print(series.index, series.values)
print(frame - series)
# dateframe column names will be match with series index
# because its rows wise broadcasting operation

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
[0. 1. 2.]
Index(['b', 'd', 'e'], dtype='object')
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0
---- using new series
Index(['b', 'd', 'e'], dtype='object') [1 2 3]
          b    d    e
Utah   -1.0 -1.0 -1.0
Ohio    2.0  2.0  2.0
Texas   5.0  5.0  5.0
Oregon  8.0  8.0  8.0


## Function Application and Mapping

In [4]:
import numpy as np
import pandas as pd
frame  = pd.DataFrame(np.random.randn(4, 3), 
                             columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print(np.abs(frame))

print(frame["d"]. min())
print(frame["d"].max())
print(frame["d"].max() - frame["d"]. min())

f = lambda x: x.max() - x.min() 

df = frame.apply(f)
print(df, type(df))

df = frame.apply(f, axis=1)
print(df)

def min_max(x):
  return pd.Series( [x.max() - x.min() ], index=['min-max'])    

df = frame.apply(min_max)
print(df, type(df))

               b         d         e
Utah   -1.748711 -0.043213  1.203116
Ohio   -1.280061  0.136607 -0.083281
Texas  -0.412166  0.086889  0.118777
Oregon -0.750932 -0.767709  1.710582
               b         d         e
Utah    1.748711  0.043213  1.203116
Ohio    1.280061  0.136607  0.083281
Texas   0.412166  0.086889  0.118777
Oregon  0.750932  0.767709  1.710582
-0.7677091857333083
0.13660733742344877
0.904316523156757
b    1.336545
d    0.904317
e    1.793864
dtype: float64 <class 'pandas.core.series.Series'>
Utah      2.951827
Ohio      1.416668
Texas     0.530943
Oregon    2.478292
dtype: float64
                b         d         e
min-max  1.336545  0.904317  1.793864 <class 'pandas.core.frame.DataFrame'>


## Sorting and Ranking

In [5]:
# Sorting and Ranking
import numpy as np
import pandas as pd
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
print(frame)
print()
print(frame.sort_index(axis=1, ascending=True))
print()
print( frame.sort_index())
# defaults in sort: axis = 0, ascending = True

       d  a  b  c
three  0  1  2  3
one    4  5  6  7

       a  b  c  d
three  1  2  3  0
one    5  6  7  4

       d  a  b  c
one    4  5  6  7
three  0  1  2  3


## sort by values

In [6]:
# sort by values
print( frame.sort_values(by='b') )
print(frame.rank(ascending=False, method='max'))
print(frame.rank(ascending=True, method='min'))
print( frame.rank(axis='columns'))


#check details from the book
'''
'average' Default: assign the average rank to each entry in the equal group
'min'
'max'
'first'
'dense'
Use the minimum rank for the whole group
Use the maximum rank for the whole group
Assign ranks in the order the values appear in the data
Like method='min' , but ranks always increase by 1 in between groups rather than the number of equal
elements in a group
'''

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
         d    a    b    c
three  2.0  2.0  2.0  2.0
one    1.0  1.0  1.0  1.0
         d    a    b    c
three  1.0  1.0  1.0  1.0
one    2.0  2.0  2.0  2.0
         d    a    b    c
three  1.0  2.0  3.0  4.0
one    1.0  2.0  3.0  4.0


"\n'average' Default: assign the average rank to each entry in the equal group\n'min'\n'max'\n'first'\n'dense'\nUse the minimum rank for the whole group\nUse the maximum rank for the whole group\nAssign ranks in the order the values appear in the data\nLike method='min' , but ranks always increase by 1 in between groups rather than the number of equal\nelements in a group\n"

## Summarizing and Computing Descriptive Statistics

In [7]:
df = pd.DataFrame([
                    [1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]
                   ], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
print(df)
print()
print( df.sum())
print()
print( df.sum(axis='columns'))
# note: use of skipna

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

one    9.25
two   -5.80
dtype: float64

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [8]:
print(df)
x = df.mean(axis='columns', skipna=False)
print()
print(x)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


## unique values

In [9]:
# unique values
df = pd.DataFrame([
                    [1.4,1.4, 1.5, np.nan], [7.1, -4.5, 1.5, 1.4],
                    [1.4, np.nan, 0.5, np.nan], [0.75, -1.3, 1.3, np.nan]
                   ], index=['a', 'b', 'c', 'd'], columns=['one', 'two', 'three', 'four'])
print(df)
print()
print(df['one'].unique(), df['two'].unique() )
df['one'].value_counts()

#end of chapter 1

    one  two  three  four
a  1.40  1.4    1.5   NaN
b  7.10 -4.5    1.5   1.4
c  1.40  NaN    0.5   NaN
d  0.75 -1.3    1.3   NaN

[1.4  7.1  0.75] [ 1.4 -4.5  nan -1.3]


1.40    2
7.10    1
0.75    1
Name: one, dtype: int64