# Pandas - Anees Ahmad - Review - 2021/01/16

# Deleting data (rows or columns from a DataFrame)

In [1]:
import numpy as np
import pandas as pd
# Build a 4x4 demo frame holding 0..15, with state names as row labels.
state_labels = ['Ohio', 'Colorado', 'Utah', 'New York']
column_labels = ['one', 'two', 'three', 'four']
data_df = pd.DataFrame(np.arange(16).reshape((4, 4)),
                       index=state_labels,
                       columns=column_labels)
print(data_df)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [2]:
import numpy as np
import pandas as pd

# Rebuild the 4x4 demo frame so this cell starts from the full table.
data_df = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
print(data_df, "\n")

# drop() removes row labels by default; pass axis=1 (or the equivalent
# axis='columns') to remove a column instead.  It returns a new frame,
# which is bound back to data_df here.
data_df = data_df.drop('two', axis='columns')
data_df

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15 



Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [3]:
# inplace=True makes drop() modify data_df itself, so nothing is reassigned.
# Because the column is gone afterwards, running this cell a second time
# raises KeyError.
data_df.drop(columns='one', inplace=True)
print(data_df)

# Exercise: how would you remove a row instead, for example 'Colorado'?

          three  four
Ohio          2     3
Colorado      6     7
Utah         10    11
New York     14    15


In [4]:
# Rows are dropped by label along axis 0; index= names that axis explicitly.
data_df.drop(index='Ohio', inplace=True)
print(data_df)

          three  four
Colorado      6     7
Utah         10    11
New York     14    15


# Indexing, Selection, and Filtering

In [5]:
import numpy as np
import pandas as pd

# Fresh copy of the 4x4 demo frame for the indexing examples.
values = np.arange(16).reshape((4, 4))
data_df = pd.DataFrame(values,
                       columns=['one', 'two', 'three', 'four'],
                       index=['Ohio', 'Colorado', 'Utah', 'New York'])

print(data_df, "\n")

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15 



In [6]:
# Indexing with a list of column names returns those columns as a DataFrame.
wanted_columns = ["one", "three"]
df2 = data_df[wanted_columns]
print(df2)


          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [7]:
# A plain slice on the frame selects rows by position, like slicing a NumPy array.
rows_from_third = data_df[2:]
print(rows_from_third)

          one  two  three  four
Utah        8    9     10    11
New York   12   13     14    15


In [8]:
# Dictionary-style access: indexing with one column name returns a Series.
one_column = data_df["one"]
print(one_column)

Ohio         0
Colorado     4
Utah         8
New York    12
Name: one, dtype: int32


In [9]:
# Attribute-style column access followed by a positional slice: narrows the
# data to one column and then to the rows from position 2 onward.
one_tail = data_df.one[2:]
print(one_tail)

Utah         8
New York    12
Name: one, dtype: int32


In [10]:
# Conditional selection: keep only the entries of column 'three' above 5.
above_five = data_df['three'] > 5
print(data_df.three[above_five])
print(data_df, "\n")

Colorado     6
Utah        10
New York    14
Name: three, dtype: int32
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15 



In [11]:
# Comparing a column with a scalar yields a boolean Series (one flag per row).
three_column = data_df["three"]
f2 = three_column > 5
print(f2)
# NOTE(review): this prints df2 (the 'one'/'three' selection built earlier),
# not a frame filtered by f2 -- confirm that is what was intended.
print(df2)

Ohio        False
Colorado     True
Utah         True
New York     True
Name: three, dtype: bool
          one  three
Ohio        0      2
Colorado    4      6
Utah        8     10
New York   12     14


In [12]:
# A boolean Series used as the indexer keeps the rows where it is True.
rows_above_five = data_df[data_df["three"] > 5]
print(rows_above_five)

          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


# Selection with loc and iloc

In [13]:
import numpy as np
import pandas as pd

# Fresh copy of the 4x4 demo frame for the loc / iloc examples.
data_df = pd.DataFrame(
    data=np.arange(16).reshape((4, 4)),
    columns=['one', 'two', 'three', 'four'],
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
)

print(data_df, "\n")

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15 



In [14]:
# .loc is label-based: give the row labels first, then the column labels.
# Remember: selecting several rows or columns requires a list of labels.
row_labels = ['Colorado', 'Ohio']
col_labels = ['two', 'three']
print(data_df.loc[row_labels, col_labels])


          two  three
Colorado    5      6
Ohio        1      2


In [15]:
# .iloc is position-based: integer positions are used instead of labels.
# Rows from position 2 onward; columns at positions 3, 0 and 1, in that order.
column_positions = [3, 0, 1]
print(data_df.iloc[2:, column_positions])

          four  one  two
Utah        11    8    9
New York    15   12   13


In [16]:
# A bare ':' slice selects every row, so the whole frame is printed.
everything = data_df.iloc[:]
print(everything)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [17]:
# First three rows and first three columns (the stop position is excluded).
top_left = data_df.iloc[:3, :3]
print(top_left)

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10


In [18]:
# Every row, but only the first three columns.
first_three_columns = data_df.iloc[:, :3]
print(first_three_columns)

          one  two  three
Ohio        0    1      2
Colorado    4    5      6
Utah        8    9     10
New York   12   13     14


# Arithmetic and Data Alignment

In [19]:
# list('bcd') expands to ['b', 'c', 'd'] -- a compact way to write the labels.
df1 = pd.DataFrame(
    np.arange(9.).reshape((3, 3)),
    index=['Ohio', 'Texas', 'Colorado'],
    columns=list('bcd'),
)
df2 = pd.DataFrame(
    np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)

print(df1)
print(df2)
print()

# Adding two frames aligns them on both row and column labels; a label that
# exists in only one of the operands produces NaN in the result.
df3 = df1 + df2
print(df3)

# Exercise: fill every NaN value in df3 with a number of your choice.

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0

            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


# Arithmetic methods with fill values

In [20]:
# Two frames of different shape (3x4 and 4x5) that share column labels a-d.
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), columns=list('abcde'))
for each_frame in (df1, df2):
    print(each_frame)

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   6.0   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [21]:
# Plant a missing value at row label 1, column 'b' for the fill_value demo.
target_row, target_column = 1, 'b'
df2.loc[target_row, target_column] = np.nan
print(df2)

      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0


In [22]:
# Plain '+' aligns on labels and yields NaN wherever either operand has no
# value for a cell, or already holds NaN there.
print()
print("direct + operation without fill_value")
df3 = df1 + df2
print(df3)


direct + operation without fill_value
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN


In [23]:
# The add() method accepts fill_value: a cell missing from one of the two
# frames is treated as 0 before the addition, instead of producing NaN.
print("addition using a method with replacing Nan with 0")
zero_for_missing = 0
df3 = df1.add(df2, fill_value=zero_for_missing)
print(df3)

addition using a method with replacing Nan with 0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0


# Operations between DataFrame and Series

In [24]:
import numpy as np
import pandas as pd
# 4x3 float frame (0.0 .. 11.0) used for the DataFrame-vs-Series examples.
frame = pd.DataFrame(
    data=np.arange(12.).reshape((4, 3)),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
)
print(frame)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0


In [25]:
# iloc syntax:
#   iloc[start_row_pos:end_row_pos, start_col_pos:end_col_pos]
# A single integer picks one row, returned as a Series whose index is the
# frame's column labels.
first_row_position = 0
series = frame.iloc[first_row_position]
print(series)
print(series.values)
print(series.index)

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
[0. 1. 2.]
Index(['b', 'd', 'e'], dtype='object')


In [26]:
# The Series index is matched against the frame's column labels and the
# subtraction is repeated for every row.
row_differences = frame - series
print(row_differences)

          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [27]:
# Repeat the subtraction, this time with a hand-built Series in place of a
# row taken from the frame.
print("---- using new series")
series = pd.Series([1, 2, 3], index=list('bde'))
print(series.index, series.values)

# The DataFrame's column names are matched with the Series index, because
# this is a row-wise broadcasting operation.
print(frame - series)

---- using new series
Index(['b', 'd', 'e'], dtype='object') [1 2 3]
          b    d    e
Utah   -1.0 -1.0 -1.0
Ohio    2.0  2.0  2.0
Texas   5.0  5.0  5.0
Oregon  8.0  8.0  8.0


# Function Application and Mapping

In [28]:
import numpy as np
import pandas as pd
# Seed the generator so that "Restart Kernel -> Run All" reproduces the same
# random frame, and therefore the same results in the cells that use it.
# NOTE: outputs saved before the seed was introduced will differ from a
# fresh run until the notebook is re-executed.
SEED = 42
rng = np.random.default_rng(SEED)

# 4x3 frame of standard-normal samples: rows are states, columns b / d / e.
frame = pd.DataFrame(rng.standard_normal((4, 3)),
                     columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)


               b         d         e
Utah    0.796416  0.174791 -1.489431
Ohio    0.745064  0.600309 -0.696192
Texas   0.572512  1.699900 -0.320685
Oregon  1.209785 -0.311116  0.429570


In [29]:
# NumPy ufuncs work element-wise on a DataFrame and keep its labels.
absolute_frame = np.abs(frame)
print(absolute_frame)

               b         d         e
Utah    0.796416  0.174791  1.489431
Ohio    0.745064  0.600309  0.696192
Texas   0.572512  1.699900  0.320685
Oregon  1.209785  0.311116  0.429570


In [30]:
# Smallest value, largest value, and their difference for column 'd'.
d_column = frame["d"]
d_low = d_column.min()
d_high = d_column.max()
print(d_low)
print(d_high)
print(d_high - d_low)

-0.31111571690786577
1.6999002331490787
2.0110159500569447


In [31]:
def f(x):
    """Return the spread of ``x``: its maximum minus its minimum."""
    return x.max() - x.min()
# apply() hands each column to f (axis=0 is the default), so the result is a
# Series indexed by column label.
df = frame.apply(f, axis=0)
print(df, type(df))

b    0.637273
d    2.011016
e    1.919000
dtype: float64 <class 'pandas.core.series.Series'>


In [32]:
# axis='columns' (same as axis=1) hands each row to f instead, giving one
# spread value per row label.
df = frame.apply(f, axis='columns')
print(df)

Utah      2.285847
Ohio      1.441256
Texas     2.020585
Oregon    1.520900
dtype: float64


In [33]:
def min_max(x):
    """Return the spread (max minus min) of ``x`` as a one-element Series.

    The single entry is labelled 'min-max'; when the function is used with
    ``DataFrame.apply`` that label becomes the row label of the result.
    """
    spread = x.max() - x.min()
    return pd.Series([spread], index=['min-max'])

# Because min_max returns a Series, apply() assembles the per-column results
# into a DataFrame rather than a Series.
df = frame.apply(min_max, axis=0)
print(df, type(df))

                b         d      e
min-max  0.637273  2.011016  1.919 <class 'pandas.core.frame.DataFrame'>


In [34]:
# Sorting and Ranking
import numpy as np
import pandas as pd
# Two rows and four columns, deliberately given out-of-order labels.
frame = pd.DataFrame(
    np.arange(8).reshape((2, 4)),
    columns=['d', 'a', 'b', 'c'],
    index=['three', 'one'],
)
print(frame)
print()

# axis=1 orders the column labels.
sorted_by_column_label = frame.sort_index(axis=1, ascending=True)
print(sorted_by_column_label)
print()

# Defaults of sort_index(): axis=0 and ascending=True, i.e. the row labels
# are put in ascending order.
sorted_by_row_label = frame.sort_index()
print(sorted_by_row_label)


       d  a  b  c
three  0  1  2  3
one    4  5  6  7

       a  b  c  d
three  1  2  3  0
one    5  6  7  4

       d  a  b  c
one    4  5  6  7
three  0  1  2  3


In [35]:
# Sort the rows by the values held in column 'b'.
print(frame.sort_values(by='b'))

# rank() ranks the values within each column by default.
print(frame.rank(ascending=False, method='max'))
print(frame.rank(ascending=True, method='min'))
# axis='columns' ranks the values across each row instead.
print(frame.rank(axis='columns'))

# Tie-breaking options for rank(method=...) -- check the book for details.
# (Kept as comments: a bare string at the end of a cell is echoed as output.)
#   'average'  default: assign the average rank to each entry in the equal group
#   'min'      use the minimum rank for the whole group
#   'max'      use the maximum rank for the whole group
#   'first'    assign ranks in the order the values appear in the data
#   'dense'    like method='min', but ranks always increase by 1 in between
#              groups rather than the number of equal elements in a group

       d  a  b  c
three  0  1  2  3
one    4  5  6  7
         d    a    b    c
three  2.0  2.0  2.0  2.0
one    1.0  1.0  1.0  1.0
         d    a    b    c
three  1.0  1.0  1.0  1.0
one    2.0  2.0  2.0  2.0
         d    a    b    c
three  1.0  2.0  3.0  4.0
one    1.0  2.0  3.0  4.0


"\n'average' Default: assign the average rank to each entry in the equal group\n'min'\n'max'\n'first'\n'dense'\nUse the minimum rank for the whole group\nUse the maximum rank for the whole group\nAssign ranks in the order the values appear in the data\nLike method='min' , but ranks always increase by 1 in between groups rather than the number of equal\nelements in a group\n"

In [36]:
#Summarizing and Computing Descriptive Statistics

# Small frame with missing values to show how reductions treat NaN.
df = pd.DataFrame(
    [[1.4, np.nan],
     [7.1, -4.5],
     [np.nan, np.nan],
     [0.75, -1.3]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two'],
)
print(df)
print()

# Column totals; NaN entries are skipped by default.
column_totals = df.sum()
print(column_totals)
print()

# Row totals: axis='columns' adds across each row.
row_totals = df.sum(axis='columns')
print(row_totals)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

one    9.25
two   -5.80
dtype: float64

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [37]:
# skipna=False: a row that contains any NaN gets NaN as its mean.
x = df.mean(axis='columns', skipna=False)
print(df)
print()
print(x)

    one  two
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [38]:
# unique values
# Frame with repeated and missing entries for exploring unique values.
df = pd.DataFrame(
    [[1.4, 1.4, 1.5, np.nan],
     [7.1, -4.5, 1.5, 1.4],
     [1.4, np.nan, 0.5, np.nan],
     [0.75, -1.3, 1.3, np.nan]],
    index=['a', 'b', 'c', 'd'],
    columns=['one', 'two', 'three', 'four'],
)
print(df)
print()

# unique() returns the distinct values of a column as an array.
unique_one = df['one'].unique()
unique_two = df['two'].unique()
print(unique_one, unique_two)

# value_counts() tallies how many times each value occurs in the column.
df['one'].value_counts()

# end of chapter 1

    one  two  three  four
a  1.40  1.4    1.5   NaN
b  7.10 -4.5    1.5   1.4
c  1.40  NaN    0.5   NaN
d  0.75 -1.3    1.3   NaN

[1.4  7.1  0.75] [ 1.4 -4.5  nan -1.3]


1.40    2
7.10    1
0.75    1
Name: one, dtype: int64