# DataFrame Operations

### Convert DataType

In [6]:
import pandas as pd
data = {
        'student_id' : [1, 2],
        'name' : ['Ava', 'Kate'],
        'age' : [6, 15],
        'grade' : [73.0, 87.0]
       }

# add two data frames
df = pd.DataFrame(data)
df['grade'] = df["grade"].astype(int) 
print('df = ', df)

df =     student_id  name  age  grade
0           1   Ava    6     73
1           2  Kate   15     87


### Handle NaN data
| Method | Description |
| --- | --- |
| dropna | Filter axis labels based on whether values for each label having missing data, with varying thresholds for how much missing data to tolerate. |
| fillna | Fill in missing data with some value or using an interpolation method such as 'ffill' or 'bfill'. |
| isnull | return boolean values indicating which values are missing N/A. |
| notnull | Negation of isnull |

#### Drop missing data

In [12]:
import pandas as pd
data = {
        'student_id' : [32, 217, 779, 849],
        'name' : ['Piper', 'Grace', None, None],
        'age' : [5, 19, 20, 14]
       }

# add two data frames
df = pd.DataFrame(data)
print("df = ")
print(df)
print("df.dropna(subset = ['name']")
print(df.dropna(subset = ['name']))

df = 
   student_id   name  age
0          32  Piper    5
1         217  Grace   19
2         779   None   20
3         849   None   14
df.dropna(subset = ['name']
   student_id   name  age
0          32  Piper    5
1         217  Grace   19


#### Fill missing data

In [17]:
import pandas as pd
data = {
        'name' : ['Wristwatch', 'WirelessEarbuds', 'GolfClubs', 'Printer'],
        'quantity' : [32, None, None, 849],
        'price' : [135, 821, 9319, 3051]
       }
# add two data frames
df = pd.DataFrame(data)
print("df = ")
print(df)
df['quantity'] = df['quantity'].fillna(0)
print("df['quantity'] = df.fillna(value=0)")
print(df)

df = 
              name  quantity  price
0       Wristwatch      32.0    135
1  WirelessEarbuds       NaN    821
2        GolfClubs       NaN   9319
3          Printer     849.0   3051
df['quantity'] = df.fillna(value=0)
              name  quantity  price
0       Wristwatch      32.0    135
1  WirelessEarbuds       0.0    821
2        GolfClubs       0.0   9319
3          Printer     849.0   3051


### Remove Duplicate

In [20]:
import pandas as pd
data = {
        'customer_id' : [1, 2, 3, 4, 5, 6],
        'name' : ['Ella', 'David', 'Zachary', 'Alice', 'Finn', 'Violet'],
        'email' : ['emily@example.com', 'michael@example.com', 'sarah@example.com', 'john@example.com', 'john@example.com', 'alice@example.com']
       }
df = pd.DataFrame(data)
df = df.drop_duplicates(['email'])
print(df)

   customer_id     name                email
0            1     Ella    emily@example.com
1            2    David  michael@example.com
2            3  Zachary    sarah@example.com
3            4    Alice     john@example.com
5            6   Violet    alice@example.com


## Function application and mapping

#### Example 1: Apply on every row

In [2]:
# Import pandas package 
import pandas as pd
 
# Function to add
def add(a, b, c):
    return a + b + c
 
def main():
     
    # create a dictionary with
    # three fields each
    data = {
            'A':[1, 2, 3], 
            'B':[4, 5, 6], 
            'C':[7, 8, 9] }
     
    # Convert the dictionary into DataFrame 
    df = pd.DataFrame(data)
    print("Original DataFrame:\n")
    print(df)
     
    df['add'] = df.apply(lambda row : add(row['A'],
                     row['B'], row['C']), axis = 1)
  
    print('\nAfter Applying Function: ')
    # printing the new dataframe
    print(df)
  
if __name__ == '__main__':
    main()

Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


#### Example 2 : Leverage NumPy

In [21]:
import pandas as pd
import numpy as np
  
def main():
     
    # create a dictionary with
    # five fields each
    data = {
            'A':[1, 2, 3], 
            'B':[4, 5, 6], 
            'C':[7, 8, 9] }
     
    # Convert the dictionary into DataFrame 
    df = pd.DataFrame(data)
    print("Original DataFrame:\n", df)
     
    # applying function to each row in the dataframe
    # and storing result in a new column
    df['add'] = df.apply(np.sum, axis = 1)
  
    print('\nAfter Applying Function: ')
    # printing the new dataframe
    print(df)
  
if __name__ == '__main__':
    main()

Original DataFrame:
    A  B  C
0  1  4  7
1  2  5  8
2  3  6  9

After Applying Function: 
   A  B  C  add
0  1  4  7   12
1  2  5  8   15
2  3  6  9   18


#### Example 3: Normalising Data

In [22]:
# Import pandas package 
import pandas as pd
 
def normalize(x, y):
    x_new = ((x - np.mean([x, y])) /
             (max(x, y) - min(x, y)))
     
    # print(x_new)
    return x_new
 
def main():
     
    # create a dictionary with three fields each
    data = {
        'X':[1, 2, 3], 
        'Y':[45, 65, 89] }
     
    # Convert the dictionary into DataFrame 
    df = pd.DataFrame(data)
    print("Original DataFrame:\n", df)
     
    df['X'] = df.apply(lambda row : normalize(row['X'],
                                  row['Y']), axis = 1)
  
    print('\nNormalized:')
    print(df)
  
if __name__ == '__main__':
    main()

Original DataFrame:
    X   Y
0  1  45
1  2  65
2  3  89

Normalized:
     X   Y
0 -0.5  45
1 -0.5  65
2 -0.5  89


#### Example 4: Generate range

In [None]:
import pandas as pd
import numpy as np
  
pd.options.mode.chained_assignment = None
 
# Function to generate range
def generate_range(n):
     
    # printing the range for eg:
    # input is 67 output is 60-70
    n = int(n)
     
    lower_limit = n//10 * 10
    upper_limit = lower_limit + 10
     
    return str(str(lower_limit) + '-' + str(upper_limit))
      
def replace(row):
    for i, item in enumerate(row):
         
        # updating the value of the row
        row[i] = generate_range(item)
    return row
          
  
def main():
    # create a dictionary with
    # three fields each
    data = {
            'A':[0, 2, 3], 
            'B':[4, 15, 6], 
            'C':[47, 8, 19] }
     
    # Convert the dictionary into DataFrame 
    df = pd.DataFrame(data)
  
    print('Before applying function: ')
    print(df)
      
    # applying function to each row in
    # dataframe and storing result in a new column
    df = df.apply(lambda row : replace(row))
      
  
    print('After Applying Function: ')
    # printing the new dataframe
    print(df)
  
if __name__ == '__main__':
    main()

## Applying Lambda Functions to Pandas
#### Dataframe.assign() on a Single Column
In this example, we will apply the lambda function Dataframe.assign() to a single column. The function is applied to the ‘Total_Marks’ column, and a new column ‘Percentage’ is formed with its help.

In [24]:
# importing pandas library
import pandas as pd

# creating and initializing a list
values= [['Rohan',455],['Elvish',250],['Deepak',495],
		['Soni',400],['Radhika',350],['Vansh',450]] 

# creating a pandas dataframe
df = pd.DataFrame(values,columns=['Name','Total_Marks'])

# Applying lambda function to find 
# percentage of 'Total_Marks' column 
# using df.assign()
df = df.assign(Percentage = lambda x: (x['Total_Marks'] /500 * 100))
print(df)


      Name  Total_Marks  Percentage
0    Rohan          455        91.0
1   Elvish          250        50.0
2   Deepak          495        99.0
3     Soni          400        80.0
4  Radhika          350        70.0
5    Vansh          450        90.0


#### Dataframe.assign() on Multiple Columns
In this example, we will apply the lambda function Dataframe.assign() to multiple columns. The lambda function is applied to 3 columns i.e., ‘Field_1’, ‘Field_2’, and ‘Field_3’.

In [25]:
# importing pandas library
import pandas as pd

# creating and initializing a nested list
values_list = [[15, 2.5, 100], [20, 4.5, 50], [25, 5.2, 80],
			[45, 5.8, 48], [40, 6.3, 70], [41, 6.4, 90],
			[51, 2.3, 111]]

# creating a pandas dataframe
df = pd.DataFrame(values_list, columns=['Field_1', 'Field_2', 'Field_3'])

# Applying lambda function to find
# the product of 3 columns using
# df.assign()
df = df.assign(Product=lambda x: (x['Field_1'] * x['Field_2'] * x['Field_3']))

# printing dataframe
print(df)

   Field_1  Field_2  Field_3  Product
0       15      2.5      100   3750.0
1       20      4.5       50   4500.0
2       25      5.2       80  10400.0
3       45      5.8       48  12528.0
4       40      6.3       70  17640.0
5       41      6.4       90  23616.0
6       51      2.3      111  13020.3


#### Dataframe.apply() on a Single Row
In this example, we will apply the lambda function Dataframe.apply() to single row. The lambda function is applied to a row starting with ‘d’ and hence square all values corresponding to it.

In [28]:
# importing pandas and numpy libraries
import pandas as pd
import numpy as np

# creating and initializing a nested list
values_list = [[15, 2.5, 100], [20, 4.5, 50], [25, 5.2, 80],
			[45, 5.8, 48], [40, 6.3, 70], [41, 6.4, 90], 
			[51, 2.3, 111]]

# creating a pandas dataframe
df = pd.DataFrame(values_list, columns=['Field_1', 'Field_2', 'Field_3'],
				index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])

print("Before applying function:")
print(df)
# Apply function numpy.square() to square
# the values of one row only i.e. row 
# with index name 'd'
df = df.apply(lambda x: np.square(x) if x.name == 'd' else x, axis=1)

print("\n\n")
print("After applying function:")
# printing dataframe
print(df)

Before applying function:
   Field_1  Field_2  Field_3
a       15      2.5      100
b       20      4.5       50
c       25      5.2       80
d       45      5.8       48
e       40      6.3       70
f       41      6.4       90
g       51      2.3      111



After applying function:
   Field_1  Field_2  Field_3
a     15.0     2.50    100.0
b     20.0     4.50     50.0
c     25.0     5.20     80.0
d   2025.0    33.64   2304.0
e     40.0     6.30     70.0
f     41.0     6.40     90.0
g     51.0     2.30    111.0


#### Dataframe.apply() on Multiple Rows
In this example, we will apply the lambda function to multiple rows using Dataframe.apply(). The lambda function is applied to 3 rows starting with ‘a’, ‘e’, and ‘g’.

In [29]:
# importing pandas and numpylibraries
import pandas as pd
import numpy as np

# creating and initializing a nested list
values_list = [[15, 2.5, 100], [20, 4.5, 50], [25, 5.2, 80],
			[45, 5.8, 48], [40, 6.3, 70], [41, 6.4, 90],
			[51, 2.3, 111]]

# creating a pandas dataframe
df = pd.DataFrame(values_list, columns=['Field_1', 'Field_2', 'Field_3'],
				index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])


# Apply function numpy.square() to square 
# the values of 3 rows only i.e. with row
# index name 'a', 'e' and 'g' only
df = df.apply(lambda x: np.square(x) if x.name in [
			'a', 'e', 'g'] else x, axis=1)

# printing dataframe
print(df)


   Field_1  Field_2  Field_3
a    225.0     6.25  10000.0
b     20.0     4.50     50.0
c     25.0     5.20     80.0
d     45.0     5.80     48.0
e   1600.0    39.69   4900.0
f     41.0     6.40     90.0
g   2601.0     5.29  12321.0


#### Lambda Function on Multiple Rows and Columns Simultaneously
In this example, we will apply the lambda function simultaneously to multiple columns and rows using dataframe.assign() and dataframe.apply().

In [30]:
# importing pandas and numpylibraries
import pandas as pd
import numpy as np

# creating and initializing a nested list
values_list = [[1.5, 2.5, 10.0], [2.0, 4.5, 5.0], [2.5, 5.2, 8.0],
			[4.5, 5.8, 4.8], [4.0, 6.3, 70], [4.1, 6.4, 9.0],
			[5.1, 2.3, 11.1]]

# creating a pandas dataframe
df = pd.DataFrame(values_list, columns=['Field_1', 'Field_2', 'Field_3'],
				index=['a', 'b', 'c', 'd', 'e', 'f', 'g'])


# Apply function numpy.square() to square 
# the values of 2 rows only i.e. with row
# index name 'b' and 'f' only
df = df.apply(lambda x: np.square(x) if x.name in ['b', 'f'] else x, axis=1)

# Applying lambda function to find product of 3 columns
# i.e 'Field_1', 'Field_2' and 'Field_3'
df = df.assign(Product=lambda x: (x['Field_1'] * x['Field_2'] * x['Field_3']))


# printing dataframe
print(df)

   Field_1  Field_2  Field_3     Product
a     1.50     2.50     10.0     37.5000
b     4.00    20.25     25.0   2025.0000
c     2.50     5.20      8.0    104.0000
d     4.50     5.80      4.8    125.2800
e     4.00     6.30     70.0   1764.0000
f    16.81    40.96     81.0  55771.5456
g     5.10     2.30     11.1    130.2030


## Arithmetic and data alignment

#### arithmetic methods
| Method | Description |
| --- | --- |
| add | Method for addition(+) |
| sub | Method for substraction(-) |
| div | Method for division(/) |
| mul | Method for multiplication(*) |

In [101]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

# add two data frames
df1 = DataFrame(np.arange(9).reshape(3,3), columns=list("bcd"), index=['Ohio', 'Texas', 'Colorado'])
df2 = DataFrame(np.arange(12).reshape(4,3), columns=list("bde"), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print('df1 = ', df1)
print('df2 = ', df2)
print('df1 + df2 = ', df1 + df2)
print("df1.add(df2, fill_value=0)", df1.add(df2, fill_value=0))


df1 =            b  c  d
Ohio      0  1  2
Texas     3  4  5
Colorado  6  7  8
df2 =          b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11
df1 + df2 =              b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN
df1.add(df2, fill_value=0)             b    c     d     e
Colorado  6.0  7.0   8.0   NaN
Ohio      3.0  1.0   6.0   5.0
Oregon    9.0  NaN  10.0  11.0
Texas     9.0  4.0  12.0   8.0
Utah      0.0  NaN   1.0   2.0


#### Observation 
1. If we want to do arithemtic operations on two data frame, we should use fill_value

In [115]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
def f(x) :
    return Series([x.min(), x.max()], index=['min', 'max'])
df = DataFrame(np.arange(12).reshape(4,3), columns=list("bde"), index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print("Before function")
print(df)
df2 = df.apply(f)
print("")
print("After function")
print(df2)
print("f = lambda x : x.max() - x.min()")
print("df.apply(f)")
df3 = df.apply(f)
print(df3)

Before function
        b   d   e
Utah    0   1   2
Ohio    3   4   5
Texas   6   7   8
Oregon  9  10  11

After function
     b   d   e
min  0   1   2
max  9  10  11
f = lambda x : x.max() - x.min()
df.apply(f)
     b   d   e
min  0   1   2
max  9  10  11


#### Sort and Ranking

In [136]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
frame = DataFrame({'b' : [4, 7, -3, -2], 'a' : [0, 1, 0, 1]}, index=[0,1,2,3])
print("frame =", frame)
frame = frame.sort_index(axis = 1)
print("frame.sort_values(by=['a', 'b'])", frame.sort_values(by=['a', 'b']))
frame['rank_a'] = frame['a'].rank()
frame['rank_b'] = frame['b'].rank(method='max')
print(frame)


frame =    b  a
0  4  0
1  7  1
2 -3  0
3 -2  1
frame.sort_values(by=['a', 'b'])    a  b
2  0 -3
0  0  4
3  1 -2
1  1  7
   a  b  rank_a  rank_b
0  0  4     1.5     3.0
1  1  7     3.5     4.0
2  0 -3     1.5     1.0
3  1 -2     3.5     2.0


### DataFrame Merge
DataFrame Merge is a database style join operation. It can happen on some key columns or on index.

#### Merge Function

Merge Function Argument
| Argument | Description |
| --- | --- |
| left | DataFrame to be merged on the left side |
| right | DataFrame to be merged on the right side |
| how | One of 'inner', 'outer', 'left', or 'right'. 'inner' by default |
| on | Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys |
| left_on | Columns in left DataFrame to use as join keys |
| right_on | Analogous to left_on for left DataFrame |
| left_index | Use row index in left as its join key (or keys, if a MultiIndex)
| right_index | Analogous to left_index
| Sort | Sort merged data lexicographically by join keys; True bu default. Disable to get better performance in some cases on large datasets. |
| suffixes | Tuple of string values to append to column names in case of overlap; default to ('_x', '_y'. For example if 'data' in both DataFrame objects, would appear as 'data_x' and 'data_y' in result. |
| copy | If False, avoid copying data into resulting data structure in some exceptional cases. By default always copies. |



In [12]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
df1 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data':range(7)})
df2 = DataFrame({'rkey': ['a', 'b', 'c', 'd'], 'data':range(4)})
df3 = pd.merge(df1, df2, left_on = 'lkey', right_on = 'rkey', how='outer');
print("df1 = ", df1)
print("df2 = ", df2)
print("df3 = ", df3)

df1 =    lkey  data
0    b     0
1    b     1
2    a     2
3    c     3
4    a     4
5    a     5
6    b     6
df2 =    rkey  data
0    a     0
1    b     1
2    c     2
3    d     3
df3 =    lkey  data_x rkey  data_y
0    b     0.0    b       1
1    b     1.0    b       1
2    b     6.0    b       1
3    a     2.0    a       0
4    a     4.0    a       0
5    a     5.0    a       0
6    c     3.0    c       2
7  NaN     NaN    d       3


#### Join on index

In [13]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
left = DataFrame([[1,2],[3,4],[5,6]], index=['a', 'c', 'e'], columns=['Ohio', 'Nevada'])
right = DataFrame([[7,8],[9,10],[11,12],[13,14]], index=['b', 'c', 'd', 'e'], columns=['Missouri', 'Alabama'])
print("left = \n", left)
print("right = \n", right)
left.join(right, how='outer')

left = 
    Ohio  Nevada
a     1       2
c     3       4
e     5       6
right = 
    Missouri  Alabama
b         7        8
c         9       10
d        11       12
e        13       14


Unnamed: 0,Ohio,Nevada,Missouri,Alabama
a,1.0,2.0,,
b,,,7.0,8.0
c,3.0,4.0,9.0,10.0
d,,,11.0,12.0
e,5.0,6.0,13.0,14.0


#### Concatenating Along and Axis

#### concat function argument
| Argument | Description |
| --- | --- |
| objs | List or dict of pandas objects to be concatenated. The only required argument. |
| axis | Axis to concatenate along; default to 0 |
| join | One of 'inner', 'outer', defaulting to 'outer'; whether to intersection (inner) or union (outer) together indexes along the other axes |
| join_axes | Specific indexes to use for the other n - 1 axes instead of performing union / intersection logic. |
| keys | Values to associate with objects being concatenated, forming a hierachical index along the concatenation axis. Can either be a list or arrat of arbitrary values, an array of tuples. or a list of arrays (if multiple level arrays passed in levels) |
| levels | Specific indexes to use as hierarchical index level or levels if keys passed. |
| names | Names for created hierarchical levels if keys and / or levels passed. |
| verify_integrity | Check new axis in concatenated object for duplicates and raise exception if so. By default (False) allows duplicates. |
| ignore_index | Do not preserve indexes along concatenation axis, instead producing a new range (total_length) index |

In [23]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
from numpy import NaN
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['g', 'f'])
print("pd.concat([s1, s2, s3], ignore_index=True)")
s4 = pd.concat([s1, s2, s3], ignore_index=True)
print(s4)
print("pd.concat([s1, s2, s3], axis=1)")
s5 = pd.concat([s1, s2, s3], axis=1)
print(s5)
print("pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])")
s6 = pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
print(s6)

pd.concat([s1, s2, s3], ignore_index=True)
0    0
1    1
2    2
3    3
4    4
5    5
6    6
dtype: int64
pd.concat([s1, s2, s3], axis=1)
     0    1    2
a  0.0  NaN  NaN
b  1.0  NaN  NaN
c  NaN  2.0  NaN
d  NaN  3.0  NaN
e  NaN  4.0  NaN
g  NaN  NaN  5.0
f  NaN  NaN  6.0
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
   one  two  three
a  0.0  NaN    NaN
b  1.0  NaN    NaN
c  NaN  2.0    NaN
d  NaN  3.0    NaN
e  NaN  4.0    NaN
g  NaN  NaN    5.0
f  NaN  NaN    6.0


#### Combination Data with Overlap

In [24]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['a', 'b', 'c', 'd', 'e', 'f'])
b = Series(np.arange(len(a), dtype=np.float64))
c = np.where(pd.notnull(a), a, b)
print("np.where(pd.notnull(a), a, b)")
print(c)

np.where(pd.notnull(a), a, b)
[0.  2.5 2.  3.5 4.5 5. ]


### Reshaping and Pivoting

#### stack() and unstack()

In [30]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(6).reshape((2,3)), 
                index=pd.Index(['Ohio', 'Colorado'], name='state'), 
                columns=pd.Index(['one', 'two', 'three'], name='number'))
print(data)
print("data.stack()")
df1 = data.stack()
print(df1)
# unstack inner level
print("df1.unstack()")
df2 = df1.unstack()
print(df2)
# unstack outer level
print("df1.unstack(0)")
df3 = df1.unstack(0)
print(df3)

number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
data.stack()
state     number
Ohio      one       0
          two       1
          three     2
Colorado  one       3
          two       4
          three     5
dtype: int32
df1.unstack()
number    one  two  three
state                    
Ohio        0    1      2
Colorado    3    4      5
df1.unstack(0)
state   Ohio  Colorado
number                
one        0         3
two        1         4
three      2         5


#### Pivoting "long" to "wide" format

In [35]:
from pandas import Series, DataFrame
import pandas as pd
data = {'sequence' : [1, 1, 1, 2, 2, 2, 3, 3, 3],
        'item' : ['gdp', 'infl', 'unemp', 'gdp', 'infl', 'unemp', 'gdp', 'infl', 'unemp'],
        'values' : [2710, 0.0, 5.8, 2778, 2.34, 5.10, 2775, 2.74, 5.3]
        }
df = DataFrame(data)
print(df)
pivoted = df.pivot(index='sequence', columns='item', values='values')
print(pivoted)

   sequence   item   values
0         1    gdp  2710.00
1         1   infl     0.00
2         1  unemp     5.80
3         2    gdp  2778.00
4         2   infl     2.34
5         2  unemp     5.10
6         3    gdp  2775.00
7         3   infl     2.74
8         3  unemp     5.30
item         gdp  infl  unemp
sequence                     
1         2710.0  0.00    5.8
2         2778.0  2.34    5.1
3         2775.0  2.74    5.3


### Data Transformation

#### Mapping data

In [53]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame({'food' : ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef', 
                            'Bacon', 'pastrami', 'honey ham', 'noval lox'], 
                  'ounces' : [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})

meat_to_animal = {
    'bacon': 'pig',
    'pulled pork' : 'pig',
    'pastrami' : 'cow',
    'corned beef' : 'cow',
    'honey ham' : 'pig',
    'noval lox' : 'salmon'
    }
data['animal'] = data['food'].map(lambda x : meat_to_animal[x.lower()])
data['animal1'] = data['food'].map(str.lower).map(meat_to_animal)
print(data)

          food  ounces  animal animal1
0        bacon     4.0     pig     pig
1  pulled pork     3.0     pig     pig
2        bacon    12.0     pig     pig
3     Pastrami     6.0     cow     cow
4  corned beef     7.5     cow     cow
5        Bacon     8.0     pig     pig
6     pastrami     3.0     cow     cow
7    honey ham     5.0     pig     pig
8    noval lox     6.0  salmon  salmon


#### Replacing values

In [59]:
from pandas import Series
import numpy as np
data = Series([1., -999., 2, -999., -1000, 3])
print(data)
print("data.replace([-999, -1000], [0, 0])")
df1 = data.replace([-999, -1000], [0, 0])
print(df1)
print("np.where(data < 0, 0, data)")
df2 = Series(np.where(data < 0, 0, data))
print(df2)

0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64
data.replace([-999, -1000], [0, 0])
0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64
np.where(data < 0, 0, data)
0    1.0
1    0.0
2    2.0
3    0.0
4    0.0
5    3.0
dtype: float64


#### Rename Axis Indexes

In [64]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(12).reshape((3,4)), 
                 index=['Ohio', 'Colorado', 'New York'],
                 columns=['one', 'two', 'three', 'four'])
data.index = data.index.map(str.upper)
data.columns = data.columns.map(str.title)
print(data) 

          One  Two  Three  Four
OHIO        0    1      2     3
COLORADO    4    5      6     7
NEW YORK    8    9     10    11


#### Discretization and Binning

In [66]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]
groups = ["Youth", "YoungAdult", "MiddleAged", "Senior"]
df1 = pd.cut(ages, bins, labels=groups, right=False) 
print(df1)

['Youth', 'Youth', 'YoungAdult', 'YoungAdult', 'Youth', ..., 'YoungAdult', 'Senior', 'MiddleAged', 'MiddleAged', 'YoungAdult']
Length: 12
Categories (4, object): ['Youth' < 'YoungAdult' < 'MiddleAged' < 'Senior']


#### Detect and Filter

In [96]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
data = DataFrame(np.arange(18).reshape((6,3)))
print(data[data[2]>10])

    0   1   2
3   9  10  11
4  12  13  14
5  15  16  17


#### Computing Indicator / Dummy Variables

In [98]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
df = DataFrame({'key' : ['b', 'b', 'a', 'c', 'a', 'b'],
                  'data1' :range(6)})
pd.get_dummies(df['key'])

Unnamed: 0,a,b,c
0,False,True,False
1,False,True,False
2,True,False,False
3,False,False,True
4,True,False,False
5,False,True,False
