# Pandas - Nasir Hussain - 03/01/2021

# The pandas DataFrame Object

A DataFrame represents a rectangular table of data and contains an ordered collection of columns, each of which can be a different value type (numeric, string,
boolean, etc.). 

<h4>Creating a DataFrame from scratch</h4>

In [1]:
# Build a DataFrame straight from a two-dimensional NumPy array.
import pandas as pd
import numpy as np

df = pd.DataFrame(
    data=np.array([
        [10, 11, 12, 13],
        [20, 21, 22, 23],
    ])
)
df

# No labels were supplied, so rows and columns both get default integer labels.

Unnamed: 0,0,1,2,3
0,10,11,12,13
1,20,21,22,23


In [2]:
# Build a DataFrame from a list of Series: each Series becomes one row.

df1 = pd.DataFrame(
    [pd.Series(np.arange(first, first + 5)) for first in (10, 15)]
)
df1
# No labels were supplied, so rows and columns both get default integer labels.

Unnamed: 0,0,1,2,3,4
0,10,11,12,13,14
1,15,16,17,18,19


In [3]:
# Build a DataFrame from a dictionary of Series:
# every key becomes a column name and every Series a column.
s1 = pd.Series(np.arange(1, 6))
s2 = pd.Series(np.arange(6, 11))

df2 = pd.DataFrame(dict(boys=s1, girls=s2))
df2

Unnamed: 0,boys,girls
0,1,6
1,2,7
2,3,8
3,4,9
4,5,10


In [4]:
# Build a DataFrame from a dictionary of equal-length lists (one list per column).
data = pd.DataFrame(
    {
        'name': ["Asad", "Saad", "Fahad", 'Ali'],
        'age': [23, 34, 23, 21],
        "grades": ["A", "B", "A", "B"],
    }
)
data

Unnamed: 0,name,age,grades
0,Asad,23,A
1,Saad,34,B
2,Fahad,23,A
3,Ali,21,B


In [5]:
# Label the columns explicitly; the rows keep the default integer index.
df3 = pd.DataFrame(
    data=np.array([[10, 11], [20, 21]]),
    columns=['apples', 'oranges'],
)
df3

Unnamed: 0,apples,oranges
0,10,11
1,20,21


In [6]:
# Build a DataFrame with explicit row labels (index) and column labels.

df4 = pd.DataFrame(
    data=np.array([[10, 11, 12, 13], [20, 21, 22, 23]]),
    columns=['Mon', 'Tue', 'Wed', 'Thu'],
    index=['apples', 'oranges'],
)
df4

Unnamed: 0,Mon,Tue,Wed,Thu
apples,10,11,12,13
oranges,20,21,22,23


In [7]:
# Demonstrate index alignment during creation: s1 and s2 (built in an
# earlier cell) carry the default labels, while s3 only has labels 1 and 2,
# so column c3 is missing for every other row label.

s3 = pd.Series(np.arange(12, 14), index=[1, 2]) # values 12 and 13 at labels 1 and 2

df5 = pd.DataFrame({'c1': s1, 'c2': s2, 'c3': s3})
df5

Unnamed: 0,c1,c2,c3
0,1,6,
1,2,7,12.0
2,3,8,13.0
3,4,9,
4,5,10,


In [8]:
# Examples of creating data frames

In [9]:
# Column-oriented source data: one equal-length list per column.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

# The dictionary keys become the column labels.
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [10]:
pd.DataFrame(frame, columns=['year', 'state', 'pop']) # not in place: a new DataFrame is returned and `frame` keeps its own column order

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [11]:
frame.year

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [12]:
# Bracket access is needed for this column: `frame.pop` resolves to the
# DataFrame.pop method rather than to the 'pop' column.
frame['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [13]:
# 'debt' is requested as a column but is not a key of `data`,
# so it is created and filled with missing values.
frame2 = pd.DataFrame(
    data,
    index=['one', 'two', 'three', 'four', 'five', 'six'],
    columns=['year', 'state', 'pop', 'debt'],
)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [14]:
# Broadcast the scalar 100 to every row of the existing 'debt' column.
# Bracket assignment is used instead of `frame2.debt = 100`: attribute
# assignment only updates a column that already exists and would otherwise
# silently set a plain Python attribute instead of creating a column.
frame2['debt'] = 100
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,100
two,2001,Ohio,1.7,100
three,2002,Ohio,3.6,100
four,2001,Nevada,2.4,100
five,2002,Nevada,2.9,100
six,2003,Nevada,3.2,100


In [15]:
frame2['debt']=np.arange(6)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0
two,2001,Ohio,1.7,1
three,2002,Ohio,3.6,2
four,2001,Nevada,2.4,3
five,2002,Nevada,2.9,4
six,2003,Nevada,3.2,5


In [16]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [17]:
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [18]:
# Add a derived column to the DataFrame: True where the state is 'Ohio', False otherwise.

frame2['eastern'] = frame2.state == 'Ohio' # the comparison is element-wise and yields a boolean Series
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [19]:
frame2['pop_greaterThan2']= frame2['pop'] > 2
frame2

Unnamed: 0,year,state,pop,debt,eastern,pop_greaterThan2
one,2000,Ohio,1.5,,True,False
two,2001,Ohio,1.7,-1.2,True,False
three,2002,Ohio,3.6,,True,True
four,2001,Nevada,2.4,-1.5,False,True
five,2002,Nevada,2.9,-1.7,False,True
six,2003,Nevada,3.2,,False,True


In [20]:
del frame2['eastern']

In [21]:
frame2

Unnamed: 0,year,state,pop,debt,pop_greaterThan2
one,2000,Ohio,1.5,,False
two,2001,Ohio,1.7,-1.2,False
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,True
five,2002,Nevada,2.9,-1.7,True
six,2003,Nevada,3.2,,True


In [22]:
# Marks table: one row per student, one column per course.
data = pd.DataFrame(
    {
        'name': ["Asad", "Saad", "Fahad", 'Ali'],
        'age': [23, 34, 23, 21],
        'AiforEveryONe': [89, 78, 90, 98],
        'python': [78, 89, 87, 89],
        'git': [90, 98, 87, 86],
        'numpy': [98, 87, 98, 99],
    }
)
data

Unnamed: 0,name,age,AiforEveryONe,python,git,numpy
0,Asad,23,89,78,90,98
1,Saad,34,78,89,98,87
2,Fahad,23,90,87,87,98
3,Ali,21,98,89,86,99


In [23]:
data["Total"] = data['AiforEveryONe']+data['python']+data['git']+data['numpy']
data

Unnamed: 0,name,age,AiforEveryONe,python,git,numpy,Total
0,Asad,23,89,78,90,98,355
1,Saad,34,78,89,98,87,352
2,Fahad,23,90,87,87,98,362
3,Ali,21,98,89,86,99,372


In [24]:
# Another common input shape is a dict of dicts: the outer keys name the
# columns and the inner keys name the rows.
pop = dict(
    Nevada={2001: 2.4, 2002: 2.9},
    Ohio={2000: 1.5, 2001: 1.7, 2002: 3.6},
)
df3 = pd.DataFrame(pop)
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


<b>If the nested dict is passed to the DataFrame, pandas will interpret the outer dict keys
as the columns and the inner keys as the row indices</b>

In [25]:
df3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [26]:
df3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [27]:
pop1 =pd.DataFrame(pop, index=[2001, 2002, 2003])
pop1

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [28]:
pdata = {'Ohio': df3['Ohio'][:-1],
         
        'Nevada': df3['Nevada'][:2]
        }

pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [29]:
df3.index.name = 'year'
df3.columns.name = 'state_names'

df3

state_names,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


# Index Objects

pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index:

In [30]:

# A Series whose labels 'a', 'b', 'c' are converted to an Index internally.
obj = pd.Series(range(3), index=list('abc'))
obj

a    0
b    1
c    2
dtype: int64

In [31]:
index =obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [32]:
index[1:]

Index(['b', 'c'], dtype='object')

In [33]:
# index[1] = 'd'  # indices are immutable

In [34]:
# Build an Index object directly from six labels. Index objects are
# immutable, so their elements cannot be reassigned afterwards.
labels = pd.Index(list("abcdef"))

In [35]:
labels

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [36]:
# labels[0]="z"

In [37]:
print(frame)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2


In [38]:
frame.index=labels
frame

Unnamed: 0,state,year,pop
a,Ohio,2000,1.5
b,Ohio,2001,1.7
c,Ohio,2002,3.6
d,Nevada,2001,2.4
e,Nevada,2002,2.9
f,Nevada,2003,3.2


In [39]:
frame.index   # the row labels are stored as an Index object

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')

In [40]:
frame.columns  # the column labels are also stored as an Index object

Index(['state', 'year', 'pop'], dtype='object')

# Essential Functionality

In [41]:
# Show frame2 as it stands before its 'debt' column is replaced below.
print("The frame is", end="\n\n")
print(frame2,end="\n\n")
# Overwrite 'debt' with 0..5. This runs after the print above, so the
# printed frame still shows the previous debt values.
frame2['debt']=np.arange(6)
# Row labels and column labels are both printed as Index objects.
print("The row indices are", end="\n\n")
print(frame2.index,end="\n\n")
print("The col indeces are",end="\n\n")
print(frame2.columns)

The frame is

       year   state  pop  debt  pop_greaterThan2
one    2000    Ohio  1.5   NaN             False
two    2001    Ohio  1.7  -1.2             False
three  2002    Ohio  3.6   NaN              True
four   2001  Nevada  2.4  -1.5              True
five   2002  Nevada  2.9  -1.7              True
six    2003  Nevada  3.2   NaN              True

The row indices are

Index(['one', 'two', 'three', 'four', 'five', 'six'], dtype='object')

The col indeces are

Index(['year', 'state', 'pop', 'debt', 'pop_greaterThan2'], dtype='object')


In [42]:
# reindex() with a plain list of labels works on the rows by default: the
# rows are rearranged into the requested order. 'seven' is not an existing
# row label, so it appears as a row of missing values.

reindex_frame = frame2.reindex(['five','two', 'three', 'six', 'four','one','seven'])


In [43]:
reindex_frame

Unnamed: 0,year,state,pop,debt,pop_greaterThan2
five,2002.0,Nevada,2.9,4.0,True
two,2001.0,Ohio,1.7,1.0,False
three,2002.0,Ohio,3.6,2.0,True
six,2003.0,Nevada,3.2,5.0,True
four,2001.0,Nevada,2.4,3.0,True
one,2000.0,Ohio,1.5,0.0,False
seven,,,,,


The columns can be reindexed with the columns keyword:

In [44]:
reindex_frame = frame2.reindex(columns=['pop','year','imports', 'debt', 'state',"exports" ])

In [45]:
reindex_frame

Unnamed: 0,pop,year,imports,debt,state,exports
one,1.5,2000,,0,Ohio,
two,1.7,2001,,1,Ohio,
three,3.6,2002,,2,Ohio,
four,2.4,2001,,3,Nevada,
five,2.9,2002,,4,Nevada,
six,3.2,2003,,5,Nevada,


# Dropping Entries from an Axis


In [46]:
reindex_frame

Unnamed: 0,pop,year,imports,debt,state,exports
one,1.5,2000,,0,Ohio,
two,1.7,2001,,1,Ohio,
three,3.6,2002,,2,Ohio,
four,2.4,2001,,3,Nevada,
five,2.9,2002,,4,Nevada,
six,3.2,2003,,5,Nevada,


In [47]:
# drop() returns a new DataFrame; reindex_frame itself is left unchanged.
# With no axis argument the labels are looked up in the row index (axis=0).
row_dropped_frame = reindex_frame.drop(['three','six'])
row_dropped_frame

Unnamed: 0,pop,year,imports,debt,state,exports
one,1.5,2000,,0,Ohio,
two,1.7,2001,,1,Ohio,
four,2.4,2001,,3,Nevada,
five,2.9,2002,,4,Nevada,


In [48]:
col_dropped_frame = reindex_frame.drop(['imports','exports'],axis=1)
col_dropped_frame

Unnamed: 0,pop,year,debt,state
one,1.5,2000,0,Ohio
two,1.7,2001,1,Ohio
three,3.6,2002,2,Ohio
four,2.4,2001,3,Nevada
five,2.9,2002,4,Nevada
six,3.2,2003,5,Nevada


# Another Example:

In [49]:
# Per-browser request statistics, labelled by browser name.
index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
df = pd.DataFrame(
    {
        'http_status': [200, 200, 404, 404, 301],
        'response_time': [0.04, 0.02, 0.07, 0.08, 1.0],
    },
    index=index,
)
df

Unnamed: 0,http_status,response_time
Firefox,200,0.04
Chrome,200,0.02
Safari,404,0.07
IE10,404,0.08
Konqueror,301,1.0


Create a new index and reindex the dataframe.
By default values in the new index that do not have corresponding records in the dataframe are assigned ``NaN``.



In [50]:
new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10','Chrome']
df.reindex(new_index)

Unnamed: 0,http_status,response_time
Safari,404.0,0.07
Iceweasel,,
Comodo Dragon,,
IE10,404.0,0.08
Chrome,200.0,0.02


We can fill in the missing values by passing a value to the keyword ``fill_value``. Because the index is not monotonically
increasing or decreasing, we cannot use arguments to the keyword
``method`` to fill the ``NaN`` values.

In [51]:
df.reindex(new_index, fill_value=0)

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,0,0.0
Comodo Dragon,0,0.0
IE10,404,0.08
Chrome,200,0.02


In [52]:
df.reindex(new_index, fill_value='missing')

Unnamed: 0,http_status,response_time
Safari,404,0.07
Iceweasel,missing,missing
Comodo Dragon,missing,missing
IE10,404,0.08
Chrome,200,0.02


In [53]:
#We can also reindex the columns.

df.reindex(columns=['http_status', 'user_agent'])

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konqueror,301,


In [54]:
# Or we can use "axis-style" keyword arguments
df.reindex(['http_status', 'user_agent'], axis="columns")

Unnamed: 0,http_status,user_agent
Firefox,200,
Chrome,200,
Safari,404,
IE10,404,
Konqueror,301,


To further illustrate the filling functionality in
``reindex``, we will create a dataframe with a
monotonically increasing index (for example, a sequence
of dates)

In [55]:
# Six consecutive daily timestamps starting on 2010-01-01.
date_index = pd.date_range('1/1/2010', freq='D', periods=6)

# One price per day; the third value is missing.
df2 = pd.DataFrame(
    data={"prices": [100, 101, np.nan, 100, 89, 88]},
    index=date_index,
)
df2

Unnamed: 0,prices
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0


Suppose we decide to expand the dataframe to cover a wider
date range.

In [56]:
date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
df2.reindex(date_index2)

Unnamed: 0,prices
2009-12-29,
2009-12-30,
2009-12-31,
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


The index entries that did not have a value in the original data frame
(for example, '2009-12-29') are by default filled with ``NaN``.
If desired, we can fill in the missing values using one of several
options.

For example, to propagate the next valid value backward to fill the ``NaN``
values, pass ``bfill`` as an argument to the ``method`` keyword.

In [57]:
df2.reindex(date_index2, method='bfill')

Unnamed: 0,prices
2009-12-29,100.0
2009-12-30,100.0
2009-12-31,100.0
2010-01-01,100.0
2010-01-02,101.0
2010-01-03,
2010-01-04,100.0
2010-01-05,89.0
2010-01-06,88.0
2010-01-07,


Please note that the ``NaN`` value present in the original dataframe
(at index value 2010-01-03) will not be filled by any of the
value propagation schemes. This is because filling while reindexing
does not look at dataframe values, but only compares the original and
desired indexes. If you do want to fill in the ``NaN`` values present
in the original dataframe, use the ``fillna()`` method.


In [58]:
# Frame containing one missing value in each column.
# np.nan is the supported spelling of NaN; the np.NaN alias was removed in
# NumPy 2.0 and raises AttributeError there.
naDf = pd.DataFrame({
    "ages": [2, 3, 4, np.nan, 6, 7],
    "gender": ["m", "f", "m", "f", np.nan, "m"],
})
naDf

Unnamed: 0,ages,gender
0,2.0,m
1,3.0,f
2,4.0,m
3,,f
4,6.0,
5,7.0,m


# Indexing, Selection, and Filtering

In [59]:
# 10x4 table holding 0..39 row by row: one row per state, one column per month.
data = pd.DataFrame(
    np.arange(40).reshape(10, 4),
    columns=['Jan', 'Feb', 'Mar', 'Apr'],
    index=[
        'Ohio', 'Colorado', 'Washington', 'Nebraska', 'Utah',
        'New York', 'California', 'Texas', 'Georgia', 'Alaska',
    ],
)
data

Unnamed: 0,Jan,Feb,Mar,Apr
Ohio,0,1,2,3
Colorado,4,5,6,7
Washington,8,9,10,11
Nebraska,12,13,14,15
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


In [60]:
# getting a single col
data['Jan']

Ohio           0
Colorado       4
Washington     8
Nebraska      12
Utah          16
New York      20
California    24
Texas         28
Georgia       32
Alaska        36
Name: Jan, dtype: int32

In [61]:
#getting multiple cols
data[['Jan', 'Apr']]

Unnamed: 0,Jan,Apr
Ohio,0,3
Colorado,4,7
Washington,8,11
Nebraska,12,15
Utah,16,19
New York,20,23
California,24,27
Texas,28,31
Georgia,32,35
Alaska,36,39


In [62]:
# An integer slice inside [] selects rows by position.
data[:2]  # positions 0 and 1, i.e. the first two rows

Unnamed: 0,Jan,Feb,Mar,Apr
Ohio,0,1,2,3
Colorado,4,5,6,7


In [63]:
# A label slice inside [] also selects rows.
data["Utah":"Texas"]  # from "Utah" through "Texas"; the end label is included

Unnamed: 0,Jan,Feb,Mar,Apr
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31


In [64]:
# data[2:6,0:2]   # Plain [] cannot slice rows and columns together by integer position;
                # use iloc for that instead

In [65]:
# data["Utah":"Texas", "Jan":'Mar']    # Plain [] cannot slice rows and columns together by label either;
                                 # use loc for that instead

We can select specific ranges of our data in both the row and column directions using either label or integer-based indexing.

<b>loc</b> is primarily label based indexing. Integers may be used but they are interpreted as a label.

<b>iloc</b> is primarily integer-position based indexing.
To select a subset of rows and columns from our DataFrame by position, we can use the <b>iloc</b> indexer.

In [66]:
# Use .loc to slice rows and columns by label; both end labels are included.

data.loc["Utah":"Texas", "Jan":'Mar']


Unnamed: 0,Jan,Feb,Mar
Utah,16,17,18
New York,20,21,22
California,24,25,26
Texas,28,29,30


In [67]:
# Use .iloc to slice rows and columns by integer position; the stop
# position is excluded (rows 2..5 and columns 0..1 here).

data.iloc[2:6,0:2] 

Unnamed: 0,Jan,Feb
Washington,8,9
Nebraska,12,13
Utah,16,17
New York,20,21


In [68]:
# rdiv is reversed division: every element x becomes 2 / x (not x / 2).
a = pd.DataFrame(data={"p": [2, 4, 6]})
a.rdiv(other=2)

Unnamed: 0,p
0,1.0
1,0.5
2,0.333333


In [69]:
# select all the data from the month of march that have value greater than 15

data['Mar'] > 15

Ohio          False
Colorado      False
Washington    False
Nebraska      False
Utah           True
New York       True
California     True
Texas          True
Georgia        True
Alaska         True
Name: Mar, dtype: bool

In [70]:
data[data['Mar'] > 20]

Unnamed: 0,Jan,Feb,Mar,Apr
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


In [71]:
data[data < 5] = 0
data

Unnamed: 0,Jan,Feb,Mar,Apr
Ohio,0,0,0,0
Colorado,0,5,6,7
Washington,8,9,10,11
Nebraska,12,13,14,15
Utah,16,17,18,19
New York,20,21,22,23
California,24,25,26,27
Texas,28,29,30,31
Georgia,32,33,34,35
Alaska,36,37,38,39


# Function Application and Mapping

In [72]:
# 4x3 frame of standard-normal draws with the sign removed (all values >= 0).
frame = pd.DataFrame(
    np.random.randn(4, 3),
    index=['Utah', 'Ohio', 'Texas', 'Oregon'],
    columns=list('bde'),
).abs()
frame

Unnamed: 0,b,d,e
Utah,0.785912,0.804265,0.990255
Ohio,0.085851,0.905635,0.864531
Texas,0.473205,0.143252,1.16516
Oregon,0.738731,1.68657,0.967961


In [73]:
def f(x):
    """Return the spread of x: its largest value minus its smallest."""
    return x.max() - x.min()

In [74]:
frame.apply(f,axis=0)  # axis=0: f receives each column in turn, giving one result per column (b, d, e)

b    0.700061
d    1.543318
e    0.300629
dtype: float64

In [75]:
frame.apply(f,axis=1)  # axis=1: f receives each row in turn, giving one result per row label

Utah      0.204344
Ohio      0.819784
Texas     1.021908
Oregon    0.947840
dtype: float64

# What are Lambda Functions?

A <b><em>lambda</em></b> function is a small function containing a single expression. Lambda functions can also act as anonymous functions where they don’t require any name. These are very helpful when we have to perform small tasks with less code.

Lambda functions are handy and used in many programming languages but we’ll be focusing on using them in Python here. In Python, lambda functions have the following syntax:

<code>lambda arguments: expression</code>


# IIFEs using lambda functions
IIFEs are <i>Immediately Invoked Function Expressions</i>. These are functions that are executed as soon as they are created. IIFEs require no explicit call to invoke the function. In Python, IIFEs can be created using the lambda function.

Here, we create an IIFE that returns the cube of a number:

In [76]:
(lambda x: x*x*x)(10)

1000

In [77]:
#awsome

# Application of Lambda Functions with Different Functions

I have created a small dataset that contains information about a family of 5 people with their id, names, ages, and income per month. I will be using this dataframe to show you how to apply lambda functions using different functions on a dataframe in Python.

In [78]:
# Family dataset: id, name, age and monthly income of five people.
df = pd.DataFrame(
    {
        'id': [1, 2, 3, 4, 5],
        'name': ['Asad', 'Saad', 'Numi', 'Roman', 'Maria'],
        'age': [20, 25, 15, 10, 30],
        'income': [4000, 7000, 200, 0, 10000],
    }
)
df

Unnamed: 0,id,name,age,income
0,1,Asad,20,4000
1,2,Saad,25,7000
2,3,Numi,15,200
3,4,Roman,10,0
4,5,Maria,30,10000


# Application of Lambda with Apply

Let’s say we have got an error in the age variable. We recorded ages with a difference of 3 years. So, to remove this error from the Pandas dataframe, we have to add three years to every person’s age. We can do this with the <b>apply() function</b> in Pandas.

The <b>apply() function</b> in Pandas calls the lambda function on every row or column of the dataframe (or on every element of a series) and returns the results as a new object; the original is not modified unless the result is assigned back:

In [79]:
# apply() on the whole frame: axis=1 hands each row to the lambda,
# which returns that row's age plus three.
df['age'] = df.apply(lambda row: row['age'] + 3, axis=1)

In [80]:
df

Unnamed: 0,id,name,age,income
0,1,Asad,23,4000
1,2,Saad,28,7000
2,3,Numi,18,200
3,4,Roman,13,0
4,5,Maria,33,10000


In [81]:
df['age']=df['age'].apply(lambda x: x+3) #on particular series

In [82]:
df

Unnamed: 0,id,name,age,income
0,1,Asad,26,4000
1,2,Saad,31,7000
2,3,Numi,21,200
3,4,Roman,16,0
4,5,Maria,36,10000


# Application of Lambda with Filter

Now, let’s see which of these people’s ages are above 18.

We can do this using the <b>filter() function</b>. 

The <b>filter() function</b> takes a lambda function and a Pandas series and applies the lambda function on the series and filters the data.



In [83]:
# Built-in filter() keeps only the ages for which the lambda returns True;
# list() materialises the result.
list(filter(lambda x: x>18, df['age']))

[26, 31, 21, 36]

# Application of Lambda with Map

You’ll be able to relate to the next statement. 🙂 It’s performance appraisal time and the income of all the employees gets increased by 20%. This means we have to increase the salary of each person by 20% in our Pandas dataframe.

We can do this using Python’s built-in map() function, which applies a function to every element of an iterable (here, the income column) and yields the results in the same order. It is very helpful when we have to replace every value of a series with a transformed value.

In [84]:
# Built-in map() applies the lambda to each income value. int() truncates
# the float result to an integer, and list() materialises the values so
# they can be assigned as a new column.
df['income2']=list(map(lambda x: int(x+x*0.2),df['income']))

In [85]:
df

Unnamed: 0,id,name,age,income,income2
0,1,Asad,26,4000,4800
1,2,Saad,31,7000,8400
2,3,Numi,21,200,240
3,4,Roman,16,0,0
4,5,Maria,36,10000,12000


In [86]:
df['income3'] = df['income'].apply(lambda x: x+x*.2)

In [87]:
df

Unnamed: 0,id,name,age,income,income2,income3
0,1,Asad,26,4000,4800,4800.0
1,2,Saad,31,7000,8400,8400.0
2,3,Numi,21,200,240,240.0
3,4,Roman,16,0,0,0.0
4,5,Maria,36,10000,12000,12000.0


# Conditional Statements using Lambda Functions

Lambda functions also support conditional statements, such as if..else. This makes lambda functions very powerful.

Let’s say in the family dataframe we have to categorize people into ‘Adult’ or ‘Child’. For this, we can simply apply the lambda function to our dataframe:

In [88]:
df['category']=df['age'].apply(lambda x: 'Adult' if x>=18 else 'Child')

In [89]:
df

Unnamed: 0,id,name,age,income,income2,income3,category
0,1,Asad,26,4000,4800,4800.0,Adult
1,2,Saad,31,7000,8400,8400.0,Adult
2,3,Numi,21,200,240,240.0,Adult
3,4,Roman,16,0,0,0.0,Child
4,5,Maria,36,10000,12000,12000.0,Adult


# Lambda with Reduce
Now, let’s see the total income of the family. To calculate this, we can use the reduce() function in Python. It applies a two-argument function cumulatively to the elements of a sequence, from left to right, so that the sequence is reduced to a single value. The reduce() function is defined in the ‘functools’ module.

For using the reduce() function, we have to import the functools module first:

In [90]:
import functools
# Fold the income column with addition, left to right, into a single total.
functools.reduce(lambda a,b: a+b,df['income'])

21200

# Summarizing and Computing Descriptive Statistics

In [91]:
#do your self

# Correlation and Covariance

study link:https://machinelearningmastery.com/how-to-use-correlation-to-understand-the-relationship-between-variables/

Example:
    

In [92]:
import pandas_datareader.data as web

In [93]:
# Dictionary comprehension: map each ticker symbol to whatever
# web.get_data_yahoo returns for it.
# NOTE(review): `web` is pandas_datareader.data (imported in the previous
# cell); presumably each call fetches data from Yahoo over the network —
# confirm the endpoint is still available before relying on this cell.

all_data = {ticker: web.get_data_yahoo(ticker) for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']}

In [94]:
all_data['AAPL']

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Adj Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-04-27,36.040001,35.827499,35.980000,35.947498,56985200.0,33.940182
2017-04-28,36.075001,35.817501,36.022499,35.912498,83441600.0,33.907143
2017-05-01,36.799999,36.240002,36.275002,36.645000,134411600.0,34.598740
2017-05-02,37.022499,36.709999,36.884998,36.877499,181408800.0,34.818264
2017-05-03,36.872501,36.067501,36.397499,36.764999,182788000.0,34.712036
...,...,...,...,...,...,...
2022-04-20,168.880005,166.100006,168.759995,167.229996,67929800.0,167.229996
2022-04-21,171.529999,165.910004,168.910004,166.419998,87227800.0,166.419998
2022-04-22,167.869995,161.500000,166.460007,161.789993,84775200.0,161.789993
2022-04-25,163.169998,158.460007,161.119995,162.880005,96046400.0,162.880005
