## Pandas -used for data manipulation and analysis

In [1]:
import numpy as np 
import pandas as pd
# Creating a MultiIndex DataFrame
df = pd.DataFrame(np.random.randn(6, 4), index=[['A', 'A', 'B', 'B', 'C', 'C'], [1, 2, 1, 2, 1, 2]], columns=['X', 'Y', 'Z', 'W'])


In [2]:
df

Unnamed: 0,Unnamed: 1,X,Y,Z,W
A,1,0.118041,1.654963,-0.55986,0.862181
A,2,-0.876498,-0.761774,-0.352031,-2.502224
B,1,-1.602282,0.068541,-0.221547,-0.036979
B,2,-0.100374,1.005556,-0.264791,-0.980054
C,1,-2.019244,2.330996,-1.857757,-0.39467
C,2,0.214406,0.350458,-1.275875,1.222746


In [4]:
# Using vectorized operations
df['new_column'] = df['X'] + df['Y']
df['new_column'] 


A  1    1.773004
   2   -1.638272
B  1   -1.533740
   2    0.905182
C  1    0.311753
   2    0.564863
Name: new_column, dtype: float64

In [6]:
# Boolean indexing
df[df['X'] > 0] #creates a dataframe where values are > than X
df



Unnamed: 0,Unnamed: 1,X,Y,Z,W,new_column
A,1,0.118041,1.654963,-0.55986,0.862181,1.773004
A,2,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
B,1,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
B,2,-0.100374,1.005556,-0.264791,-0.980054,0.905182
C,1,-2.019244,2.330996,-1.857757,-0.39467,0.311753
C,2,0.214406,0.350458,-1.275875,1.222746,0.564863


In [7]:
# Using .loc for label-based indexing
df.loc['A'] #assumes that 'A' is in the index of the DataFrame.
df



Unnamed: 0,Unnamed: 1,X,Y,Z,W,new_column
A,1,0.118041,1.654963,-0.55986,0.862181,1.773004
A,2,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
B,1,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
B,2,-0.100374,1.005556,-0.264791,-0.980054,0.905182
C,1,-2.019244,2.330996,-1.857757,-0.39467,0.311753
C,2,0.214406,0.350458,-1.275875,1.222746,0.564863


In [8]:
# Using .iloc for positional indexing
df.iloc[0:2, 1:3]
df

Unnamed: 0,Unnamed: 1,X,Y,Z,W,new_column
A,1,0.118041,1.654963,-0.55986,0.862181,1.773004
A,2,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
B,1,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
B,2,-0.100374,1.005556,-0.264791,-0.980054,0.905182
C,1,-2.019244,2.330996,-1.857757,-0.39467,0.311753
C,2,0.214406,0.350458,-1.275875,1.222746,0.564863


In [9]:
# Boolean indexing
df[df['X'] > 0]

# Using .loc for label-based indexing
df.loc['A']

# Using .iloc for positional indexing
df.iloc[0:2, 1:3]


Unnamed: 0,Unnamed: 1,Y,Z
A,1,1.654963,-0.55986
A,2,-0.761774,-0.352031


In [10]:
# Grouping by a column and calculating mean
df.groupby('X').mean()

# Grouping by multiple columns
#df.groupby(['X', 'Y']).sum()


Unnamed: 0_level_0,Y,Z,W,new_column
X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-2.019244,2.330996,-1.857757,-0.39467,0.311753
-1.602282,0.068541,-0.221547,-0.036979,-1.53374
-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
-0.100374,1.005556,-0.264791,-0.980054,0.905182
0.118041,1.654963,-0.55986,0.862181,1.773004
0.214406,0.350458,-1.275875,1.222746,0.564863


In [11]:
# Grouping by a column and calculating mean
#df.groupby('X').mean()

# Grouping by multiple columns
df.groupby(['X', 'Y']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,Z,W,new_column
X,Y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-2.019244,2.330996,-1.857757,-0.39467,0.311753
-1.602282,0.068541,-0.221547,-0.036979,-1.53374
-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
-0.100374,1.005556,-0.264791,-0.980054,0.905182
0.118041,1.654963,-0.55986,0.862181,1.773004
0.214406,0.350458,-1.275875,1.222746,0.564863


In [12]:
# Grouping by a column and calculating mean
df.groupby('X').mean()

# Grouping by multiple columns
df.groupby(['X', 'Y']).sum()


Unnamed: 0_level_0,Unnamed: 1_level_0,Z,W,new_column
X,Y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-2.019244,2.330996,-1.857757,-0.39467,0.311753
-1.602282,0.068541,-0.221547,-0.036979,-1.53374
-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
-0.100374,1.005556,-0.264791,-0.980054,0.905182
0.118041,1.654963,-0.55986,0.862181,1.773004
0.214406,0.350458,-1.275875,1.222746,0.564863


## Handling missing data

In [14]:
# Dropping NaN values
df.dropna()

# Filling NaN values with a specific value
df.fillna(0)


Unnamed: 0,Unnamed: 1,X,Y,Z,W,new_column
A,1,0.118041,1.654963,-0.55986,0.862181,1.773004
A,2,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
B,1,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
B,2,-0.100374,1.005556,-0.264791,-0.980054,0.905182
C,1,-2.019244,2.330996,-1.857757,-0.39467,0.311753
C,2,0.214406,0.350458,-1.275875,1.222746,0.564863


In [16]:
# Melting a DataFrame
pd.melt(df, id_vars=['X'], value_vars=['Y', 'Z'])




Unnamed: 0,X,variable,value
0,0.118041,Y,1.654963
1,-0.876498,Y,-0.761774
2,-1.602282,Y,0.068541
3,-0.100374,Y,1.005556
4,-2.019244,Y,2.330996
5,0.214406,Y,0.350458
6,0.118041,Z,-0.55986
7,-0.876498,Z,-0.352031
8,-1.602282,Z,-0.221547
9,-0.100374,Z,-0.264791


In [18]:
# Pivoting a DataFrame
df.pivot(index='A', columns='B', values='C')

KeyError: 'A'

## using lambda

In [19]:
# Applying a function to each column
df.apply(lambda x: x.max() - x.min()) #apply is used to apply a function along the axis of a DataFrame.

X             2.233649
Y             3.092771
Z             1.636210
W             3.724970
new_column    3.411275
dtype: float64

In [20]:
# Applying a function element-wise
df.applymap(lambda x: x**2) 
#applymap is used to apply a function element-wise to every element in a DataFrame.

Unnamed: 0,Unnamed: 1,X,Y,Z,W,new_column
A,1,0.013934,2.738902,0.313443,0.743357,3.143542
A,2,0.768248,0.5803,0.123926,6.261124,2.683935
B,1,2.567306,0.004698,0.049083,0.001367,2.352359
B,2,0.010075,1.011143,0.070114,0.960506,0.819355
C,1,4.077346,5.433544,3.45126,0.155765,0.09719
C,2,0.04597,0.122821,1.627857,1.495108,0.319071


In [23]:
data=[[1,'neha'],[2,'himani'],[3,'harsh']]
df1=pd.DataFrame(data,columns=['rollno','name'])
df1
                 #np.random.

Unnamed: 0,rollno,name
0,1,neha
1,2,himani
2,3,harsh


In [26]:
data=[[4,'chirag'],[5,'chetna'],[3,'harsh']]
df2=pd.DataFrame(data,columns=['rollno','na'])
df2
       

Unnamed: 0,rollno,na
0,4,chirag
1,5,chetna
2,3,arsh


In [27]:
# Concatenating DataFrames
pd.concat([df1, df2])


Unnamed: 0,rollno,name,na
0,1,neha,
1,2,himani,
2,3,harsh,
0,4,,chirag
1,5,,chetna
2,3,,arsh


In [30]:
# Creating a DateTimeIndex
df.index = pd.date_range('2024-01-01', periods=len(df))
df


Unnamed: 0,X,Y,Z,W,new_column
2024-01-01,0.118041,1.654963,-0.55986,0.862181,1.773004
2024-01-02,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
2024-01-03,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
2024-01-04,-0.100374,1.005556,-0.264791,-0.980054,0.905182
2024-01-05,-2.019244,2.330996,-1.857757,-0.39467,0.311753
2024-01-06,0.214406,0.350458,-1.275875,1.222746,0.564863


In [31]:
# Resampling time series data
df.resample('M').sum()


Unnamed: 0,X,Y,Z,W,new_column
2024-01-31,-4.26595,4.64874,-4.531861,-1.829,0.38279


In [33]:
# Using vectorized operations
df['new_column'] = df['X'] + df['Y']
df


Unnamed: 0,X,Y,Z,W,new_column
2024-01-01,0.118041,1.654963,-0.55986,0.862181,1.773004
2024-01-02,-0.876498,-0.761774,-0.352031,-2.502224,-1.638272
2024-01-03,-1.602282,0.068541,-0.221547,-0.036979,-1.53374
2024-01-04,-0.100374,1.005556,-0.264791,-0.980054,0.905182
2024-01-05,-2.019244,2.330996,-1.857757,-0.39467,0.311753
2024-01-06,0.214406,0.350458,-1.275875,1.222746,0.564863
