In [4]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [5]:
import pandas as pd

In [3]:
s = pd.Series([10,20,30,40,50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [4]:
s.dtype

dtype('int64')

In [5]:
s.values

array([10, 20, 30, 40, 50])

In [6]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
s.name = 'calories'

In [8]:
print(s.name)

calories


In [9]:
s

0    10
1    20
2    30
3    40
4    50
Name: calories, dtype: int64

# Indexing

In [10]:
s[0]

np.int64(10)

In [11]:
s[0:2] #start(included) : stop(excluded): step value (values to jump)

0    10
1    20
Name: calories, dtype: int64

In [12]:
s[2:4]

2    30
3    40
Name: calories, dtype: int64

# iloc -> location based indexing

In [13]:
s.iloc[3]

np.int64(40)

In [14]:
s.iloc[[1,3,4]]

1    20
3    40
4    50
Name: calories, dtype: int64

In [15]:
index = ['apple' , 'banana' , 'grapes' , 'orange' , 'strawberry']

In [16]:
index

['apple', 'banana', 'grapes', 'orange', 'strawberry']

In [17]:
s.index = index

In [18]:
s

apple         10
banana        20
grapes        30
orange        40
strawberry    50
Name: calories, dtype: int64

In [19]:
s['grapes']

np.int64(30)

In [20]:
s.iloc[3]

np.int64(40)

# loc -> label based indexing

In [21]:
s.loc[['grapes' , 'apple' , 'strawberry']]

grapes        30
apple         10
strawberry    50
Name: calories, dtype: int64

# In label based indexing your start as well as stop value both are included in the output.

In [22]:
s['banana' : 'orange']

banana    20
grapes    30
orange    40
Name: calories, dtype: int64

In [23]:
fruit_protein = {'Avocado' : 2.0 , 'Guava' : 2.6 , 'Blackberries' : 2.0 , 'Orange' : 0.9 , 'Banana' : 1.1 , 'Apples' : 0.3 , 'Kiwi' : 1.1 , 'Pomegranate' : 1.7 , 'Mango' : 0.8 , 'Cherries' : 1.0}

In [24]:
fruit_protein # grams of protein

{'Avocado': 2.0,
 'Guava': 2.6,
 'Blackberries': 2.0,
 'Orange': 0.9,
 'Banana': 1.1,
 'Apples': 0.3,
 'Kiwi': 1.1,
 'Pomegranate': 1.7,
 'Mango': 0.8,
 'Cherries': 1.0}

In [25]:
type(fruit_protein)

dict

In [26]:
s2 = pd.Series(fruit_protein , name = 'protein')
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Orange          0.9
Banana          1.1
Apples          0.3
Kiwi            1.1
Pomegranate     1.7
Mango           0.8
Cherries        1.0
Name: protein, dtype: float64

# Conditional Selection

In [27]:
s2 > 1.0

Avocado          True
Guava            True
Blackberries     True
Orange          False
Banana           True
Apples          False
Kiwi             True
Pomegranate      True
Mango           False
Cherries        False
Name: protein, dtype: bool

In [28]:
s2[s2  > 1.0]

Avocado         2.0
Guava           2.6
Blackberries    2.0
Banana          1.1
Kiwi            1.1
Pomegranate     1.7
Name: protein, dtype: float64

# Logical operator

In [29]:
[(s2 > 0.5) & (s2 < 2)]

[Avocado         False
 Guava           False
 Blackberries    False
 Orange           True
 Banana           True
 Apples          False
 Kiwi             True
 Pomegranate      True
 Mango            True
 Cherries         True
 Name: protein, dtype: bool]

In [30]:
s2[(s2 > 0.5) & (s2 < 2)]

Orange         0.9
Banana         1.1
Kiwi           1.1
Pomegranate    1.7
Mango          0.8
Cherries       1.0
Name: protein, dtype: float64

In [31]:
# not operation
s2[~(s2 > 1.0)]

Orange      0.9
Apples      0.3
Mango       0.8
Cherries    1.0
Name: protein, dtype: float64

In [32]:
# Modifying the series
s2['mango'] = 2.8

In [33]:
s2

Avocado         2.0
Guava           2.6
Blackberries    2.0
Orange          0.9
Banana          1.1
Apples          0.3
Kiwi            1.1
Pomegranate     1.7
Mango           0.8
Cherries        1.0
mango           2.8
Name: protein, dtype: float64

In [34]:
import numpy as np

In [35]:
ser = pd.Series(['a' , np.nan , 1 , np.nan , 2]) #ans
ser.notnull().sum()

np.int64(3)

In [36]:
data  = {
    'Name' : ['Alice' , 'Bob' , 'Charlie' , 'Dravid' , 'Eve' , 'Alice'] , 
    'Age' : [25 , 30 , 35 , np.nan , 29 , 25 ], 
    'Department' : ['HR' , 'IT' , 'Finance' , 'IT' , 'HR' , 'HR'] , 
    'Salary' : [50000, 60000 , 70000 , 62000 , np.nan , 50000]
}

In [37]:
data

{'Name': ['Alice', 'Bob', 'Charlie', 'Dravid', 'Eve', 'Alice'],
 'Age': [25, 30, 35, nan, 29, 25],
 'Department': ['HR', 'IT', 'Finance', 'IT', 'HR', 'HR'],
 'Salary': [50000, 60000, 70000, 62000, nan, 50000]}

In [38]:
df = pd.DataFrame(data)

In [39]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,Dravid,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [40]:
df.head(2)

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [41]:
df.tail(3)

Unnamed: 0,Name,Age,Department,Salary
3,Dravid,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


# loc and iloc

In [42]:
df.iloc[0:2] #if i want to aceess first 2 data set by using of iloc

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [43]:
df.loc[0:1] #if i want to access first 2 data set by using of loc

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0


In [44]:
df.iloc[1:3]

Unnamed: 0,Name,Age,Department,Salary
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0


In [45]:
df.columns

Index(['Name', 'Age', 'Department', 'Salary'], dtype='object')

In [46]:
df.loc[1 : 3 , ['Age' , 'Department']]

Unnamed: 0,Age,Department
1,30.0,IT
2,35.0,Finance
3,,IT


In [47]:
df.iloc[1 : 3 , : 2] #rows,columns

Unnamed: 0,Name,Age
1,Bob,30.0
2,Charlie,35.0


In [48]:
df['Age']

0    25.0
1    30.0
2    35.0
3     NaN
4    29.0
5    25.0
Name: Age, dtype: float64

In [49]:
df[['Age' , 'Department']]

Unnamed: 0,Age,Department
0,25.0,HR
1,30.0,IT
2,35.0,Finance
3,,IT
4,29.0,HR
5,25.0,HR


In [50]:
df.drop('Age' , axis = 1 ) # if i want to change on the data frame then use parameter inplace = True

Unnamed: 0,Name,Department,Salary
0,Alice,HR,50000.0
1,Bob,IT,60000.0
2,Charlie,Finance,70000.0
3,Dravid,IT,62000.0
4,Eve,HR,
5,Alice,HR,50000.0


In [51]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,Dravid,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [52]:
df.shape

(6, 4)

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        6 non-null      object 
 1   Age         5 non-null      float64
 2   Department  6 non-null      object 
 3   Salary      5 non-null      float64
dtypes: float64(2), object(2)
memory usage: 324.0+ bytes


In [54]:
df.describe()

Unnamed: 0,Age,Salary
count,5.0,5.0
mean,28.8,58400.0
std,4.147288,8532.291603
min,25.0,50000.0
25%,25.0,50000.0
50%,29.0,60000.0
75%,30.0,62000.0
max,35.0,70000.0


# Broadcasting

In [55]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,50000.0
1,Bob,30.0,IT,60000.0
2,Charlie,35.0,Finance,70000.0
3,Dravid,,IT,62000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,50000.0


In [56]:
df['Salary'] = df['Salary'] + 5000

In [57]:
df

Unnamed: 0,Name,Age,Department,Salary
0,Alice,25.0,HR,55000.0
1,Bob,30.0,IT,65000.0
2,Charlie,35.0,Finance,75000.0
3,Dravid,,IT,67000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,55000.0


# Rnaming Columns

In [58]:
df.rename(columns = {'Department' : 'Dept'} , inplace = True)

In [59]:
df

Unnamed: 0,Name,Age,Dept,Salary
0,Alice,25.0,HR,55000.0
1,Bob,30.0,IT,65000.0
2,Charlie,35.0,Finance,75000.0
3,Dravid,,IT,67000.0
4,Eve,29.0,HR,
5,Alice,25.0,HR,55000.0


In [60]:
df['Salary'].unique()

array([55000., 65000., 75000., 67000.,    nan])

In [61]:
df['Dept'].unique()

array(['HR', 'IT', 'Finance'], dtype=object)

In [62]:
df['Dept'].value_counts()

Dept
HR         3
IT         2
Finance    1
Name: count, dtype: int64

In [63]:
df['Promoted Salary'] = df['Salary'] * 10 #New columns creation

In [64]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


# Data Cleaning

In [65]:
df.isnull().sum()

Name               0
Age                1
Dept               0
Salary             1
Promoted Salary    1
dtype: int64

In [66]:
df.dropna(how = 'any') #any row that had any null value

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
5,Alice,25.0,HR,55000.0,550000.0


In [67]:
df.dropna(how = 'all') #if all the values in any row are null then we drop

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [68]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [69]:
df.fillna(0)

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,0.0,IT,67000.0,670000.0
4,Eve,29.0,HR,0.0,0.0
5,Alice,25.0,HR,55000.0,550000.0


In [70]:
df['Age'].fillna(df['Age'].mean() )

0    25.0
1    30.0
2    35.0
3    28.8
4    29.0
5    25.0
Name: Age, dtype: float64

In [71]:
df['Salary'].fillna(df['Salary'].median())

0    55000.0
1    65000.0
2    75000.0
3    67000.0
4    65000.0
5    55000.0
Name: Salary, dtype: float64

In [72]:
df['Promoted Salary'].fillna(df['Promoted Salary'].median())

0    550000.0
1    650000.0
2    750000.0
3    670000.0
4    650000.0
5    550000.0
Name: Promoted Salary, dtype: float64

In [73]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [74]:
df['Age'].fillna(method = 'ffill')

  df['Age'].fillna(method = 'ffill')


0    25.0
1    30.0
2    35.0
3    35.0
4    29.0
5    25.0
Name: Age, dtype: float64

In [75]:
df['Age'].fillna(method = 'bfill')

  df['Age'].fillna(method = 'bfill')


0    25.0
1    30.0
2    35.0
3    29.0
4    29.0
5    25.0
Name: Age, dtype: float64

In [76]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Charlie,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [77]:
df['Name'] = df['Name'].replace('Charlie' , 'Rose' ) # for permenent change the data

In [78]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


# Duplicates :

In [79]:
df_dup = df[df.duplicated()]
df_dup 

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
5,Alice,25.0,HR,55000.0,550000.0


In [80]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,
5,Alice,25.0,HR,55000.0,550000.0


In [81]:
df = df.drop_duplicates()

In [82]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,750000.0
3,Dravid,,IT,67000.0,670000.0
4,Eve,29.0,HR,,


# Invail values :
# Lamda -> python

In [83]:
df['Promoted Salary'] = df['Promoted Salary'].apply(lambda x : x/10 if x > 650000 else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Promoted Salary'] = df['Promoted Salary'].apply(lambda x : x/10 if x > 650000 else x)


In [84]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,75000.0
3,Dravid,,IT,67000.0,67000.0
4,Eve,29.0,HR,,


# apply and lambda

In [85]:
name = 'alice_fernandes'
df[['first_name' , 'last_name']] = df['name'].str.split('_')

KeyError: 'name'

In [None]:
df

In [86]:
def multiplying_age(x):
    return x * 2
df['Age'] = df['Age'].apply(multiplying_age)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].apply(multiplying_age)


In [87]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,50.0,HR,55000.0,550000.0
1,Bob,60.0,IT,65000.0,650000.0
2,Rose,70.0,Finance,75000.0,75000.0
3,Dravid,,IT,67000.0,67000.0
4,Eve,58.0,HR,,


In [88]:
df['Age'] = df['Age'].apply(lambda x: x/2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].apply(lambda x: x/2)


In [89]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,25.0,HR,55000.0,550000.0
1,Bob,30.0,IT,65000.0,650000.0
2,Rose,35.0,Finance,75000.0,75000.0
3,Dravid,,IT,67000.0,67000.0
4,Eve,29.0,HR,,


In [90]:
df['Age'] = df['Age'].apply(lambda x: x*2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].apply(lambda x: x*2)


In [91]:
df

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary
0,Alice,50.0,HR,55000.0,550000.0
1,Bob,60.0,IT,65000.0,650000.0
2,Rose,70.0,Finance,75000.0,75000.0
3,Dravid,,IT,67000.0,67000.0
4,Eve,58.0,HR,,


# Joins & Merges

In [100]:
department_info = {
    'Dept' : ['HR' , 'IT' , 'Finance'],
    'Location' : ['New York' , 'San Francisco' , 'Chicago'],
    'Manager' : ['Laura' , 'Steve' , 'Nina']
}
df2 = pd.DataFrame(department_info)

In [101]:
df2

Unnamed: 0,Dept,Location,Manager
0,HR,New York,Laura
1,IT,San Francisco,Steve
2,Finance,Chicago,Nina


In [102]:
df2.columns

Index(['Dept', 'Location', 'Manager'], dtype='object')

In [103]:
pd.concat([df , df2])

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Location,Manager
0,Alice,50.0,HR,55000.0,550000.0,,
1,Bob,60.0,IT,65000.0,650000.0,,
2,Rose,70.0,Finance,75000.0,75000.0,,
3,Dravid,,IT,67000.0,67000.0,,
4,Eve,58.0,HR,,,,
0,,,HR,,,New York,Laura
1,,,IT,,,San Francisco,Steve
2,,,Finance,,,Chicago,Nina


In [108]:
pd.concat([df , df2] , axis = 1)

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Dept.1,Location,Manager
0,Alice,50.0,HR,55000.0,550000.0,HR,New York,Laura
1,Bob,60.0,IT,65000.0,650000.0,IT,San Francisco,Steve
2,Rose,70.0,Finance,75000.0,75000.0,Finance,Chicago,Nina
3,Dravid,,IT,67000.0,67000.0,,,
4,Eve,58.0,HR,,,,,


In [110]:
pd.merge(df,df2 , on = 'Dept' )

Unnamed: 0,Name,Age,Dept,Salary,Promoted Salary,Location,Manager
0,Alice,50.0,HR,55000.0,550000.0,New York,Laura
1,Bob,60.0,IT,65000.0,650000.0,San Francisco,Steve
2,Rose,70.0,Finance,75000.0,75000.0,Chicago,Nina
3,Dravid,,IT,67000.0,67000.0,San Francisco,Steve
4,Eve,58.0,HR,,,New York,Laura


In [112]:
data = pd.read_excel(r'C:\Users\Dell\Downloads\house_price_dataset.xlsx')

In [113]:
data

Unnamed: 0,House_ID,Bedrooms,Bathrooms,Area_sqft,Location,Year_Built,Furnishing,Parking,Floor,Total_Floors,Price_Lakhs
0,1,2,1,900,Delhi,2010,Semi-Furnished,1,2,5,65
1,2,3,2,1200,Mumbai,2015,Furnished,1,5,10,120
2,3,4,3,1800,Delhi,2012,Unfurnished,2,3,6,150
3,4,3,2,1400,Pune,2018,Furnished,1,7,12,90
4,5,2,1,950,Pune,2005,Semi-Furnished,0,1,4,50
5,6,1,1,700,Delhi,2000,Unfurnished,0,0,3,30
6,7,5,4,2400,Bangalore,2020,Furnished,2,10,15,220
7,8,4,3,2000,Mumbai,2017,Semi-Furnished,2,6,12,160
8,9,3,2,1500,Pune,2013,Unfurnished,1,4,8,95
9,10,2,1,1000,Delhi,2008,Semi-Furnished,0,2,5,55


In [114]:
data.head()

Unnamed: 0,House_ID,Bedrooms,Bathrooms,Area_sqft,Location,Year_Built,Furnishing,Parking,Floor,Total_Floors,Price_Lakhs
0,1,2,1,900,Delhi,2010,Semi-Furnished,1,2,5,65
1,2,3,2,1200,Mumbai,2015,Furnished,1,5,10,120
2,3,4,3,1800,Delhi,2012,Unfurnished,2,3,6,150
3,4,3,2,1400,Pune,2018,Furnished,1,7,12,90
4,5,2,1,950,Pune,2005,Semi-Furnished,0,1,4,50


In [116]:
data.shape

(20, 11)

In [117]:
data.size

220

In [118]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   House_ID      20 non-null     int64 
 1   Bedrooms      20 non-null     int64 
 2   Bathrooms     20 non-null     int64 
 3   Area_sqft     20 non-null     int64 
 4   Location      20 non-null     object
 5   Year_Built    20 non-null     int64 
 6   Furnishing    20 non-null     object
 7   Parking       20 non-null     int64 
 8   Floor         20 non-null     int64 
 9   Total_Floors  20 non-null     int64 
 10  Price_Lakhs   20 non-null     int64 
dtypes: int64(9), object(2)
memory usage: 1.8+ KB


In [6]:
data = {"names" : ["Amit" , "Rohit" , "Sara" , "Neha"] ,
      "age" : [21 , 25 , 19 , 22] ,
        "marks" : [85 , 90 , 88 ,70]
       }

In [7]:
data

{'names': ['Amit', 'Rohit', 'Sara', 'Neha'],
 'age': [21, 25, 19, 22],
 'marks': [85, 90, 88, 70]}

In [13]:
df = pd.DataFrame(data)

In [14]:
df

Unnamed: 0,names,age,marks
0,Amit,21,85
1,Rohit,25,90
2,Sara,19,88
3,Neha,22,70


In [19]:
df.rename(columns = {"marks" : "obtained marks"} , inplace = True)

In [20]:
df

Unnamed: 0,names,age,obtained marks
0,Amit,21,85
1,Rohit,25,90
2,Sara,19,88
3,Neha,22,70


In [33]:
df.sort_values(by = "obtained marks" , ascending = True)

Unnamed: 0,names,age,obtained marks
3,Neha,22,70
0,Amit,21,85
2,Sara,19,88
1,Rohit,25,90


In [29]:
df

Unnamed: 0,names,age,obtained marks
0,Amit,21,85
1,Rohit,25,90
2,Sara,19,88
3,Neha,22,70


In [31]:
df.reset_index(drop = True , inplace = True)

In [32]:
df

Unnamed: 0,names,age,obtained marks
0,Amit,21,85
1,Rohit,25,90
2,Sara,19,88
3,Neha,22,70


In [38]:
df[["names" , "obtained marks"]]

Unnamed: 0,names,obtained marks
0,Amit,85
1,Rohit,90
2,Sara,88
3,Neha,70


In [49]:
data = pd.read_excel(r'C:\Users\Dell\Downloads\people_data_with_missing.xlsx')

In [50]:
data

Unnamed: 0,Name,Age,City
0,Amit Sharma,21.0,Jabalpur
1,Rohit Verma,,Bhopal
2,Neha Gupta,22.0,
3,Saurabh Singh,24.0,Delhi
4,,20.0,Mumbai
5,Kunal Yadav,23.0,Pune
6,Ayesha Khan,,Lucknow
7,Manish Patel,27.0,
8,Riya Sen,18.0,Kolkata
9,Arjun Nair,26.0,Kochi


In [51]:
df = pd.DataFrame(data)

In [52]:
df

Unnamed: 0,Name,Age,City
0,Amit Sharma,21.0,Jabalpur
1,Rohit Verma,,Bhopal
2,Neha Gupta,22.0,
3,Saurabh Singh,24.0,Delhi
4,,20.0,Mumbai
5,Kunal Yadav,23.0,Pune
6,Ayesha Khan,,Lucknow
7,Manish Patel,27.0,
8,Riya Sen,18.0,Kolkata
9,Arjun Nair,26.0,Kochi


In [53]:
df.head(10)

Unnamed: 0,Name,Age,City
0,Amit Sharma,21.0,Jabalpur
1,Rohit Verma,,Bhopal
2,Neha Gupta,22.0,
3,Saurabh Singh,24.0,Delhi
4,,20.0,Mumbai
5,Kunal Yadav,23.0,Pune
6,Ayesha Khan,,Lucknow
7,Manish Patel,27.0,
8,Riya Sen,18.0,Kolkata
9,Arjun Nair,26.0,Kochi


In [54]:
df['Age']

0     21.0
1      NaN
2     22.0
3     24.0
4     20.0
5     23.0
6      NaN
7     27.0
8     18.0
9     26.0
10    21.0
11     NaN
12    23.0
13     NaN
14    29.0
15    24.0
16    20.0
17    21.0
18    26.0
19     NaN
Name: Age, dtype: float64

In [55]:
df.describe()

Unnamed: 0,Age
count,15.0
mean,23.0
std,3.023716
min,18.0
25%,21.0
50%,23.0
75%,25.0
max,29.0


In [57]:
df['Age'].fillna(df['Age'].mean())

0     21.0
1     23.0
2     22.0
3     24.0
4     20.0
5     23.0
6     23.0
7     27.0
8     18.0
9     26.0
10    21.0
11    23.0
12    23.0
13    23.0
14    29.0
15    24.0
16    20.0
17    21.0
18    26.0
19    23.0
Name: Age, dtype: float64

In [59]:
df['Age'].fillna(0)

0     21.0
1      0.0
2     22.0
3     24.0
4     20.0
5     23.0
6      0.0
7     27.0
8     18.0
9     26.0
10    21.0
11     0.0
12    23.0
13     0.0
14    29.0
15    24.0
16    20.0
17    21.0
18    26.0
19     0.0
Name: Age, dtype: float64

In [61]:
df.isnull().sum()

Name    2
Age     5
City    5
dtype: int64

In [65]:
df.isnull()

TypeError: DataFrame.isnull() takes 1 positional argument but 2 were given

In [66]:
df1 = pd.DataFrame({
    "Name": ["Amit", "Rohit", "Neha", "Simran", "Kunal", "Ayesha"],
    "Salary": [25000, 45000, 37000, 52000, 30000, 28000]
})

In [67]:
df1

Unnamed: 0,Name,Salary
0,Amit,25000
1,Rohit,45000
2,Neha,37000
3,Simran,52000
4,Kunal,30000
5,Ayesha,28000


In [73]:
df1[df1['Salary']>30000]

Unnamed: 0,Name,Salary
1,Rohit,45000
2,Neha,37000
3,Simran,52000


In [78]:
df1[(df1['Salary']>=25000) & (df1['Salary']<=40000)]

Unnamed: 0,Name,Salary
0,Amit,25000
2,Neha,37000
4,Kunal,30000
5,Ayesha,28000


In [80]:
df1.nlargest(3 , 'Salary')

Unnamed: 0,Name,Salary
3,Simran,52000
1,Rohit,45000
2,Neha,37000


In [82]:
df1.nsmallest(3 , 'Salary')

Unnamed: 0,Name,Salary
0,Amit,25000
5,Ayesha,28000
4,Kunal,30000


In [84]:
df1

Unnamed: 0,Name,Salary
0,Amit,25000
1,Rohit,45000
2,Neha,37000
3,Simran,52000
4,Kunal,30000
5,Ayesha,28000


In [88]:
df2 = pd.DataFrame({'Salary' :(25000 , 45000 , 28000 , 47000 , 35000 , 60000) , 
                   'Name' : ('Virat' , 'MS Dhoni' , 'Rohit' , 'Hardik Pandya' , 'Shreyas iyer' , 'Sir Jadeja') , 
                    'Department' : ('HR' , 'IT' , 'Finance' , 'IT' , 'HR' , 'Finance') 
                   })

In [89]:
df2

Unnamed: 0,Salary,Name,Department
0,25000,Virat,HR
1,45000,MS Dhoni,IT
2,28000,Rohit,Finance
3,47000,Hardik Pandya,IT
4,35000,Shreyas iyer,HR
5,60000,Sir Jadeja,Finance


In [94]:
df2.groupby('Department')['Salary'].mean()

Department
Finance    44000.0
HR         30000.0
IT         46000.0
Name: Salary, dtype: float64

In [96]:
df2.groupby('Department')['Salary'].max()

Department
Finance    60000
HR         35000
IT         47000
Name: Salary, dtype: int64

In [101]:
df2.groupby('Department')['Salary'].count()

Department
Finance    2
HR         2
IT         2
Name: Salary, dtype: int64

In [102]:
df2.groupby('Department')['Salary'].sum()

Department
Finance    88000
HR         60000
IT         92000
Name: Salary, dtype: int64

In [105]:
df2.groupby('Department').agg({
    'Salary' : ['mean' , 'max' , 'min' , 'count' , 'sum']
})

Unnamed: 0_level_0,Salary,Salary,Salary,Salary,Salary
Unnamed: 0_level_1,mean,max,min,count,sum
Department,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Finance,44000.0,60000,28000,2,88000
HR,30000.0,35000,25000,2,60000
IT,46000.0,47000,45000,2,92000


In [106]:
df2['Salary'].sort_values()

0    25000
2    28000
4    35000
1    45000
3    47000
5    60000
Name: Salary, dtype: int64

In [114]:
df2['Salary'].sort_values(ascending = False)

5    60000
3    47000
1    45000
4    35000
2    28000
0    25000
Name: Salary, dtype: int64