# Pandas Part1
## installation 
pip install pandas

In [1]:
# import pandas as pd

## Series

## Creating a Pandas Series

In [1]:
# # # empty series
# import pandas as pd

# ser = pd.Series()

# print(ser)

Series([], dtype: float64)


  after removing the cwd from sys.path.


In [2]:
# Creating a Series from a NumPy Array
# If we already have data stored in a NumPy array we can easily convert it into a Pandas Series. This is helpful when working with numerical data.

import pandas as pd
import numpy as np

data = np.array(['g', 'e', 'e', 'k', 's'])

ser = pd.Series(data)
print(ser)

0    g
1    e
2    e
3    k
4    s
dtype: object


In [3]:
# Creating a Series from a List
# we can create a Series by passing a Python list to the pd.Series() function. Pandas automatically assigns an index to each element starting from 0. 
# This is a simple way to store and manipulate data.


import pandas as pd

data_list = ['g', 'e', 'e', 'k', 's']

ser = pd.Series(data_list)
print(ser)

0    g
1    e
2    e
3    k
4    s
dtype: object


In [4]:
# Creating a Series from a Dictionary
# A dictionary in Python stores data as key-value pairs. 
# When we convert Dictionary into a Pandas Series the keys become index labels and the values become the data.
# This method is useful for labeled data preserving structure and enabling quick access. Below is an example.

import pandas as pd

data_dict = {'Geeks': 10, 'for': 20, 'geeks': 30}

ser = pd.Series(data_dict)
print(ser)

Geeks    10
for      20
geeks    30
dtype: int64


In [5]:
# Creating a Series Using NumPy Functions
# In order to create a series using numpy function. Some commonly used NumPy functions for generating sequences include numpy.linspace() 
# for creating evenly spaced numbers over a specified range and numpy.random.randn() for generating random numbers from a normal distribution.
# This is particularly useful when working with scientific computations, statistical modeling or large datasets


import numpy as np
import pandas as pd

ser = pd.Series(np.linspace(1, 10, 5))
print(ser)

0     1.00
1     3.25
2     5.50
3     7.75
4    10.00
dtype: float64


In [6]:
# Creating a Series Using range()
# The range() function in Python is commonly used to generate sequences of numbers and it can be easily converted into a Pandas Series. 
# This is particularly useful for creating a sequence of values in a structured format without need of manually specify each element. 
# Below is an how range() can be used to create a Series.


import pandas as pd

ser = pd.Series(range(5, 15))
print(ser)

0     5
1     6
2     7
3     8
4     9
5    10
6    11
7    12
8    13
9    14
dtype: int64


In [7]:
# Creating a Series Using List Comprehension
# List comprehension is a concise way to generate sequences and apply transformations in a single line of code. 
# This method is useful when we need to create structured sequences dynamically. 
# Below is an example demonstrating how list comprehension is used to create a Series with a custom index.


import pandas as pd
ser=pd.Series(range(1,20,3), index=[x for x in 'abcdefg'])
print(ser)

a     1
b     4
c     7
d    10
e    13
f    16
g    19
dtype: int64


### Accessing elements of a Pandas Series

In [8]:
# # Accessing Element from Series with Position
# # import pandas and numpy
import pandas as pd
import numpy as np

# creating simple array
data = np.array(['g', 'e', 'e', 'k', 's', 'f',
                 'o', 'r', 'g', 'e', 'e', 'k', 's'])
ser = pd.Series(data)
# retrieve the first element
print(ser[0])

g


In [9]:
# # Accessing First 5 Elements of Series
# import pandas and numpy
import pandas as pd
import numpy as np

# creating simple array
data = np.array(['g', 'e', 'e', 'k', 's', 'f',
                 'o', 'r', 'g', 'e', 'e', 'k', 's'])
ser = pd.Series(data)
# retrieve the first element
print(ser[:5])

0    g
1    e
2    e
3    k
4    s
dtype: object


In [10]:
# # Accessing Last 10 Elements of Series
# # import pandas and numpy
import pandas as pd
import numpy as np

# creating simple array
data = np.array(['g', 'e', 'e', 'k', 's', 'f',
                 'o', 'r', 'g', 'e', 'e', 'k', 's'])
ser = pd.Series(data)

# retrieve the first element
print(ser[-10:])

3     k
4     s
5     f
6     o
7     r
8     g
9     e
10    e
11    k
12    s
dtype: object


In [11]:
# Accessing a Single Element Using index Label
# import pandas and numpy
import pandas as pd
import numpy as np

# creating simple array
data = np.array(['g', 'e', 'e', 'k', 's', 'f',
                 'o', 'r', 'g', 'e', 'e', 'k', 's'])
ser = pd.Series(data, index=[10, 11, 12, 13, 14,
                             15, 16, 17, 18, 19, 20, 21, 22])

# accessing a element using index element
print(ser[16])

o


In [12]:
# # Accessing a Multiple Element Using index Label
# # import pandas and numpy
import pandas as pd
import numpy as np

# creating simple array
data = np.array(['g', 'e', 'e', 'k', 's', 'f',
                 'o', 'r', 'g', 'e', 'e', 'k', 's'])
ser = pd.Series(data, index=[10, 11, 12, 13, 14,
                             15, 16, 17, 18, 19, 20, 21, 22])

# accessing a multiple element using
# index element
print(ser[[10, 11, 12, 13, 14]])

10    g
11    e
12    e
13    k
14    s
dtype: object


In [13]:
# # Access Multiple Elements by Providing Label of Index
# # importing pandas and numpy
import pandas as pd
import numpy as np

ser = pd.Series(np.arange(3, 9), index=['a', 'b', 'c', 'd', 'e', 'f'])

print(ser[['a', 'd']])

a    3
d    6
dtype: int32


### Operations on Pandas Series

In [14]:
# # 1. Arithmetic Operations on Series
# # Arithmetic operations between two Series is applied element-wise. The index labels must align for the operation to work. 
# # If the indexes don’t match, Pandas will fill in missing values with NaN.

import pandas as pd
s1 = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
s2 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])

# Adding the two Series
result = s1 + s2
print(result)

a    11
b    22
c    33
dtype: int64


In [15]:
# 2. Comparison Operations on Series
# Comparison operations return a Series of boolean values, indicating whether the comparison is True or False for each corresponding element.
import pandas as pd
s1 = pd.Series([10, 20, 30])
s2 = pd.Series([10, 25, 30])

# Comparing the two Series
result = s1 == s2
print(result)

0     True
1    False
2     True
dtype: bool


### Pandas Series Index Attribute

In [16]:
# # example1
import pandas as pd

data = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])

# Accessing the index
print("Original Index:", data.index)

# Modifying the index
data.index = ['w', 'x', 'y', 'z']
print("Modified Series:\n", data)

Original Index: Index(['a', 'b', 'c', 'd'], dtype='object')
Modified Series:
 w    10
x    20
y    30
z    40
dtype: int64


In [17]:
# example2
import pandas as pd

series = pd.Series(['New York', 'Chicago', 'Toronto', 'Lisbon'])

# Creating the row axis labels
series.index = ['City 1', 'City 1', 'City 3', 'City 3'] 
print(series)

City 1    New York
City 1     Chicago
City 3     Toronto
City 3      Lisbon
dtype: object


In [19]:
# # example3
import pandas as pd

Date = ['1/1/2018', '2/1/2018', '3/1/2018', '4/1/2018']
idx_name = ['Day 1', 'Day 2', 'Day 3', 'Day 4']

sr = pd.Series(data = Date,index = idx_name)
print(sr.index)
print(sr)

Index(['Day 1', 'Day 2', 'Day 3', 'Day 4'], dtype='object')
Day 1    1/1/2018
Day 2    2/1/2018
Day 3    3/1/2018
Day 4    4/1/2018
dtype: object


In [20]:
# # example4
import pandas as pd

Date = ['1/1/2018', '2/1/2018', '3/1/2018', '4/1/2018']
idx_name = ['Day 1', 'Day 2', 'Day 3', 'Day 4']

sr = pd.Series(data = Date,        # Series Data
              index = idx_name   # Index
              )

# Resetting index to default
sr.reset_index(drop=True, inplace=True)
print(sr)

0    1/1/2018
1    2/1/2018
2    3/1/2018
3    4/1/2018
dtype: object


## DataFrame

In [21]:
# example 1
import pandas as pd

lst = ['Geeks', 'For', 'Geeks', 'is', 
            'portal', 'for', 'Geeks']

df = pd.DataFrame(lst)
print(df)

        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks


In [22]:
# # example2
import numpy as np
import pandas as pd

data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pd.DataFrame(data, columns=['A', 'B', 'C'])
print(df)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9


In [23]:
# # example 3
import pandas as pd

dict = {'name':["aparna", "pankaj", "sudhir", "Geeku"],
        'degree': ["MBA", "BCA", "M.Tech", "MBA"],
        'score':[90, 40, 80, 98]}

df = pd.DataFrame(dict)

print(df)

     name  degree  score
0  aparna     MBA     90
1  pankaj     BCA     40
2  sudhir  M.Tech     80
3   Geeku     MBA     98


## 
Pandas Dataframe Index

Index in pandas dataframe act as reference for each row in dataset. It can be numeric or based on specific column values. The default index is usually a RangeIndex starting from 0, but you can customize it for better data understanding. You can easily access the current index of a dataframe using the index attribute. Let's us understand with the help of an example:

1. Accessing and Modifying the Index

In [24]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
print(df.index)  # Accessing the index

RangeIndex(start=0, stop=5, step=1)


##### 2. Setting a Custom Index
To set a custom index, you can use the set_index()  method, allowing you to set a custom index based on a column, such as Name or Age.

In [25]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)

# Set 'Name' column as the index
df_with_index = df.set_index('Name')
print(df_with_index)

         Age  Gender  Salary
Name                        
John      25    Male   50000
Alice     30  Female   55000
Bob       22    Male   40000
Eve       35  Female   70000
Charlie   28    Male   48000


#### 3.resetting the index

In [26]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)

# Set 'Name' column as the index
df_with_index = df.set_index('Name')
# print(df_with_index)
df_reset = df.reset_index()
print(df_reset)

   index     Name  Age  Gender  Salary
0      0     John   25    Male   50000
1      1    Alice   30  Female   55000
2      2      Bob   22    Male   40000
3      3      Eve   35  Female   70000
4      4  Charlie   28    Male   48000


#### 4. Indexing with loc
The loc[] method in pandas allows to access rows and columns of a dataFrame using their labels, making it easy to retrieve specific data points.

In [27]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
df_with_index = df.set_index('Name')
row = df_with_index .loc['Alice']  # has to use dataframe with name index,here if you use df,will throw an error
print(row)

Age           30
Gender    Female
Salary     55000
Name: Alice, dtype: object


#### 5.Accessing Columns From DataFrame

In [28]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
age = df['Age'] # to access the age column,you will get a series
print(age)

0    25
1    30
2    22
3    35
4    28
Name: Age, dtype: int64


#### 6.Accessing Rows by Index
###### To access specific rows in a DataFrame, you can use iloc (for positional indexing) 
###### or loc (for label-based indexing). These methods allow you to retrieve rows based on their index positions or labels.

In [29]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
row2 = df.iloc[1]
print(row2)

Name       Alice
Age           30
Gender    Female
Salary     55000
Name: 1, dtype: object


#### 7.Accessing Multiple Rows or Columns
###### You can access multiple rows or columns at once by passing a list of column names or index positions. 
###### This is useful when you need to select several columns or rows for further analysis.

In [30]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
ret = df.loc[0:2,["Name","Age","Salary"]]
print(ret)

    Name  Age  Salary
0   John   25   50000
1  Alice   30   55000
2    Bob   22   40000


#### 8.Accessing Rows Based on Conditions
Pandas allows you to filter rows based on conditions, which can be very powerful for exploring subsets of data that meet specific criteria.

In [31]:
## bool index
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
ret = df[df['Age']>25]
print(ret)

      Name  Age  Gender  Salary
1    Alice   30  Female   55000
3      Eve   35  Female   70000
4  Charlie   28    Male   48000


#### 9.Accessing Specific Cells with at and iat
###### If you need to access a specific cell, you can use the .at[] method for label-based indexing 
###### and the .iat[] method for integer position-based indexing. These are optimized for fast access to single values.

In [32]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)

salary2 = df.at[2,"Salary"]
print(salary2)

40000


#### 10.access multiple columns

In [33]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
n_a_s = df[["Name","Age","Salary"]]
print(n_a_s)

      Name  Age  Salary
0     John   25   50000
1    Alice   30   55000
2      Bob   22   40000
3      Eve   35   70000
4  Charlie   28   48000


#### 11.Selecting Multiple Rows by Label

In [34]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
dfn = df.set_index("Name")
names_data = dfn.loc[["Alice","Eve"]]
print(names_data)

       Age  Gender  Salary
Name                      
Alice   30  Female   55000
Eve     35  Female   70000


#### 12. Selecting All Rows and Specific Columns
###### We can select all rows and specific columns by using a colon [:] to indicate all rows followed by the list of column names:

###### Dataframe.loc[:, ["column1", "column2", "column3"]]

In [35]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
all_row_data = df.loc[:,["Name","Age","Gender"]]
print(all_row_data)

      Name  Age  Gender
0     John   25    Male
1    Alice   30  Female
2      Bob   22    Male
3      Eve   35  Female
4  Charlie   28    Male


#### 13.Selecting Specific Rows and Columns by Position
We can select specific rows and columns by providing integer positions for both rows and columns:

In [36]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
select = df.iloc[[0,1,2],[0,2]]
print(select)

    Name  Gender
0   John    Male
1  Alice  Female
2    Bob    Male


#### 14..query(): Query the DataFrame using a boolean expression


In [37]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
qret = df.query("Gender=='Female' and Age>25")
print(qret)

    Name  Age  Gender  Salary
1  Alice   30  Female   55000
3    Eve   35  Female   70000


#### 15.DataFrame.where() function replace values in a DataFrame based on a condition.
###### It allows you to keep the original value where a condition is True and replace it with something else e.g.
###### , NaN or a custom value where the condition is False.

In [38]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
res = df.where(df["Salary"]>50000) # show the match data and the one that not match replace with NaN
print(res)

    Name   Age  Gender   Salary
0    NaN   NaN     NaN      NaN
1  Alice  30.0  Female  55000.0
2    NaN   NaN     NaN      NaN
3    Eve  35.0  Female  70000.0
4    NaN   NaN     NaN      NaN


#### 16.DataFrame.get()	Get item from object for given key (e.g DataFrame column).

In [39]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
wage = df.get("Salary")
print(wage)

0    50000
1    55000
2    40000
3    70000
4    48000
Name: Salary, dtype: int64


#### 17.isin() Function in Pandas Examples
The DataFrame.isin() method in Pandas is a powerful tool for filtering and selecting data within a DataFrame based on specified conditions. It allows you to create boolean masks to identify rows where the values in one or more columns match certain criteria. Let's delve into the details of the isin() method

Single Parameter Filtering
Multiple Parameter Filtering

In [40]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
females = df["Gender"].isin(["Female"]) # return a list of True and False
print(df[females]) # get the True data and output 

    Name  Age  Gender  Salary
1  Alice   30  Female   55000
3    Eve   35  Female   70000


#### 18.Multiple parameter Filtering Using Pandas DataFrame.isin()

In [42]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
term1 = df["Gender"].isin(["Male"])
# term2 = df["Salary"].isin([48000,5000])
term2 = df["Salary"].isin([48000,50000])
ret = df[term1&term2]
print(ret)

      Name  Age Gender  Salary
0     John   25   Male   50000
4  Charlie   28   Male   48000


#### 19.Pandas dataframe.mask()

In [43]:
import pandas as pd

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, 44, 1],
                   "B":[5, 2, 54, 3, 2],
                   "C":[20, 16, 7, 3, 8],
                   "D":[14, 3, 17, 2, 6]})

# Print the dataframe
# df
df.mask(df<10,0) # replace every value that is less than 10 with 0

Unnamed: 0,A,B,C,D
0,12,0,20,14
1,0,0,16,0
2,0,54,0,17
3,44,0,0,0
4,0,0,0,0


In [44]:
import pandas as pd

# Creating the dataframe 
df = pd.DataFrame({"A":[12, 4, 5, None, 1],
                   "B":[7, 2, 54, 3, None],
                   "C":[20, 16, 11, 3, 8],
                   "D":[14, 3, None, 2, 6]})

# replace the Na values with 10
df.mask(df.isna(),0)

Unnamed: 0,A,B,C,D
0,12.0,7.0,20,14.0
1,4.0,2.0,16,3.0
2,5.0,54.0,11,0.0
3,0.0,3.0,3,2.0
4,1.0,0.0,8,6.0


#### 20.Pandas dataframe.insert()-Python
###### DataFrame.insert() function in pandas inserts a new column into a DataFrame at a specified position.
###### It allows you to specify the column index, column label and values to insert.
###### This is particularly useful when you want to place a new column in a specific position instead of just appending it at the end.

In [45]:
import pandas as pd
df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
df.insert(1, 'C', [5, 6])

print(df)

   A  C  B
0  1  5  3
1  2  6  4


#### 21.Duplicate row and column

In [46]:
# Example 1,duplicate row: In this example, we insert a new column at index 1. Since the value is a scalar, it is broadcasted to all rows.

import pandas as pd
df = pd.DataFrame({'A': [1, 2]})
df.insert(1, 'B', 100)

print(df)

   A    B
0  1  100
1  2  100


In [48]:
# Example 2,duplicate column: Here, we already have a column 'A' and we attempt to insert another column with the same name. This works only because we explicitly allow duplicates.

import pandas as pd
df = pd.DataFrame({'A': [1, 2]})
df.insert(1, 'A', [3, 4], allow_duplicates=True)

print(df)

   A  A
0  1  3
1  2  4


#### 24 This example demonstrates inserting a column by passing a Series object. The values are aligned with the DataFrame’s index.






In [49]:
import pandas as pd
df = pd.DataFrame({'A': [10, 20, 30]})
s = pd.Series([100, 200, 300], index=[0, 1, 2])
df.insert(1, 'B', s)

print(df)

    A    B
0  10  100
1  20  200
2  30  300


#### 25.Filter Pandas Dataframe with multiple conditions Using loc

In [50]:
import pandas as pd

# assign data
dataFrame = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
                                   '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          
                          'Age': [30, 35, 37, 33, 34, 30],
                          
                          'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          
                          'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
                                  'IT', 'ARTIST']})

# display dataframe
# display(dataFrame)
# filter dataframe
display(dataFrame.loc[(dataFrame['Salary']>=100000) & (dataFrame['Age']< 40) & (dataFrame['JOB'].str.startswith('D')),
                    ['Name','JOB']])

Unnamed: 0,Name,JOB
0,RACHEL,DESIGNER


In [65]:
# ## Filter Pandas Dataframe Using NumPy
# import pandas as pd
# import numpy as np

# # assign data
# dataFrame = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
#                                    '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          
#                           'Age': [30, 35, 37, 33, 34, 30],
                          
#                           'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          
#                           'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
#                                   'IT', 'ARTIST']})

# # filter dataframe                                   
# filtered_values = np.where((dataFrame['Salary']>=100000) & (dataFrame['Age']< 40) & (dataFrame['JOB'].str.startswith('D')))
# print(filtered_values)
# display(dataFrame.loc[filtered_values])

(array([0], dtype=int64),)


Unnamed: 0,Name,Age,Salary,JOB
0,RACHEL,30,100000,DESIGNER


## 上一次直播结束标记12/7/25

In [1]:
# Filter Pandas Dataframe Using Query (eval and query works only with columns)
import pandas as pd

# assign data
dataFrame = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
                                   '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          
                          'Age': [30, 35, 37, 33, 34, 30],
                          
                          'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          
                          'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
                                  'IT', 'ARTIST']})

# filter dataframe 
display(dataFrame.query('Salary  <= 100000 & Age < 40 & JOB.str.startswith("C").values'))

Unnamed: 0,Name,Age,Salary,JOB
1,MONICA,35,93000,CHEF


In [2]:
# Pandas Boolean indexing multiple conditions standard way ("Boolean indexing" works with values in a column only)
import pandas as pd

# assign data
dataFrame = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
                                   '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          
                          'Age': [30, 35, 37, 33, 34, 30],
                          
                          'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          
                          'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
                                  'IT', 'ARTIST']})

# filter dataframe 
display(dataFrame[(dataFrame['Salary']>=100000) & (dataFrame['Age']<40) & dataFrame['JOB'].str.startswith('P')][['Name','Age','Salary']])

Unnamed: 0,Name,Age,Salary
3,ROSS,33,120000


In [3]:
# Eval multiple conditions  ("eval" and "query" works only with columns )
# import module
import pandas as pd

# assign data
dataFrame = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
                                   '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          
                          'Age': [30, 35, 37, 33, 34, 30],
                          
                          'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          
                          'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
                                  'IT', 'ARTIST']})

# filter dataframe 
display(dataFrame[dataFrame.eval("Salary <=100000 & (Age <40) & JOB.str.startswith('A').values")])

Unnamed: 0,Name,Age,Salary,JOB
5,JOEY,30,95000,ARTIST


#### 26.Concatenating DataFrames
1. Concatenating DataFrame using pandas.concat()
To concatenate DataFrames, we use the pd.concat() function. 
This function allows us to combine multiple DataFrames into one by specifying the axis (rows or columns).

In [4]:
import pandas as pd

data1 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age': [27, 24, 22, 32],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
         'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

data2 = {'Name': ['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'],
         'Age': [17, 14, 12, 52],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
         'Qualification': ['Btech', 'B.A', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1, index=[0, 1, 2, 3])

df1 = pd.DataFrame(data2, index=[4, 5, 6, 7])

# print(df, "\n\n", df1)
new_df = pd.concat([df,df1])
print(new_df)

     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannuaj           Phd
4    Abhi   17     Nagpur         Btech
5  Ayushi   14     Kanpur           B.A
6  Dhiraj   12  Allahabad          Bcom
7  Hitesh   52    Kannuaj        B.hons


#### 2. Concatenating DataFrames by Setting Logic on Axes
###### We can modify the concatenation by setting logic on the axes.
###### Specifically we can choose whether to take the Union (join='outer') or Intersection (join='inner') of columns.

In [5]:
import pandas as pd

data1 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age': [27, 24, 22, 32],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
         'Qualification': ['Msc', 'MA', 'MCA', 'Phd'],
         'Mobile No': [97, 91, 58, 76]}

data2 = {'Name': ['Gaurav', 'Anuj', 'Dhiraj', 'Hitesh'],
         'Age': [22, 32, 12, 52],
         'Address': ['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'],
         'Qualification': ['MCA', 'Phd', 'Bcom', 'B.hons'],
         'Salary': [1000, 2000, 3000, 4000]}

df = pd.DataFrame(data1, index=[0, 1, 2, 3])

df1 = pd.DataFrame(data2, index=[2, 3, 6, 7])

# print(df, "\n\n", df1)
# res2 = pd.concat([df, df1], axis=1, join='inner')
res2 = pd.concat([df, df1], axis=1, sort=False)
print(res2)

     Name   Age    Address Qualification  Mobile No    Name   Age    Address  \
0     Jai  27.0     Nagpur           Msc       97.0     NaN   NaN        NaN   
1  Princi  24.0     Kanpur            MA       91.0     NaN   NaN        NaN   
2  Gaurav  22.0  Allahabad           MCA       58.0  Gaurav  22.0  Allahabad   
3    Anuj  32.0    Kannuaj           Phd       76.0    Anuj  32.0    Kannuaj   
6     NaN   NaN        NaN           NaN        NaN  Dhiraj  12.0  Allahabad   
7     NaN   NaN        NaN           NaN        NaN  Hitesh  52.0    Kannuaj   

  Qualification  Salary  
0           NaN     NaN  
1           NaN     NaN  
2           MCA  1000.0  
3           Phd  2000.0  
6          Bcom  3000.0  
7        B.hons  4000.0  


#### 3. Concatenating DataFrames by Ignoring Indexes
###### Sometimes the indexes of the original DataFrames may not be relevant.
###### We can ignore the indexes and reset them using the ignore_index argument. 
###### This is useful when we don't want to carry over any index information.

In [6]:
import pandas as pd 
  
data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'], 
        'Qualification':['Msc', 'MA', 'MCA', 'Phd'],
        'Mobile No': [97, 91, 58, 76]} 

data2 = {'Name':['Gaurav', 'Anuj', 'Dhiraj', 'Hitesh'], 
        'Age':[22, 32, 12, 52], 
        'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'], 
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons'],
        'Salary':[1000, 2000, 3000, 4000]} 
  
df = pd.DataFrame(data1,index=[0, 1, 2, 3])
  
df1 = pd.DataFrame(data2, index=[2, 3, 6, 7]) 
  
  
# print(df, "\n\n", df1)
res = pd.concat([df, df1], ignore_index=True)
 
res


Unnamed: 0,Name,Age,Address,Qualification,Mobile No,Salary
0,Jai,27,Nagpur,Msc,97.0,
1,Princi,24,Kanpur,MA,91.0,
2,Gaurav,22,Allahabad,MCA,58.0,
3,Anuj,32,Kannuaj,Phd,76.0,
4,Gaurav,22,Allahabad,MCA,,1000.0
5,Anuj,32,Kannuaj,Phd,,2000.0
6,Dhiraj,12,Allahabad,Bcom,,3000.0
7,Hitesh,52,Kannuaj,B.hons,,4000.0


#### 4. Concatenating DataFrame with group keys :
###### If we want to retain information about the DataFrame from which each row came, we can use the keys argument. 
###### This assigns a label to each group of rows based on the source DataFrame

In [7]:
import pandas as pd

data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']}

data2 = {'Name':['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'],
        'Age':[17, 14, 12, 52],
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1,index=[0, 1, 2, 3])

df1 = pd.DataFrame(data2, index=[4, 5, 6, 7])

# print(df, "\n\n", df1)
frames = [df, df1 ]

res = pd.concat(frames, keys=['x', 'y'])
res

Unnamed: 0,Unnamed: 1,Name,Age,Address,Qualification
x,0,Jai,27,Nagpur,Msc
x,1,Princi,24,Kanpur,MA
x,2,Gaurav,22,Allahabad,MCA
x,3,Anuj,32,Kannuaj,Phd
y,4,Abhi,17,Nagpur,Btech
y,5,Ayushi,14,Kanpur,B.A
y,6,Dhiraj,12,Allahabad,Bcom
y,7,Hitesh,52,Kannuaj,B.hons


#### 5. Concatenating Mixed DataFrames and Series
###### We can also concatenate a mix of Series and DataFrames. 
###### If we include a Series in the list, it will automatically be converted to a DataFrame and we can specify the column name.

In [8]:
import pandas as pd

data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],
        'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']}

df = pd.DataFrame(data1,index=[0, 1, 2, 3])

s1 = pd.Series([1000, 2000, 3000, 4000], name='Salary')

# print(df, "\n\n", s1)
res = pd.concat([df, s1], axis=1)

res

Unnamed: 0,Name,Age,Address,Qualification,Salary
0,Jai,27,Nagpur,Msc,1000
1,Princi,24,Kanpur,MA,2000
2,Gaurav,22,Allahabad,MCA,3000
3,Anuj,32,Kannuaj,Phd,4000


## Merging DataFrame
#### Merging DataFrames in Pandas is similar to performing SQL joins. It is useful when we need to combine two DataFrames based on a common column or index. The merge() function provides flexibility for different types of joins.
#### 
#### There are four basic ways to handle the join (inner, left, right and outer) depending on which rows must retain their data.
![image.png](attachment:1aa3d213-01b2-4e56-a1bf-6de209e45b77.png)

#### 1. Merging DataFrames Using One Key

In [9]:
# We can merge DataFrames based on a common column by using the on argument. This allows us to combine the DataFrames where values in a specific column match.




import pandas as pd

data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],}

data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1)

df1 = pd.DataFrame(data2)


# print(df, "\n\n", df1)
res = pd.merge(df, df1, on='key')

res

Unnamed: 0,key,Name,Age,Address,Qualification
0,K0,Jai,27,Nagpur,Btech
1,K1,Princi,24,Kanpur,B.A
2,K2,Gaurav,22,Allahabad,Bcom
3,K3,Anuj,32,Kannuaj,B.hons


#### 2. Merging DataFrames Using Multiple Keys

In [10]:
# We can also merge DataFrames based on more than one column by passing a list of column names to the on argument.




import pandas as pd

data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K1', 'K0', 'K1'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],}

data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K0', 'K0', 'K0'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1)

df1 = pd.DataFrame(data2)


# print(df, "\n\n", df1)
res1 = pd.merge(df, df1, on=['key', 'key1'])

res1

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27,Nagpur,Btech
1,K2,K0,Gaurav,22,Allahabad,Bcom


#### 3. Merging DataFrames Using the how Argument
##### We use how argument to merge specifies how to find which keys are to be included in the resulting table. 
##### If a key combination does not appear in either the left or right tables, the values in the joined table will be NA.
##### Here is a summary of the how options and their SQL equivalent names:
![image.png](attachment:b77b660f-a97a-45db-8f62-773dfdf581cc.png)

In [16]:
import pandas as pd

data1 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K1', 'K0', 'K1'],
         'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],}

data2 = {'key': ['K0', 'K1', 'K2', 'K3'],
         'key1': ['K0', 'K0', 'K0', 'K0'],
         'Address':['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
        'Qualification':['Btech', 'B.A', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1)

df1 = pd.DataFrame(data2)


# print(df, "\n\n", df1)
# res = pd.merge(df, df1, how='left', on=['key', 'key1'])

# res

# res1 = pd.merge(df, df1, how='right', on=['key', 'key1'])

# res1

# res2 = pd.merge(df, df1, how='outer', on=['key', 'key1'])

# res2

# res3 = pd.merge(df, df1, how='inner', on=['key', 'key1'])

# res3

Unnamed: 0,key,key1,Name,Age,Address,Qualification
0,K0,K0,Jai,27,Nagpur,Btech
1,K2,K0,Gaurav,22,Allahabad,Bcom


#### 4.Joining DataFrame
###### The .join() method in Pandas is used to combine columns of two DataFrames based on their indexes. 
###### It's a simple way of merging two DataFrames when the relationship between them is primarily based on their row indexes.
###### It is used when we want to combine DataFrames along their indexes rather than specific columns.

##### 1. Joining DataFrames Using dataframe.join()
###### If both DataFrames have the same index, we can use the dataframe.join() function to combine their columns. 
###### This method is useful when we want to merge DataFrames based on their row indexes rather than columns.

In [18]:
import pandas as pd


data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32]}

data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'],
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1,index=['K0', 'K1', 'K2', 'K3'])

df1 = pd.DataFrame(data2, index=['K0', 'K2', 'K3', 'K4'])


# print(df, "\n\n", df1)
# res = df.join(df1)

# res

res1 = df.join(df1, how='outer')

res1



Unnamed: 0,Name,Age,Address,Qualification
K0,Jai,27.0,Allahabad,MCA
K1,Princi,24.0,,
K2,Gaurav,22.0,Kannuaj,Phd
K3,Anuj,32.0,Allahabad,Bcom
K4,,,Kannuaj,B.hons


#### 2. Joining DataFrames Using the "on" Argument
##### If we want to join DataFrames based on a column (rather than the index), we can use the on argument. 
##### This allows us to specify which column(s) should be used to align the two DataFrames.

In [19]:
import pandas as pd

data1 = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'],
        'Age':[27, 24, 22, 32],
        'Key':['K0', 'K1', 'K2', 'K3']}

data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kannuaj'],
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1)

df1 = pd.DataFrame(data2, index=['K0', 'K2', 'K3', 'K4'])


# print(df, "\n\n", df1)
res2 = df.join(df1, on='Key')

res2

Unnamed: 0,Name,Age,Key,Address,Qualification
0,Jai,27,K0,Allahabad,MCA
1,Princi,24,K1,,
2,Gaurav,22,K2,Kannuaj,Phd
3,Anuj,32,K3,Allahabad,Bcom


#### 3. Joining DataFrames with Different Index Levels (Multi-Index)
##### In some cases, we may be working with DataFrames that have multi-level indexes.
##### The .join() function also supports joining DataFrames that have different index levels by specifying the index levels.

In [20]:
import pandas as pd

data1 = {'Name':['Jai', 'Princi', 'Gaurav'],
        'Age':[27, 24, 22]}

data2 = {'Address':['Allahabad', 'Kannuaj', 'Allahabad', 'Kanpur'],
        'Qualification':['MCA', 'Phd', 'Bcom', 'B.hons']}

df = pd.DataFrame(data1, index=pd.Index(['K0', 'K1', 'K2'], name='key'))

index = pd.MultiIndex.from_tuples([('K0', 'Y0'), ('K1', 'Y1'),
                                   ('K2', 'Y2'), ('K2', 'Y3')],
                                   names=['key', 'Y'])

df1 = pd.DataFrame(data2, index= index)


# print(df, "\n\n", df1)
result = df.join(df1, how='inner')

result

Unnamed: 0_level_0,Unnamed: 1_level_0,Name,Age,Address,Qualification
key,Y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
K0,Y0,Jai,27,Allahabad,MCA
K1,Y1,Princi,24,Kannuaj,Phd
K2,Y2,Gaurav,22,Allahabad,Bcom
K2,Y3,Gaurav,22,Kanpur,B.hons


## Sort Pandas DataFrame
### Pandas provides the sort_values() method which allows us to sort a DataFrame by one or more columns in either ascending or descending order.

#### 1. Sorting a DataFrame by a Single Column
###### The sort_values() method in Pandas makes it easy to sort our DataFrame by a single column. 
###### By default, it sorts in ascending order but we can customize this.
##### Parameters of sort_values():
###### by: Specifies the column to sort by.
###### ascending: A boolean (True for ascending, False for descending).
###### inplace: If True, the original DataFrame is modified otherwise a new sorted DataFrame is returned.
###### na_position: Controls where NaN values are placed. Use 'first' to put NaNs at the top or 'last' (default) to place them at the end.
###### ignore_index: If True, resets the index after sorting.

In [21]:
# ascending
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Score': [85, 90, 95, 80]}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Age')
print(sorted_df)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     95
3    David   40     80


In [22]:
# # descending
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],'Age': [25, 30, 35, 40],'Score': [85, 90, 95, 80]}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Age',ascending=False)
print(sorted_df)

      Name  Age  Score
3    David   40     80
2  Charlie   35     95
1      Bob   30     90
0    Alice   25     85


#### 2. Sorting a DataFrame by Multiple Columns
When sorting by multiple columns, Pandas allows us to specify a list of column names

In [23]:
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Score': [85, 90, 95, 80]}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by=['Age', 'Score'])
print(sorted_df)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     95
3    David   40     80


#### 3. Sorting DataFrame with Missing Values
In real-world datasets, missing values (NaNs) are common. 
By default sort_values() places NaN values at the end. If we need them at the top, we can use the na_position parameter.

In [24]:
import pandas as pd
data_with_nan = {"Name": ["Alice", "Bob", "Charlie", "David"],"Age": [28, 22, None, 22]}
df_nan = pd.DataFrame(data_with_nan)

sorted_df = df_nan.sort_values(by="Age", na_position="first")
print(sorted_df)

      Name   Age
2  Charlie   NaN
1      Bob  22.0
3    David  22.0
0    Alice  28.0


#### 4. Sorting by Index
###### In addition to sorting by column values, we may also want to sort a DataFrame based on its index. 
###### This can be done using the sort_index() method in Pandas. 
###### By default, sort_index() sorts the DataFrame based on the index in ascending order.

In [25]:
## ascending
import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Score': [85, 90, 95, 80]}
df = pd.DataFrame(data)

df_sorted_by_index = df.sort_index()
print(df_sorted_by_index)

      Name  Age  Score
0    Alice   25     85
1      Bob   30     90
2  Charlie   35     95
3    David   40     80


In [26]:
# # descending We can also sort by index in descending order by passing the ascending=False argument.

import pandas as pd
data = {'Name': ['Alice', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, 35, 40],
        'Score': [85, 90, 95, 80]}
df = pd.DataFrame(data)
df_sorted_by_index_desc = df.sort_index(ascending=False)
print(df_sorted_by_index_desc)


      Name  Age  Score
3    David   40     80
2  Charlie   35     95
1      Bob   30     90
0    Alice   25     85


#### 5. Choosing a Sorting Algorithm
##### Pandas provides different sorting algorithms that we can choose using the kind parameter. Available options are:

##### 1. QuickSort (kind='quicksort'): It is a highly efficient, divide-and-conquer sorting algorithm. 
###### It selects a "pivot" element and partitions the dataset into two halves: 
###### one with elements smaller than the pivot and the other with elements greater than the pivot.

In [27]:
import pandas as pd

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [28, 22, 25, 22, 28],
    "Score": [85, 90, 95, 80, 88]
}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Age', kind='quicksort')
print(sorted_df)

      Name  Age  Score
1      Bob   22     90
3    David   22     80
2  Charlie   25     95
0    Alice   28     85
4      Eve   28     88


##### 2. MergeSort (kind='mergesort'):
##### Divides the dataset into smaller subarrays, sorts them and then merges them back together in sorted order.

In [28]:
import pandas as pd

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [28, 22, 25, 22, 28],
    "Score": [85, 90, 95, 80, 88]
}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Age', kind='mergesort')
print(sorted_df)

      Name  Age  Score
1      Bob   22     90
3    David   22     80
2  Charlie   25     95
0    Alice   28     85
4      Eve   28     88


##### 3. HeapSort (kind= 'heapsort'):
It is another comparison-based sorting algorithm that builds a heap data structure to systematically extract the largest or smallest element and reorder the dataset.

In [29]:
import pandas as pd

data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [28, 22, 25, 22, 28],
    "Score": [85, 90, 95, 80, 88]
}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Age', kind='heapsort')
print(sorted_df)

      Name  Age  Score
1      Bob   22     90
3    David   22     80
2  Charlie   25     95
4      Eve   28     88
0    Alice   28     85


#### 6. Applying Custom Sorting Logic
##### We can also apply custom sorting logic using the key parameter. 
##### This is useful when we need to sort strings in a specific way such as ignoring case sensitivity.

In [30]:
import pandas as pd
data = {
    "Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "Age": [28, 22, 25, 22, 28],
    "Score": [85, 90, 95, 80, 88]
}
df = pd.DataFrame(data)

sorted_df = df.sort_values(by='Name', key=lambda col: col.str.lower())
print(sorted_df)

      Name  Age  Score
0    Alice   28     85
1      Bob   22     90
2  Charlie   25     95
3    David   22     80
4      Eve   28     88


## Pivot Table in Python using Pandas

In [35]:
# importing pandas
import pandas as pd

# creating dataframe
df = pd.DataFrame({'Product': ['Carrots', 'Broccoli', 'Banana', 'Banana',
                               'Beans', 'Orange', 'Broccoli', 'Banana'],
                   'Category': ['Vegetable', 'Vegetable', 'Fruit', 'Fruit',
                                'Vegetable', 'Fruit', 'Vegetable', 'Fruit'],
                   'Quantity': [8, 5, 3, 4, 5, 9, 11, 8],
                   'Amount': [270, 239, 617, 384, 626, 610, 62, 90]})
# df
## 1.Get the Total Sales of Each Product
## In this example, the DataFrame 'df' is transformed using a pivot table, aggregating the total 'Amount' for each unique 'Product' 
## and displaying the result with the sum of amounts for each product.
# pivot = df.pivot_table(index=['Product'],
#                        values=['Amount'],
#                        aggfunc='sum')

## 2: Get the Total Sales of Each Category
## In this example, a pivot table is created from the DataFrame 'df' to summarize the total 'Amount' sales for each unique 'Category,'
## employing the 'sum' aggregation function, and the result is printed.
# pivot = df.pivot_table(index=['Category'],
#                        values=['Amount'],
#                        aggfunc='sum')

## 3.Get Total Sales by Category and Product Both
## In this example, a pivot table is generated from the DataFrame 'df' to showcase the total 'Amount' sales for unique combinations of 'Product'
## and 'Category,' utilizing the 'sum' aggregation function. 
## The resulting pivot table is then printed.
# pivot = df.pivot_table(index=['Product', 'Category'],
#                        values=['Amount'], aggfunc='sum')

## 4: Get the Mean, Median, Minimum Sale by Category
## In this example, a pivot table is created from the DataFrame 'df' to display the median, mean, and minimum 'Amount' values categorized by 'Category.' 
## The aggregation functions 'median,' 'mean,' and 'min' are applied, and the resulting pivot table is printed.
## 

# 'mean', 'min'} will get median, mean and
# minimum of sales respectively
# pivot = df.pivot_table(index=['Category'], values=['Amount'],
#                        aggfunc={'median', 'mean', 'min'})

## 5: Get the Mean, Median, Minimum Sale by Product
## In this example, a pivot table is generated from the DataFrame 'df' to showcase the median, mean, 
## and minimum 'Amount' values for each unique 'Product.' The aggregation functions 'median,' 'mean,' and 'min' are applied,
## resulting in the pivot table, which is then printed.
## 
pivot = df.pivot_table(index=['Product'], values=['Amount'],
                       aggfunc={'median', 'mean', 'min'})

print(pivot)

              Amount              
                mean median    min
Product                           
Banana    363.666667  384.0   90.0
Beans     626.000000  626.0  626.0
Broccoli  150.500000  150.5   62.0
Carrots   270.000000  270.0  270.0
Orange    610.000000  610.0  610.0


####  这个项目直播完成