In [None]:
Q1: List any five functions of the pandas library with execution.

# 'read_csv()': This function is used to read data from a CSV file and create a pandas DataFrame.
import pandas as pd
df = pd.read_csv('mydata.csv')


In [None]:
# 'head()': This function is used to display the first n rows of a pandas DataFrame.

df.head(10) # display the first 10 rows


In [None]:
#'describe()': This function is used to generate descriptive statistics of a pandas DataFrame.

df.describe() # generate descriptive statistics of the DataFrame


In [None]:
# 'groupby()': This function is used to group the data in a pandas DataFrame by one or more columns.

df.groupby(['category', 'sub_category']).sum() # group the data by 'category' and 'sub_category' columns and compute the sum of each group


In [None]:
# 'plot()': This function is used to plot data from a pandas DataFrame.
df.plot(kind='bar', x='year', y='sales') # plot a bar chart of the 'sales' column by 'year'


In [1]:
# Q2. Given a Pandas DataFrame df with columns 'A', 'B', and 'C', write a Python function to re-index the
# DataFrame with a new index that starts from 1 and increments by 2 for each row.

# We can use the 'set_index()' function of the pandas DataFrame to set a new index for the DataFrame. Here's how we can do it:

import pandas as pd

def reindex_dataframe(df):
    new_index = pd.Index(range(1, 2*len(df)+1, 2), name='new_index')
    df.set_index(new_index, inplace=True)
    return df



In [2]:
# You can call this function with your DataFrame as an argument to re-index the DataFrame with the new index. 
df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]})
print(df)

# Output:
#    A  B  C
# 0  1  4  7
# 1  2  5  8
# 2  3  6  9

df = reindex_dataframe(df)
print(df)

# Output:
#            A  B  C
# new_index         
# 1          1  4  7
# 3          2  5  8
# 5          3  6  9


   A  B  C
0  1  4  7
1  2  5  8
2  3  6  9
           A  B  C
new_index         
1          1  4  7
3          2  5  8
5          3  6  9


In [3]:
#Q3. You have a Pandas DataFrame df with a column named 'Values'. Write a Python function that
# iterates over the DataFrame and calculates the sum of the first three values in the 'Values' column. The
# function should print the sum to the console.

# Python function that iterates over a Pandas DataFrame and calculates the sum of the first three values in the 'Values' column:

import pandas as pd

def sum_first_three(df):
    sum = 0
    for i in range(3):
        sum += df.loc[i, 'Values']
    print('Sum of the first three values:', sum)


In [4]:
# You can call this function with your DataFrame as an argument to calculate the sum of the first three values in the 'Values' column.

df = pd.DataFrame({'Values': [10, 20, 30, 40, 50]})
sum_first_three(df)

# Output:
# Sum of the first three values: 6


Sum of the first three values: 60


In [5]:
# Q4. Given a Pandas DataFrame df with a column 'Text', write a Python function to create a new column
# 'Word_Count' that contains the number of words in each row of the 'Text' column.

#You can use the 'apply()' function of the Pandas DataFrame to apply a function to each row of a column and create a 
# new column based on the result. Here's how you can create a new column 'Word_Count' in a Pandas DataFrame 'df' that 
# contains the number of words in each row of the  'Text' column:

import pandas as pd

def count_words(row):
    return len(row['Text'].split())

def add_word_count(df):
    df['Word_Count'] = df.apply(count_words, axis=1)
    return df



In [6]:
# You can call this function with your DataFrame as an argument to create a new column 
# 'Word_Count' that contains the number of words  in each row of the 'Text' column. 

df = pd.DataFrame({'Text': ['This is the first row', 'This is the second row', 'This is the third row']})
print(df)

# Output:
#                     Text
# 0   This is the first row
# 1  This is the second row
# 2    This is the third row

df = add_word_count(df)
print(df)

# Output:
#                     Text  Word_Count
# 0   This is the first row           5
# 1  This is the second row           5
# 2    This is the third row           5


                     Text
0   This is the first row
1  This is the second row
2   This is the third row
                     Text  Word_Count
0   This is the first row           5
1  This is the second row           5
2   This is the third row           5


# Q5. How are DataFrame.size() and DataFrame.shape() different?
## 'DataFrame.size()' and 'DataFrame.shape()' are two different methods of the Pandas DataFrame object that return different information about the DataFrame.

### 'DataFrame.size()' returns the number of elements in the DataFrame, which is equal to the total number of cells or values in the DataFrame. It is equivalent to the product of the number of rows and columns in the DataFrame.


In [7]:
# Example 

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.size)   # Output: 6


6


# 'DataFrame.shape()' returns a tuple that contains the number of rows and columns in the DataFrame. The first element of the tuple is the number of rows, and the second element is the number of columns

In [8]:
# Example

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
print(df.shape)  # Output: (3, 2)


(3, 2)


In [None]:
# Q6. Which function of pandas do we use to read an excel file?

# To read an Excel file into a Pandas DataFrame, we can use the 'read_excel()' function of Pandas. 
# Here is the syntax for the 'read_excel()' function:

import pandas as pd

df = pd.read_excel('filename.xlsx', sheet_name='sheetname')


# Q7. You have a Pandas DataFrame df that contains a column named 'Email' that contains email
# addresses in the format 'username@domain.com'. Write a Python function that creates a new column
# 'Username' in df that contains only the username part of each email address.

### To extract the username part of each email address in a Pandas DataFrame 'df' that contains a column named 'Email', we can use the 'str' attribute of the DataFrame and the 'split()' function. 


In [10]:
# Here is how we can create a new column 'Username' in the DataFrame that contains only the username part of each email address:

import pandas as pd

def extract_username(email):
    return email.split('@')[0]

def add_username(df):
    df['Username'] = df['Email'].str.extract('(\w+)', expand=False)
    return df


In [12]:
# You can call this function with your DataFrame as an argument to create a new column 'Username' that contains only the username 
# part of each email address. For example:

df = pd.DataFrame({'Email': ['john.doe@example.com', 'jane.doe@example.com', 'james@example.com']})
print(df)

# Output:
#                    Email
# 0   john.doe@example.com
# 1   jane.doe@example.com
# 2       james@example.com

df = add_username(df)
print(df)

# Output:
#                    Email Username
# 0   john.doe@example.com     john
# 1   jane.doe@example.com     jane
# 2       james@example.com    james


## As you can see, the function creates a new column 'Username' that contains only the username part of each email address. 
# The 'str.extract()' function is another way to extract the username using regular expressions. 
# In this case, we use the regular expression '(\w+)' to match one or more word characters,
# which correspond to the username part of the email address.


                  Email
0  john.doe@example.com
1  jane.doe@example.com
2     james@example.com
                  Email Username
0  john.doe@example.com     john
1  jane.doe@example.com     jane
2     james@example.com    james


In [13]:
# Q8. You have a Pandas DataFrame df with columns 'A', 'B', and 'C'. Write a Python function that selects
# all rows where the value in column 'A' is greater than 5 and the value in column 'B' is less than 10. The
# function should return a new DataFrame that contains only the selected rows.

## To select all rows from a Pandas DataFrame 'df' where the value in column 'A' is greater than 5 and the value 
## in column 'B' is less than 10, we can use boolean indexing. Here is how we can write a function that 
## selects the rows and returns a new DataFrame:

import pandas as pd

def select_rows(df):
    selected_rows = df[(df['A'] > 5) & (df['B'] < 10)]
    return selected_rows


In [14]:
# You can call this function with your DataFrame as an argument to select the rows that 
# satisfy the conditions and return a new DataFrame that contains only the selected rows. For example:

df = pd.DataFrame({'A': [1, 6, 7, 3], 'B': [5, 2, 9, 7], 'C': [10, 20, 30, 40]})
print(df)

# Output:
#    A  B   C
# 0  1  5  10
# 1  6  2  20
# 2  7  9  30
# 3  3  7  40

selected_rows = select_rows(df)
print(selected_rows)

# Output:
#    A  B   C
# 2  7  9  30

# As you can see, the function selects only the row with index 2, which satisfies both conditions, and returns a
# new DataFrame that contains only this row.

   A  B   C
0  1  5  10
1  6  2  20
2  7  9  30
3  3  7  40
   A  B   C
1  6  2  20
2  7  9  30


In [15]:
# Q9. Given a Pandas DataFrame df with a column 'Values', write a Python function to calculate the mean,
# median, and standard deviation of the values in the 'Values' column.

## To calculate the mean, median, and standard deviation of the values in a Pandas DataFrame 'df' with a column 'Values', 
## we can use the mean(), median(), and std() functions of the DataFrame object in Pandas. Here is how we can write a 
## function to calculate these statistics:

import pandas as pd

def calculate_statistics(df):
    mean = df['Values'].mean()
    median = df['Values'].median()
    std = df['Values'].std()
    return mean, median, std


In [17]:
# In the above function, we use the mean(), median(), and std() functions of the DataFrame object to calculate 
# the mean, median, and standard deviation of the values in the 'Values' column of the DataFrame. 
# We store these values in separate variables  and return them as a tuple.

## You can call this function with your DataFrame as an argument to calculate the mean, median, and standard 
## deviation of the values in the 'Values' column.  For example:

df = pd.DataFrame({'Values': [1, 2, 3, 4, 5]})
print(df)

# Output:
#    Values
# 0       1
# 1       2
# 2       3
# 3       4
# 4       5

mean, median, std = calculate_statistics(df)
print('Mean:', mean)
print('Median:', median)
print('Standard deviation:', std)

# Output:
# Mean: 3.0
# Median: 3.0
# Standard deviation: 1.5811388300841898


## As you can see, the function calculates the mean, median, and standard deviation of the values in the 
## 'Values' column of the DataFrame and returns them as a tuple. You can print these values or use them
## for further analysis or visualization.


   Values
0       1
1       2
2       3
3       4
4       5
Mean: 3.0
Median: 3.0
Standard deviation: 1.5811388300841898


In [18]:
# Q10. Given a Pandas DataFrame df with a column 'Sales' and a column 'Date', write a Python function to
# create a new column 'MovingAverage' that contains the moving average of the sales for the past 7 days
# for each row in the DataFrame. The moving average should be calculated using a window of size 7 and
# should include the current day.

## To calculate the moving average of the sales for the past 7 days for each row in a Pandas DataFrame 'df' with 
## columns 'Sales' and 'Date', we can use the rolling() function of the DataFrame object in Pandas. Here is how we can write a 
## function to calculate the moving average:

import pandas as pd

def calculate_moving_average(df):
    ma = df['Sales'].rolling(window=7, min_periods=1).mean()
    df['MovingAverage'] = ma
    return df



In [19]:
# You can call this function with your DataFrame as an argument to calculate the moving average of the sales for 
# the past 7 days for each row. 

df = pd.DataFrame({'Date': ['2023-02-25', '2023-02-26', '2023-02-27', '2023-02-28', '2023-03-01', '2023-03-02', '2023-03-03'],
                   'Sales': [10, 20, 30, 40, 50, 60, 70]})
df['Date'] = pd.to_datetime(df['Date'])
print(df)

# Output:
#         Date  Sales
# 0 2023-02-25     10
# 1 2023-02-26     20
# 2 2023-02-27     30
# 3 2023-02-28     40
# 4 2023-03-01     50
# 5 2023-03-02     60
# 6 2023-03-03     70

df = calculate_moving_average(df)
print(df)

# Output:
#         Date  Sales  MovingAverage
# 0 2023-02-25     10      10.000000
# 1 2023-02-26     20      15.000000
# 2 2023-02-27     30      20.000000
# 3 2023-02-28     40      25.000000
# 4 2023-03-01     50      30.000000
# 5 2023-03-02     60      35.000000
# 6 2023-03-03     70      40.000000



        Date  Sales
0 2023-02-25     10
1 2023-02-26     20
2 2023-02-27     30
3 2023-02-28     40
4 2023-03-01     50
5 2023-03-02     60
6 2023-03-03     70
        Date  Sales  MovingAverage
0 2023-02-25     10           10.0
1 2023-02-26     20           15.0
2 2023-02-27     30           20.0
3 2023-02-28     40           25.0
4 2023-03-01     50           30.0
5 2023-03-02     60           35.0
6 2023-03-03     70           40.0


In [20]:
# Q11. You have a Pandas DataFrame df with a column 'Date'. Write a Python function that creates a new
# column 'Weekday' in the DataFrame. The 'Weekday' column should contain the weekday name (e.g.
# Monday, Tuesday) corresponding to each date in the 'Date' column.

## To create a new column 'Weekday' in the DataFrame containing the weekday name corresponding to each date in the 
## 'Date' column, you can use  the dt.weekday_name attribute of the pandas DatetimeIndex.

import pandas as pd

def add_weekday_column(df):
    df['Weekday'] = pd.to_datetime(df['Date']).dt.weekday_name
    return df


In [28]:
import pandas as pd

def add_weekday_column(df):
    # Convert the 'Date' column to a pandas datetime object
    df['Date'] = pd.to_datetime(df['Date'])
    
    # Extract the weekday from the 'Date' column and add it as a new column 'Weekday'
    df['Weekday'] = df['Date'].dt.weekday_name
    
    return df


In [None]:
# You can call this function by passing your Pandas DataFrame as an argument:

df = pd.DataFrame({'Date': ['2023-03-01', '2023-03-02', '2023-03-03']})
df['Date'] = pd.to_datetime(df['Date'])
df = add_weekday_column(df)
print(df)

In [33]:
# Q12. Given a Pandas DataFrame df with a column 'Date' that contains timestamps, write a Python
# function to select all rows where the date is between '2023-01-01' and '2023-01-31'.

# You can use the pandas.DataFrame.loc accessor with a boolean mask to select the rows where the date is 
# between '2023-01-01' and '2023-01-31'. Here's a sample function that you can use:

import pandas as pd

def select_january_dates(df):
    mask = (df['Date'] >= '2023-01-01') & (df['Date'] <= '2023-01-31')
    return df.loc[mask]


In [34]:
# To use this function, you can simply pass your DataFrame to it:

df = pd.DataFrame({'Date': ['2023-01-01', '2023-01-15', '2023-01-31', '2023-02-15']})
df['Date'] = pd.to_datetime(df['Date'])
january_dates = select_january_dates(df)
print(january_dates)


        Date
0 2023-01-01
1 2023-01-15
2 2023-01-31


In [None]:
# Q13. To use the basic functions of pandas, what is the first and foremost necessary library that needs to be imported?

#To use the basic functions of pandas, you need to import the pandas library. You can import pandas using the following code:

import pandas as pd

# The'pd'alias is commonly used for pandas, but you can choose any other alias 
# that you prefer. Once you have imported pandas, you can use its functions and classes to create, 
# manipulate, and analyze data in various ways. Some of the commonly used functions and classes in pandas include 'pandas.DataFrame',
# 'pandas.Series', 'pandas.read_csv()', 'pandas.concat()', 'pandas.merge()', and many others.
