In [None]:
#Q1.

#1.pd.read_csv(): This function is used to read data from a CSV file into a Pandas DataFrame.

# Imports shared by all five examples in this cell.
import pandas as pd
import matplotlib.pyplot as plt

# Read CSV file into DataFrame.
# NOTE: 'data.csv' must exist in the working directory; if it does not, read_csv
# raises FileNotFoundError and the examples below never get to run.
csv_df = pd.read_csv('data.csv')

print(csv_df.head())

#2.df.head(): The head() function is used to display the first few rows of a DataFrame.

# Create a sample DataFrame
people = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
          'Age': [25, 30, 22, 28, 24]}

people_df = pd.DataFrame(people)

# Display the first few rows
print(people_df.head())

#3.df.info(): This function provides a concise summary of a DataFrame, including information about the data types and non-null values in each column.

# Create a sample DataFrame
short_people = {'Name': ['Alice', 'Bob', 'Charlie'],
                'Age': [25, 30, 22]}

short_people_df = pd.DataFrame(short_people)

# Display DataFrame information (info() prints its summary itself, so no print() is needed)
short_people_df.info()

#4.df.groupby(): The groupby() function is used to group data in a DataFrame based on one or more columns and then apply aggregate functions.

# Create a sample DataFrame
employees = {'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
             'Department': ['HR', 'IT', 'HR', 'IT', 'IT'],
             'Salary': [50000, 60000, 55000, 65000, 70000]}

employees_df = pd.DataFrame(employees)

# Group by Department and calculate average salary
avg_salary_by_department = employees_df.groupby('Department')['Salary'].mean()

print(avg_salary_by_department)


#5.df.plot(): This function is used to create basic plots directly from a DataFrame.

# Create a sample DataFrame
monthly_revenue = {'Month': ['Jan', 'Feb', 'Mar', 'Apr', 'May'],
                   'Revenue': [1000, 1200, 900, 1500, 1800]}

revenue_df = pd.DataFrame(monthly_revenue)

# Create a line plot of revenue
revenue_df.plot(x='Month', y='Revenue', kind='line')
plt.show()

In [1]:
#Q2.

#We can achieve this by using the reset_index function along with a custom index. Here's a Python function that re-indexes the DataFrame as described:

import pandas as pd

def reindex_dataframe(df):
    """Return a copy of ``df`` whose index is 1, 3, 5, ... (one odd label per row).

    The new index is named 'New_Index'. The existing index is discarded and
    the DataFrame passed in is left unchanged.
    """
    row_count = len(df)
    odd_labels = pd.Index(range(1, 2 * row_count, 2), name='New_Index')

    # reset_index(drop=True) hands back a new frame, so the caller's frame is
    # not touched when the index is replaced below.
    relabelled = df.reset_index(drop=True)
    relabelled.index = odd_labels
    return relabelled

# Example usage: the frame has three rows, so the new index labels are 1, 3 and 5.
data = {'A': [10, 20, 30], 'B': [40, 50, 60], 'C': [70, 80, 90]}
df = pd.DataFrame(data)

new_df = reindex_dataframe(df)
print(new_df)

#In this function, we build a new index that starts at 1 and increases by 2 for each row. We then call reset_index(drop=True) to discard the existing index (drop=True keeps it from being inserted as a column) and finally assign the new index to the DataFrame.

            A   B   C
New_Index            
1          10  40  70
3          20  50  80
5          30  60  90


In [4]:
#Q3.

import pandas as pd

def calculate_sum_of_first_three(df):
    """Print the sum of the first three entries of the 'Values' column.

    If ``df`` has no 'Values' column, an error message is printed instead.
    Nothing is returned in either case. When the column has fewer than three
    entries, all of them are summed.
    """
    if 'Values' in df.columns:
        leading_total = df['Values'].iloc[:3].sum()
        print("Sum of the first three values:", leading_total)
    else:
        print("Error: 'Values' column not found in DataFrame")

# Example DataFrame: the first three entries are 10, 20 and 30, so the printed sum is 60.
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Call the function (it prints the result itself and returns nothing)
calculate_sum_of_first_three(df)

Sum of the first three values: 60


In [5]:
#Q4.

#We can achieve this by using the apply() function along with the split() function to split the text into words and then calculate the length of the resulting list to get the word count. Here's a Python function that does exactly that:

import pandas as pd

def add_word_count_column(df):
    """Add a 'Word_Count' column with the number of words in each 'Text' entry.

    Words are the whitespace-separated pieces produced by ``str.split()``.
    Cells that are not strings (for example None or NaN for missing text)
    count as 0 words instead of raising AttributeError.

    The column is written onto ``df`` itself, and that same DataFrame is
    returned.
    """
    df['Word_Count'] = df['Text'].apply(
        # Only real strings have .split(); anything else contributes no words.
        lambda x: len(x.split()) if isinstance(x, str) else 0
    )
    return df

# Example usage: three short sentences; splitting on whitespace gives 5, 5 and 2 words.
data = {'Text': ["This is a sample text.", "Another example with more words.", "Short text."]}
df = pd.DataFrame(data)

# The function writes 'Word_Count' onto df itself and returns that same frame.
df_with_word_count = add_word_count_column(df)
print(df_with_word_count)

#This function add_word_count_column takes a DataFrame as input, applies a lambda function to each row in the 'Text' column to count the number of words, and then adds the 'Word_Count' column to the DataFrame. The example usage demonstrates how to use this function with a sample DataFrame.

                               Text  Word_Count
0            This is a sample text.           5
1  Another example with more words.           5
2                       Short text.           2


In [6]:
#Q5.

#DataFrame.size:
#DataFrame.size returns the total number of elements in the DataFrame, which is the product of the number of rows and the number of columns.
#This attribute provides a single integer value representing the total size of the DataFrame, including all its elements (cells).

import pandas as pd

# One small frame (3 rows, 2 columns) is used for both attributes.
data = {'A': [1, 2, 3], 'B': [4, 5, 6]}
df = pd.DataFrame(data)

print(df.size)  # Output: 6 (3 rows * 2 columns = 6 elements)


#DataFrame.shape:
#DataFrame.shape returns a tuple containing the number of rows and the number of columns in the DataFrame.
#This attribute provides a clearer understanding of the DataFrame's structure by explicitly showing the dimensions.

# Same frame as above, so size and shape can be compared directly.
print(df.shape)  # Output: (3, 2) (3 rows, 2 columns)


#In summary, DataFrame.size gives the total number of elements in the DataFrame, while DataFrame.shape gives a tuple of (number of rows, number of columns). Depending on our needs, we can use one or the other to learn about the DataFrame's dimensions.

6
(3, 2)


In [None]:
#Q6.

#We can use the read_excel() function to read data from an Excel file. This function allows us to read data from Excel files and create Pandas DataFrame objects, which we can then manipulate and analyze. Here's the basic syntax:
import pandas as pd

# Read an Excel file and create a DataFrame.
# NOTE(review): 'filename.xlsx' looks like a placeholder path; point it at a real
# workbook before running this cell.
df = pd.read_excel('filename.xlsx')

# You can also specify a specific sheet by name or index
# df = pd.read_excel('filename.xlsx', sheet_name='Sheet1')
# df = pd.read_excel('filename.xlsx', sheet_name=0)


In [None]:
#Q7.

import pandas as pd

def extract_username(email):
    """Return the part of ``email`` that precedes the first '@'.

    If the string contains no '@', the whole string is returned.
    """
    pieces = email.split('@')
    return pieces[0]

def add_username_column(df):
    """Add a 'Username' column derived from the 'Email' column.

    Each email is passed through ``extract_username`` via ``.apply()``. The
    column is written onto ``df`` itself, and that same DataFrame is returned.
    """
    usernames = df['Email'].apply(extract_username)
    df['Username'] = usernames
    return df

# Example usage: each address should yield the text in front of its '@'.
data = {'Email': ['john.doe@example.com', 'jane.smith@example.com', 'bob.jones@example.com']}
df = pd.DataFrame(data)

# 'Username' is added to df itself; df_with_username refers to that same frame.
df_with_username = add_username_column(df)
print(df_with_username)

#In this example, the extract_username function takes an email address as input and uses the split('@') method to split the email into a list containing the username and domain parts. Then, it returns the username part (index 0) of the list.

#The add_username_column function applies the extract_username function to each row in the 'Email' column using the .apply() method, and then assigns the results to a new 'Username' column in the DataFrame. Finally, it returns the modified DataFrame.

In [8]:
#Q8.

import pandas as pd

def select_rows(df):
    """Return the rows of ``df`` where column 'A' is above 5 and column 'B' is below 10.

    Both comparisons are strict, so A == 5 or B == 10 excludes a row.
    """
    a_above_five = df['A'].gt(5)
    b_below_ten = df['B'].lt(10)
    return df.loc[a_above_five & b_below_ten]

# Sample DataFrame: rows 1, 2 and 4 are the ones with A > 5 and B < 10.
data = {'A': [3, 8, 6, 2, 9],
        'B': [5, 2, 9, 3, 1],
        'C': [1, 7, 4, 5, 2]}
df = pd.DataFrame(data)

# Call the function and get the selected rows
selected_df = select_rows(df)

# Print the selected DataFrame (the original row labels are kept)
print(selected_df)

#The select_rows function filters the DataFrame based on the conditions specified and returns a new DataFrame containing only the selected rows.

   A  B  C
1  8  2  7
2  6  9  4
4  9  1  2


In [9]:
#Q9.

import pandas as pd

def calculate_stats(df):
    """Return ``(mean, median, std_dev)`` of the 'Values' column.

    The standard deviation comes from ``Series.std()`` with its default
    settings (the sample standard deviation).

    Raises:
        ValueError: if ``df`` has no 'Values' column.
    """
    if 'Values' not in df.columns:
        raise ValueError("DataFrame must contain a 'Values' column.")

    values = df['Values']
    return values.mean(), values.median(), values.std()

# Example usage: calculate_stats returns a (mean, median, std_dev) tuple, unpacked here.
data = {'Values': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)
mean_value, median_value, std_dev_value = calculate_stats(df)

print("Mean:", mean_value)
print("Median:", median_value)
print("Standard Deviation:", std_dev_value)

Mean: 30.0
Median: 30.0
Standard Deviation: 15.811388300841896


In [None]:
#Q10.

import pandas as pd

def calculate_moving_average(df, window=7):
    """Add a 'MovingAverage' column: the rolling mean of the 'Sales' column.

    Args:
        df: DataFrame with a 'Sales' column. The new column is written onto
            this frame itself.
        window: number of consecutive rows in each average. Defaults to 7,
            i.e. a 7-day window when there is one row per day.

    Returns:
        The same DataFrame, now carrying the 'MovingAverage' column.
    """
    # min_periods=1 makes the first rows average over however many values are
    # available so far instead of producing NaN until the window is full.
    df['MovingAverage'] = df['Sales'].rolling(window=window, min_periods=1).mean()
    return df

# Sample DataFrame with 'Sales' and 'Date' columns.
# One date is generated per sales figure (31 values -> 2023-01-01 through
# 2023-01-31) so both columns always have the same length; pd.DataFrame raises
# ValueError when the columns of a dict differ in length.
sales = [10, 15, 20, 18, 25, 30, 22, 19, 17, 23, 29, 35, 21, 16, 12, 28, 32, 24, 26, 14, 31, 27, 33, 37, 40, 36, 39, 42, 38, 41, 45]
data = {'Date': pd.date_range(start='2023-01-01', periods=len(sales)),
        'Sales': sales}
df = pd.DataFrame(data)

# Call the function to calculate moving average
df = calculate_moving_average(df)

print(df)

#In this example, the function calculate_moving_average() calculates the moving average of the 'Sales' column over a window of size 7 for each row in the DataFrame. The min_periods=1 argument ensures that even if there are fewer than 7 days of data available, the moving average is still computed.

In [11]:
#Q11.

import pandas as pd

def add_weekday_column(df):
    """Add a 'Weekday' column with the weekday name of each 'Date' entry.

    The 'Date' column is converted with ``pd.to_datetime`` and stored back on
    ``df``; the weekday names come from ``.dt.strftime('%A')``. Both columns
    are written onto ``df`` itself, and that same DataFrame is returned.
    """
    parsed_dates = pd.to_datetime(df['Date'])

    # Keep the converted datetimes on the frame, then derive the names from them.
    df['Date'] = parsed_dates
    df['Weekday'] = parsed_dates.dt.strftime('%A')

    return df

# Sample DataFrame: the dates are given as 'YYYY-MM-DD' strings and are
# converted to datetimes inside add_weekday_column.
data = {'Date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05']}
df = pd.DataFrame(data)

# Call the function to add the 'Weekday' column
df = add_weekday_column(df)

print(df)

#This function first converts the 'Date' column to the datetime format using pd.to_datetime(). Then, it creates a new column 'Weekday' using the .dt.strftime('%A') method, which extracts the weekday names from the datetime values in the 'Date' column.

        Date    Weekday
0 2023-01-01     Sunday
1 2023-01-02     Monday
2 2023-01-03    Tuesday
3 2023-01-04  Wednesday
4 2023-01-05   Thursday


In [12]:
#Q12.

import pandas as pd

def select_rows_between_dates(df, start_date, end_date):
    """Return the rows of ``df`` whose 'Date' lies between two dates, inclusive.

    Args:
        df: DataFrame with a 'Date' column (datetimes or values that
            ``pd.to_datetime`` can convert). It is not modified.
        start_date: lower bound; rows dated exactly ``start_date`` are kept.
        end_date: upper bound; rows dated exactly ``end_date`` are kept.

    Returns:
        A new DataFrame with the matching rows, keeping their original row
        labels, in which 'Date' holds the converted datetime values.
    """
    # Convert on a separate Series so that a read-only filter does not
    # overwrite the caller's 'Date' column as a side effect.
    dates = pd.to_datetime(df['Date'])

    mask = (dates >= start_date) & (dates <= end_date)

    # assign() returns a copy carrying the converted dates; the mask is then
    # applied to that copy.
    return df.assign(Date=dates)[mask]

# Example usage: keep only the rows dated in January 2023 (both bounds are inclusive),
# so the 2023-02-10 row is left out.
data = {'Date': ['2023-01-15', '2023-01-20', '2023-02-10', '2023-01-05']}
df = pd.DataFrame(data)
start_date = '2023-01-01'
end_date = '2023-01-31'

selected_data = select_rows_between_dates(df, start_date, end_date)
print(selected_data)

        Date
0 2023-01-15
1 2023-01-20
3 2023-01-05


In [None]:
#Q13.

#The first and foremost library that needs to be imported to use the basic functions of pandas is, unsurprisingly, the pandas library itself. We can import it using the following line of code:

import pandas as pd

#By convention, the pd alias is commonly used for the pandas library to make it easier to reference its functions and classes throughout our code. Once we've imported pandas, we can start using its powerful data manipulation and analysis capabilities.