In [2]:
import pandas as pd
import numpy as np

In [3]:
# Column-oriented sample records: each keyword becomes a column label and
# each list holds that column's values, one entry per employee (five rows).
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    Age=[25, 30, 35, 28, 22],
    City=['New York', 'San Francisco', 'Los Angeles', 'Chicago', 'Boston'],
    Salary=[50000, 70000, 80000, 65000, 45000],
)

In [4]:
# Build the table from the column dictionary and echo it under a heading.
# A single print with a newline separator emits the heading and the frame
# on consecutive lines.
df = pd.DataFrame(data=data)
print("Original DataFrame:", df, sep="\n")

Original DataFrame:
      Name  Age           City  Salary
0    Alice   25       New York   50000
1      Bob   30  San Francisco   70000
2  Charlie   35    Los Angeles   80000
3    David   28        Chicago   65000
4      Eva   22         Boston   45000


In [5]:
# Basic information about the DataFrame:
# index range, per-column non-null counts, dtypes and approximate memory usage.
print("\nDataFrame Info:")
# info() is called bare (not wrapped in print) because it prints its own report.
df.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Salary  5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [6]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the
# numeric columns, shown under a heading.
print("\nSummary Statistics:", df.describe(), sep="\n")


Summary Statistics:
             Age        Salary
count   5.000000      5.000000
mean   28.000000  62000.000000
std     4.949747  14404.860291
min    22.000000  45000.000000
25%    25.000000  50000.000000
50%    28.000000  65000.000000
75%    30.000000  70000.000000
max    35.000000  80000.000000


In [7]:
# Select a single column by label; the result is a Series that keeps the
# frame's row index.
print("\nAge column:", df['Age'], sep="\n")


Age column:
0    25
1    30
2    35
3    28
4    22
Name: Age, dtype: int64


In [8]:
# Same single-column selection, spelled with the explicit label-based indexer.
print(df.loc[:, 'Salary'])

0    50000
1    70000
2    80000
3    65000
4    45000
Name: Salary, dtype: int64


In [7]:
# Append an 'Experience' column; the list supplies one value per existing
# row, in row order. The frame is modified in place.
df['Experience'] = [3, 7, 12, 5, 1]
print("\nDataFrame with new 'Experience' column:", df, sep="\n")


DataFrame with new 'Experience' column:
      Name  Age           City  Salary  Experience
0    Alice   25       New York   50000           3
1      Bob   30  San Francisco   70000           7
2  Charlie   35    Los Angeles   80000          12
3    David   28        Chicago   65000           5
4      Eva   22         Boston   45000           1


In [9]:
# Append a 'bonus' column (one value per existing row, in row order),
# then show the updated table.
df['bonus'] = [
    500,
    800,
    750,
    1000,
    500,
]
print(df)

      Name  Age           City  Salary  bonus
0    Alice   25       New York   50000    500
1      Bob   30  San Francisco   70000    800
2  Charlie   35    Los Angeles   80000    750
3    David   28        Chicago   65000   1000
4      Eva   22         Boston   45000    500


In [8]:
# Keep only the rows whose 'Age' is strictly greater than 28, using a
# boolean mask with the label-based indexer.
print("\nEmployees older than 28:", df.loc[df['Age'].gt(28)], sep="\n")


Employees older than 28:
      Name  Age           City  Salary  Experience
1      Bob   30  San Francisco   70000           7
2  Charlie   35    Los Angeles   80000          12


In [10]:
# Keep only the rows whose 'bonus' is strictly greater than 700.
print(df.loc[df['bonus'].gt(700)])

      Name  Age           City  Salary  bonus
1      Bob   30  San Francisco   70000    800
2  Charlie   35    Los Angeles   80000    750
3    David   28        Chicago   65000   1000


In [9]:
# Show the rows ordered from highest to lowest salary. sort_values returns
# a new frame, so df itself keeps its original row order.
print(
    "\nDataFrame sorted by Salary (descending):",
    df.sort_values(by='Salary', ascending=False),
    sep="\n",
)


DataFrame sorted by Salary (descending):
      Name  Age           City  Salary  Experience
2  Charlie   35    Los Angeles   80000          12
1      Bob   30  San Francisco   70000           7
3    David   28        Chicago   65000           5
0    Alice   25       New York   50000           3
4      Eva   22         Boston   45000           1


In [11]:
# Show the rows ordered alphabetically by name (ascending is the default).
print(df.sort_values(by='Name'))

      Name  Age           City  Salary  bonus
0    Alice   25       New York   50000    500
1      Bob   30  San Francisco   70000    800
2  Charlie   35    Los Angeles   80000    750
3    David   28        Chicago   65000   1000
4      Eva   22         Boston   45000    500


In [10]:
# Group the rows by city and average the salaries within each group; the
# result is a Series indexed by city.
print(
    "\nAverage salary by city:",
    df.groupby('City')['Salary'].agg('mean'),
    sep="\n",
)


Average salary by city:
City
Boston           45000.0
Chicago          65000.0
Los Angeles      80000.0
New York         50000.0
San Francisco    70000.0
Name: Salary, dtype: float64


In [11]:
# Deriving a column from an existing one.
# Scalar arithmetic on a Series is vectorized, so the whole column is scaled
# in one operation instead of calling a Python lambda once per row via
# .apply(). The factor 0.8 keeps 80% of each salary.
df['Salary_After_Tax'] = df['Salary'] * 0.8
print("\nDataFrame with new 'Salary_After_Tax' column:")
print(df)


DataFrame with new 'Salary_After_Tax' column:
      Name  Age           City  Salary  Experience  Salary_After_Tax
0    Alice   25       New York   50000           3           40000.0
1      Bob   30  San Francisco   70000           7           56000.0
2  Charlie   35    Los Angeles   80000          12           64000.0
3    David   28        Chicago   65000           5           52000.0
4      Eva   22         Boston   45000           1           36000.0


In [12]:
# Raise every bonus by a flat 1000. Adding a scalar to a Series applies it
# element-wise, so no per-row .apply(lambda ...) call is needed.
df['revised_bonus'] = df['bonus'] + 1000
print(df)

      Name  Age           City  Salary  bonus  revised_bonus
0    Alice   25       New York   50000    500           1500
1      Bob   30  San Francisco   70000    800           1800
2  Charlie   35    Los Angeles   80000    750           1750
3    David   28        Chicago   65000   1000           2000
4      Eva   22         Boston   45000    500           1500


In [12]:
# Handling missing data
# NaN is a floating-point value and cannot be stored in an integer column.
# Cast 'Age' to float64 explicitly before inserting the gap, rather than
# relying on pandas to silently upcast the column during the assignment
# (that implicit upcast is deprecated in recent pandas releases).
df['Age'] = df['Age'].astype('float64')
df.loc[2, 'Age'] = np.nan
print("\nDataFrame with a missing value:")
print(df)




DataFrame with a missing value:
      Name   Age           City  Salary  Experience  Salary_After_Tax
0    Alice  25.0       New York   50000           3           40000.0
1      Bob  30.0  San Francisco   70000           7           56000.0
2  Charlie   NaN    Los Angeles   80000          12           64000.0
3    David  28.0        Chicago   65000           5           52000.0
4      Eva  22.0         Boston   45000           1           36000.0


In [13]:
# Show the table without any row that contains a missing value. dropna()
# returns a new frame; df itself is left unchanged.
print("\nDropping rows with missing values:", df.dropna(), sep="\n")


Dropping rows with missing values:
    Name   Age           City  Salary  Experience  Salary_After_Tax
0  Alice  25.0       New York   50000           3           40000.0
1    Bob  30.0  San Francisco   70000           7           56000.0
3  David  28.0        Chicago   65000           5           52000.0
4    Eva  22.0         Boston   45000           1           36000.0


In [14]:
# Relabel the 'Salary' column as 'Annual_Salary'. rename() returns a new
# frame, which is bound back to df; other columns keep their labels.
df = df.rename({'Salary': 'Annual_Salary'}, axis='columns')
print("\nDataFrame with renamed 'Salary' column:", df, sep="\n")


DataFrame with renamed 'Salary' column:
      Name   Age           City  Annual_Salary  Experience  Salary_After_Tax
0    Alice  25.0       New York          50000           3           40000.0
1      Bob  30.0  San Francisco          70000           7           56000.0
2  Charlie   NaN    Los Angeles          80000          12           64000.0
3    David  28.0        Chicago          65000           5           52000.0
4      Eva  22.0         Boston          45000           1           36000.0


In [14]:
# Relabel the 'Name' column as 'Employee_name' and show the result.
df = df.rename({'Name': 'Employee_name'}, axis='columns')
print(df)

  Employee_name  Age           City  Salary  bonus  revised_bonus
0         Alice   25       New York   50000    500           1500
1           Bob   30  San Francisco   70000    800           1800
2       Charlie   35    Los Angeles   80000    750           1750
3         David   28        Chicago   65000   1000           2000
4           Eva   22         Boston   45000    500           1500
