# **Data Manipulation with pandas**

This notebook goes through the basics of Data Manipulation with pandas with python.

In [1]:
# Import necessary libraries
import pandas as pd

In [2]:
# Sample DataFrame
data = {
    'Name': ['John', 'Anna', 'Peter', 'Linda', 'James', 'Emily'],
    'Age': [28, 22, 35, 32, 30, 26],
    'Salary': [50000, 60000, 80000, 75000, 55000, 62000],
    'Department': ['HR', 'Finance', 'IT', 'HR', 'Finance', 'IT']
}

# Create the DataFrame
df = pd.DataFrame(data)

In [5]:
df.head()

Unnamed: 0,Name,Age,Salary,Department
0,John,28,50000,HR
1,Anna,22,60000,Finance
2,Peter,35,80000,IT
3,Linda,32,75000,HR
4,James,30,55000,Finance


In [8]:
# Transforming Data: Apply a function to columns or rows
# Let's increase everyone's salary by 10%

df['Salary'] = df['Salary'] * 1.1
df.head()

Unnamed: 0,Name,Age,Salary,Department
0,John,28,66550.0,HR
1,Anna,22,79860.0,Finance
2,Peter,35,106480.0,IT
3,Linda,32,99825.0,HR
4,James,30,73205.0,Finance


In [11]:
# Aggregating Data: Using groupby and aggregation functions
# Let's group by 'Department' and find the average salary and age

df_grouped = df.groupby('Department').agg({'Salary': 'mean', 'Age': 'mean'})
df_grouped.head()

Unnamed: 0_level_0,Salary,Age
Department,Unnamed: 1_level_1,Unnamed: 2_level_1
Finance,76532.5,26.0
HR,83187.5,30.0
IT,94501.0,30.5


In [12]:
# Slicing Data: Selecting specific rows or columns
# Slice to get the first three rows i.e. keep only the first three rows of the data frame

df_slice = df.iloc[:3]
df_slice.head()

Unnamed: 0,Name,Age,Salary,Department
0,John,28,66550.0,HR
1,Anna,22,79860.0,Finance
2,Peter,35,106480.0,IT


In [13]:
# Slicing by columns (Select only 'Name' and 'Salary' columns
# Slice to get the first two columns

df_slice_columns = df[['Name', 'Salary']]
df_slice_columns.head()

Unnamed: 0,Name,Salary
0,John,66550.0
1,Anna,79860.0
2,Peter,106480.0
3,Linda,99825.0
4,James,73205.0


In [14]:
# Indexing: Setting a column as index and working with index
# Let's set 'Name' as the index of the DataFrame

df_indexed = df.set_index('Name')
df_indexed.head()

Unnamed: 0_level_0,Age,Salary,Department
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
John,28,66550.0,HR
Anna,22,79860.0,Finance
Peter,35,106480.0,IT
Linda,32,99825.0,HR
James,30,73205.0,Finance


In [19]:
# Accessing rows by index value
# Accessing Anna row

df_indexed.loc['Anna']

Unnamed: 0,Anna
Age,22
Salary,79860.0
Department,Finance


In [20]:
# Resetting the index back to default

df_reset = df_indexed.reset_index()
df_reset.head()

Unnamed: 0,Name,Age,Salary,Department
0,John,28,66550.0,HR
1,Anna,22,79860.0,Finance
2,Peter,35,106480.0,IT
3,Linda,32,99825.0,HR
4,James,30,73205.0,Finance
