# Data Analysis
### Introduction to Pandas

In [1]:
### 
# Importing Pandas
import pandas as pd
import numpy as np

# Checking the installed version of Pandas
print("Pandas version:", pd.__version__)

Pandas version: 1.5.3


In [2]:
### Creating Pandas Series

data = [10, 20, 30, 40, 50]
series = pd.Series(data)
print("\nPandas Series:")
display(series)


Pandas Series:


0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
# Customizing index in a Series
series_custom_index = pd.Series(data, index=['a', 'b', 'c', 'd', 'e'])
print("\nPandas Series with Custom Index:")
display(series_custom_index)


Pandas Series with Custom Index:


a    10
b    20
c    30
d    40
e    50
dtype: int64

In [4]:
### Creating a Pandas DataFrame

data_dict = {
    'Name': ['Lynn', 'Shane', 'Michael', 'Harvey', 'Ken'],
    'Age': [25, 30, 35, 40, 28],
    'Salary': [50000, 60000, 70000, 80000, 55000]
}
df = pd.DataFrame(data_dict)
print("\nPandas DataFrame:")
display(df)


Pandas DataFrame:


Unnamed: 0,Name,Age,Salary
0,Alice,25,50000
1,Bob,30,60000
2,Charlie,35,70000
3,David,40,80000
4,Eve,28,55000


In [5]:
### Loading Data into Pandas
# Loading a CSV file (Example: Titanic dataset from an online source)
path = "data_analysis_employee_dataset.csv"
df = pd.read_csv(path)
print("\nFirst 5 rows of Employee Dataset:")
display(df.head())



First 5 rows of Employee Dataset:


Unnamed: 0,Employee ID,Name,Department,Salary,Joining Date,Age,Performance Rating,Bonus,Email,Attendance (%),Data Insert Time
0,462,Grace Garcia,Operations,90343.0,2016-12-02,41.0,Excellent,5623.02,grace.garcia@company.com,85.79,2024-01-03 22:03:46
1,174,Uma Brown,Sales,72711.0,2011-05-27,27.0,Excellent,6812.47,uma.brown@company.com,79.57,2024-01-17 17:02:48
2,475,Xander Davis,Marketing,41324.0,2017-03-03,50.0,Average,4425.41,xander.davis@company.com,84.95,2024-01-02 14:48:57
3,256,Yara Rodriguez,Operations,67751.0,2012-12-21,27.0,Excellent,4217.53,yara.rodriguez@company.com,94.47,2024-01-17 20:44:58
4,205,Uma Miller,Operations,43436.0,2011-12-30,24.0,Good,6466.02,uma.miller@company.com,88.83,2024-01-01 09:24:19


In [6]:
### Basic DataFrame Operations
# Displaying column names
print("\nColumn Names:")
print(df.columns.tolist())


Column Names:
['Employee ID', 'Name', 'Department', 'Salary', 'Joining Date', 'Age', 'Performance Rating', 'Bonus', 'Email', 'Attendance (%)', 'Data Insert Time']


In [7]:
# Checking data types
print("\nData Types:")
display(df.dtypes)


Data Types:


Employee ID             int64
Name                   object
Department             object
Salary                float64
Joining Date           object
Age                   float64
Performance Rating     object
Bonus                 float64
Email                  object
Attendance (%)        float64
Data Insert Time       object
dtype: object

In [8]:
# Checking the shape of the dataset
print("\nShape of the Dataset (Rows, Columns):", df.shape)

# Summary statistics of numerical columns
print("\nSummary Statistics:")
display(df.describe())


Shape of the Dataset (Rows, Columns): (500, 11)

Summary Statistics:


Unnamed: 0,Employee ID,Salary,Age,Bonus,Attendance (%)
count,500.0,499.0,499.0,401.0,500.0
mean,350.5,80911.767535,40.991984,8079.372643,85.46284
std,144.481833,38571.680398,11.732799,4619.233616,11.499715
min,101.0,2000.0,15.0,202.2,-10.0
25%,225.75,59623.0,31.0,5316.06,77.1175
50%,350.5,78756.0,42.0,7464.88,85.58
75%,475.25,97568.5,51.0,9849.38,93.115
max,600.0,500000.0,95.0,49240.34,200.0


### Working with DataFrames

In [9]:
# 1.Accessing Data in DataFrames
print("\n### Accessing Columns")
print(df['Name'])  # Accessing a single column


### Accessing Columns
0        Grace Garcia
1           Uma Brown
2        Xander Davis
3      Yara Rodriguez
4          Uma Miller
            ...      
495     Ivy Rodriguez
496        Mona Brown
497       Frank Jones
498     Quincy Miller
499         Uma Jones
Name: Name, Length: 500, dtype: object


In [10]:
print("\n### Accessing Multiple Columns")
print(df[['Name', 'Department']])  # Accessing multiple columns


### Accessing Multiple Columns
               Name  Department
0      Grace Garcia  Operations
1         Uma Brown       Sales
2      Xander Davis   Marketing
3    Yara Rodriguez  Operations
4        Uma Miller  Operations
..              ...         ...
495   Ivy Rodriguez       Sales
496      Mona Brown     Finance
497     Frank Jones       Sales
498   Quincy Miller          HR
499       Uma Jones     Finance

[500 rows x 2 columns]


In [11]:
print("\n### Accessing Rows using loc")
print(df.loc[0])  # Access first row


### Accessing Rows using loc
Employee ID                                462
Name                              Grace Garcia
Department                          Operations
Salary                                 90343.0
Joining Date                        2016-12-02
Age                                       41.0
Performance Rating                   Excellent
Bonus                                  5623.02
Email                 grace.garcia@company.com
Attendance (%)                           85.79
Data Insert Time           2024-01-03 22:03:46
Name: 0, dtype: object


In [12]:
print("\n### Accessing Rows using iloc")
print(df.iloc[0:5])  # Access first five rows


### Accessing Rows using iloc
   Employee ID            Name  Department   Salary Joining Date   Age  \
0          462    Grace Garcia  Operations  90343.0   2016-12-02  41.0   
1          174       Uma Brown       Sales  72711.0   2011-05-27  27.0   
2          475    Xander Davis   Marketing  41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  Operations  67751.0   2012-12-21  27.0   
4          205      Uma Miller  Operations  43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data Insert Time  
0  2024-01-03 22:03:46  
1  2024-01-

In [13]:
# 2. Masking and Boolean Indexing
print("\n### Filtering: Employees in IT Department")
it_employees = df[df['Department'] == 'IT']
print(it_employees)


### Filtering: Employees in IT Department
     Employee ID             Name Department    Salary Joining Date   Age  \
12           507        Bob Brown         IT  101642.0   2017-10-13  29.0   
16           596  Xander Williams         IT   93932.0   2019-06-28  28.0   
19           509      Wendy Brown         IT   61732.0   2017-10-27  24.0   
23           457      Hank Miller         IT  117052.0   2016-10-28  49.0   
38           310   Nathan Johnson         IT  114740.0   2014-01-03  52.0   
..           ...              ...        ...       ...          ...   ...   
457          290      Paul Miller         IT   88747.0   2013-08-16  27.0   
459          288       Ivy Miller         IT   59508.0   2013-08-02  24.0   
472          292      Hank Miller         IT  106412.0   2013-08-30  26.0   
488          315      Kathy Jones         IT   45237.0   2014-02-07  55.0   
492          121  Xander Williams         IT   65342.0   2010-05-21  32.0   

    Performance Rating     Bonus

In [14]:
print("\n### Filtering: Employees with Salary > 50000")
high_salary = df[df['Salary'] > 50000]
print(high_salary)


### Filtering: Employees with Salary > 50000
     Employee ID            Name  Department    Salary Joining Date   Age  \
0            462    Grace Garcia  Operations   90343.0   2016-12-02  41.0   
1            174       Uma Brown       Sales   72711.0   2011-05-27  27.0   
3            256  Yara Rodriguez  Operations   67751.0   2012-12-21  27.0   
5            495    Quincy Brown          HR   92224.0   2017-07-21  55.0   
6            478   Charlie Smith  Operations   74578.0   2017-03-24  40.0   
..           ...             ...         ...       ...          ...   ...   
494          172  Wendy Williams       Sales   69299.0   2011-05-13  43.0   
495          207   Ivy Rodriguez       Sales  116213.0   2012-01-13  39.0   
496          371      Mona Brown     Finance  110588.0   2015-03-06  31.0   
497          449     Frank Jones       Sales   63714.0   2016-09-02  46.0   
499          203       Uma Jones     Finance  119909.0   2011-12-16  35.0   

    Performance Rating     Bo

In [15]:
# 3. Modifying Data
print("\n### Adding a New Column")
df['Annual_Bonus'] = df['Salary'] * 0.1  # Adding new column based on calculation
print(df.head())


### Adding a New Column
   Employee ID            Name  Department   Salary Joining Date   Age  \
0          462    Grace Garcia  Operations  90343.0   2016-12-02  41.0   
1          174       Uma Brown       Sales  72711.0   2011-05-27  27.0   
2          475    Xander Davis   Marketing  41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  Operations  67751.0   2012-12-21  27.0   
4          205      Uma Miller  Operations  43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data Insert Time  Annual_Bonus  
0  2024-01-03 22:03:46      

In [16]:
print("\n### Updating an Existing Column")
df['Department'] = df['Department'].str.upper()  # Converting department names to uppercase
print(df.head())


### Updating an Existing Column
   Employee ID            Name  Department   Salary Joining Date   Age  \
0          462    Grace Garcia  OPERATIONS  90343.0   2016-12-02  41.0   
1          174       Uma Brown       SALES  72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING  41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS  67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS  43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data Insert Time  Annual_Bonus  
0  2024-01-03 22:03:

In [17]:
print("\n### Renaming Columns")
df.rename(columns={'Name': 'Employee_Name', 'Salary': 'Monthly_Salary'}, inplace=True)
print(df.head())


### Renaming Columns
   Employee ID   Employee_Name  Department  Monthly_Salary Joining Date   Age  \
0          462    Grace Garcia  OPERATIONS         90343.0   2016-12-02  41.0   
1          174       Uma Brown       SALES         72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING         41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS         67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS         43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data Insert Time  Annu

In [18]:
# 4. Handling Missing Data
print("\n### Checking for Missing Values")
print(df.isnull().sum())


### Checking for Missing Values
Employee ID            0
Employee_Name          0
Department             0
Monthly_Salary         1
Joining Date           0
Age                    1
Performance Rating     0
Bonus                 99
Email                  0
Attendance (%)         0
Data Insert Time       0
Annual_Bonus           1
dtype: int64


In [19]:
print("\n### Filling Missing Values with Mean")
df.fillna(df.mean(numeric_only=True), inplace=True)
print(df.head())


### Filling Missing Values with Mean
   Employee ID   Employee_Name  Department  Monthly_Salary Joining Date   Age  \
0          462    Grace Garcia  OPERATIONS         90343.0   2016-12-02  41.0   
1          174       Uma Brown       SALES         72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING         41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS         67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS         43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data I

In [20]:
print("\n### Dropping Rows with Missing Values")
df.dropna(inplace=True)
print(df.head())


### Dropping Rows with Missing Values
   Employee ID   Employee_Name  Department  Monthly_Salary Joining Date   Age  \
0          462    Grace Garcia  OPERATIONS         90343.0   2016-12-02  41.0   
1          174       Uma Brown       SALES         72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING         41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS         67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS         43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data 

In [21]:
# 5. Dropping Rows and Columns
print("\n### Dropping a Column")
df.drop(columns=['Annual_Bonus'], inplace=True)
print(df.head())

print("\n### Dropping a Row")
df.drop(index=0, inplace=True)
print(df.head())


### Dropping a Column
   Employee ID   Employee_Name  Department  Monthly_Salary Joining Date   Age  \
0          462    Grace Garcia  OPERATIONS         90343.0   2016-12-02  41.0   
1          174       Uma Brown       SALES         72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING         41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS         67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS         43436.0   2011-12-30  24.0   

  Performance Rating    Bonus                       Email  Attendance (%)  \
0          Excellent  5623.02    grace.garcia@company.com           85.79   
1          Excellent  6812.47       uma.brown@company.com           79.57   
2            Average  4425.41    xander.davis@company.com           84.95   
3          Excellent  4217.53  yara.rodriguez@company.com           94.47   
4               Good  6466.02      uma.miller@company.com           88.83   

      Data Insert Time  
0 

### Data Exploration & Transformation

In [22]:
# 1. Sorting and Ranking Data
print("\n### Sorting by Monthly Salary")
sorted_df = df.sort_values(by='Monthly_Salary', ascending=False)
print(sorted_df.head())


### Sorting by Monthly Salary
     Employee ID     Employee_Name Department  Monthly_Salary Joining Date  \
88           413      Wendy Miller      SALES        500000.0   2015-12-25   
139          366         Eve Smith         IT        480000.0   2015-01-30   
423          150  Olivia Rodriguez    FINANCE        450000.0   2010-12-10   
499          203         Uma Jones    FINANCE        119909.0   2011-12-16   
409          302      Rachel Jones  MARKETING        119634.0   2013-11-08   

      Age Performance Rating     Bonus                         Email  \
88   58.0          Excellent  47902.45      wendy.miller@company.com   
139  48.0               Good  49240.34         eve.smith@company.com   
423  34.0            Average  33113.21  olivia.rodriguez@company.com   
499  35.0               Good  17407.30         uma.jones@company.com   
409  95.0      Below Average  17104.70      rachel.jones@company.com   

     Attendance (%)     Data Insert Time  
88            84.40  202

In [23]:
print("\n### Ranking Employees by Performance")
df['Performance_Rank'] = df['Performance Rating'].rank(ascending=False)
print(df[['Employee_Name', 'Performance Rating', 'Performance_Rank']].head())


### Ranking Employees by Performance
    Employee_Name Performance Rating  Performance_Rank
1       Uma Brown          Excellent             255.5
2    Xander Davis            Average             451.0
3  Yara Rodriguez          Excellent             255.5
4      Uma Miller               Good             152.5
5    Quincy Brown          Excellent             255.5


In [None]:
df_sample = pd.DataFrame({
    'Name': ['Ali', 'Bella', 'Chris', 'Dina'],
    'Performance Rating': [90, 75, 90, 60]
})

df_sample['Performance_Rank'] = df_sample['Performance Rating'].rank(ascending=False)

# rank() uses method='average'
# Tied values get the average of their positions

# For the two 90s:
# Positions would be 1 and 2
# Average â†’ (1 + 2) / 2 = 1.5

In [24]:
# 2. Aggregation and Grouping Operations
print("\n### Grouping by Department and Aggregating Monthly Salary")
department_salary = df.groupby('Department')['Monthly_Salary'].mean().reset_index()
print(department_salary)

print("\n### Aggregating Multiple Statistics")
department_stats = df.groupby('Department').agg({'Monthly_Salary': ['mean', 'max', 'min'], 'Age': 'median'})
print(department_stats)


### Grouping by Department and Aggregating Monthly Salary
   Department  Monthly_Salary
0     FINANCE    85820.566667
1          HR    77313.289855
2          IT    82402.478261
3   MARKETING    76758.360465
4  OPERATIONS    75461.719101
5       SALES    87790.859829

### Aggregating Multiple Statistics
           Monthly_Salary                           Age
                     mean       max      min     median
Department                                             
FINANCE      85820.566667  450000.0  43343.0  40.495992
HR           77313.289855  119605.0  40301.0  39.000000
IT           82402.478261  480000.0   2000.0  43.000000
MARKETING    76758.360465  119634.0   5000.0  43.000000
OPERATIONS   75461.719101  118832.0  40384.0  43.000000
SALES        87790.859829  500000.0  41150.0  42.000000


In [25]:
# 3. Creating New Columns and Modifying Existing Ones
print("\n### Creating a New Column: Salary after Bonus")
df['Salary_After_Bonus'] = df['Monthly_Salary'] + df['Bonus']
print(df.head())

print("\n### Modifying an Existing Column: Standardizing Attendance")
df['Attendance (%)'] = df['Attendance (%)'] / 100  # Convert percentage to decimal
print(df.head())


### Creating a New Column: Salary after Bonus
   Employee ID   Employee_Name  Department  Monthly_Salary Joining Date   Age  \
1          174       Uma Brown       SALES         72711.0   2011-05-27  27.0   
2          475    Xander Davis   MARKETING         41324.0   2017-03-03  50.0   
3          256  Yara Rodriguez  OPERATIONS         67751.0   2012-12-21  27.0   
4          205      Uma Miller  OPERATIONS         43436.0   2011-12-30  24.0   
5          495    Quincy Brown          HR         92224.0   2017-07-21  55.0   

  Performance Rating        Bonus                       Email  Attendance (%)  \
1          Excellent  6812.470000       uma.brown@company.com           79.57   
2            Average  4425.410000    xander.davis@company.com           84.95   
3          Excellent  4217.530000  yara.rodriguez@company.com           94.47   
4               Good  6466.020000      uma.miller@company.com           88.83   
5          Excellent  8079.372643    quincy.brown@company.com

In [26]:
# 4. Applying Functions with apply() and map()
print("\n### Categorizing Employees Based on Salary")
def salary_category(salary):
    if salary > 70000:
        return 'High'
    elif salary > 40000:
        return 'Medium'
    else:
        return 'Low'

df['Salary_Category'] = df['Monthly_Salary'].apply(salary_category)
df[['Employee_Name', 'Monthly_Salary', 'Salary_Category']].head()


### Categorizing Employees Based on Salary


Unnamed: 0,Employee_Name,Monthly_Salary,Salary_Category
1,Uma Brown,72711.0,High
2,Xander Davis,41324.0,Medium
3,Yara Rodriguez,67751.0,Medium
4,Uma Miller,43436.0,Medium
5,Quincy Brown,92224.0,High


In [27]:
print("\n### Mapping Department Names to Codes")
department_map = {'IT': 1, 'HR': 2, 'Finance': 3, 'Sales': 4, 'Marketing': 5}
df['Department_Code'] = df['Department'].map(department_map)
df[['Department', 'Department_Code']].head()


### Mapping Department Names to Codes


Unnamed: 0,Department,Department_Code
1,SALES,
2,MARKETING,
3,OPERATIONS,
4,OPERATIONS,
5,HR,2.0


### Combining DataFrames

In [None]:
# Load dataset
df_experience = pd.read_csv('data_analysis_employee_experience_dataset.csv')

# 1. Appending DataFrames
print("\n### Appending DataFrames")
df_appended = df.append(df_experience, ignore_index=True) # ignore_index=True drops old indexes and avoids duplicate index values

df_appended.tail() 


### Appending DataFrames


  df_appended = df.append(df_experience, ignore_index=True)


Unnamed: 0,Employee ID,Employee_Name,Department,Monthly_Salary,Joining Date,Age,Performance Rating,Bonus,Email,Attendance (%),Data Insert Time,Performance_Rank,Salary_After_Bonus,Salary_Category,Department_Code,Years of Experience
993,207,,,,,,,,,,,,,,,1.0
994,371,,,,,,,,,,,,,,,17.0
995,449,,,,,,,,,,,,,,,7.0
996,536,,,,,,,,,,,,,,,13.0
997,203,,,,,,,,,,,,,,,4.0


In [None]:
# 2. Concatenating DataFrames
print("\n### Concatenating DataFrames")
df_concat = pd.concat([df, df_experience], ignore_index=True)

# pd.concat([df, df_experience], join='inner') keeps only the common columns between both DataFrames

df_concat.tail()


### Concatenating DataFrames


Unnamed: 0,Employee ID,Employee_Name,Department,Monthly_Salary,Joining Date,Age,Performance Rating,Bonus,Email,Attendance (%),Data Insert Time,Performance_Rank,Salary_After_Bonus,Salary_Category,Department_Code,Years of Experience
993,207,,,,,,,,,,,,,,,1.0
994,371,,,,,,,,,,,,,,,17.0
995,449,,,,,,,,,,,,,,,7.0
996,536,,,,,,,,,,,,,,,13.0
997,203,,,,,,,,,,,,,,,4.0


In [30]:
# 3. Merging DataFrames on 'Employee ID'
print("\n### Merging DataFrames on 'Employee ID'")
df_merged = pd.merge(df, df_experience, on='Employee ID', how='inner')
df_merged.head()


### Merging DataFrames on 'Employee ID'


Unnamed: 0,Employee ID,Employee_Name,Department,Monthly_Salary,Joining Date,Age,Performance Rating,Bonus,Email,Attendance (%),Data Insert Time,Performance_Rank,Salary_After_Bonus,Salary_Category,Department_Code,Years of Experience
0,174,Uma Brown,SALES,72711.0,2011-05-27,27.0,Excellent,6812.47,uma.brown@company.com,0.7957,2024-01-17 17:02:48,255.5,79523.47,High,,7
1,475,Xander Davis,MARKETING,41324.0,2017-03-03,50.0,Average,4425.41,xander.davis@company.com,0.8495,2024-01-02 14:48:57,451.0,45749.41,Medium,,15
2,256,Yara Rodriguez,OPERATIONS,67751.0,2012-12-21,27.0,Excellent,4217.53,yara.rodriguez@company.com,0.9447,2024-01-17 20:44:58,255.5,71968.53,Medium,,11
3,205,Uma Miller,OPERATIONS,43436.0,2011-12-30,24.0,Good,6466.02,uma.miller@company.com,0.8883,2024-01-01 09:24:19,152.5,49902.02,Medium,,8
4,495,Quincy Brown,HR,92224.0,2017-07-21,55.0,Excellent,8079.372643,quincy.brown@company.com,0.712,2024-01-18 23:19:02,255.5,100303.372643,High,2.0,7


In [None]:
# 4. Joining DataFrames with Different Keys
print("\n### Joining DataFrames using 'Employee ID' as index")
df.set_index('Employee ID', inplace=True)
df_experience.set_index('Employee ID', inplace=True)
df_joined = df.join(df_experience, how='left') # no longer need to specify the join key explicitly
df_joined.head()


### Joining DataFrames using 'Employee ID' as index


Unnamed: 0_level_0,Employee_Name,Department,Monthly_Salary,Joining Date,Age,Performance Rating,Bonus,Email,Attendance (%),Data Insert Time,Performance_Rank,Salary_After_Bonus,Salary_Category,Department_Code,Years of Experience
Employee ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
174,Uma Brown,SALES,72711.0,2011-05-27,27.0,Excellent,6812.47,uma.brown@company.com,0.7957,2024-01-17 17:02:48,255.5,79523.47,High,,7
475,Xander Davis,MARKETING,41324.0,2017-03-03,50.0,Average,4425.41,xander.davis@company.com,0.8495,2024-01-02 14:48:57,451.0,45749.41,Medium,,15
256,Yara Rodriguez,OPERATIONS,67751.0,2012-12-21,27.0,Excellent,4217.53,yara.rodriguez@company.com,0.9447,2024-01-17 20:44:58,255.5,71968.53,Medium,,11
205,Uma Miller,OPERATIONS,43436.0,2011-12-30,24.0,Good,6466.02,uma.miller@company.com,0.8883,2024-01-01 09:24:19,152.5,49902.02,Medium,,8
495,Quincy Brown,HR,92224.0,2017-07-21,55.0,Excellent,8079.372643,quincy.brown@company.com,0.712,2024-01-18 23:19:02,255.5,100303.372643,High,2.0,7


#### More Examples

In [33]:
# Create sample DataFrames
df1 = pd.DataFrame({
    'Employee ID': [101, 102, 103],
    'Name': ['Lynn', 'Shane', 'Michael'],
    'Department': ['HR', 'Finance', 'IT']
})

df2 = pd.DataFrame({
    'Employee ID': [104, 105],
    'Name': ['Talor', 'Carol'],
    'Department': ['IT', 'Finance']
})

print("DataFrame 1:")
display(df1)
print("DataFrame 2:")
display(df2)

DataFrame 1:


Unnamed: 0,Employee ID,Name,Department
0,101,Alice,HR
1,102,Bob,Finance
2,103,Charlie,IT


DataFrame 2:


Unnamed: 0,Employee ID,Name,Department
0,104,David,IT
1,105,Eve,Finance


In [34]:
### Appending DataFrames
# Append df2 to df1
df_appended = df1.append(df2, ignore_index=True)
print("\nAppended DataFrame:")
display(df_appended)



Appended DataFrame:


  df_appended = df1.append(df2, ignore_index=True)


Unnamed: 0,Employee ID,Name,Department
0,101,Alice,HR
1,102,Bob,Finance
2,103,Charlie,IT
3,104,David,IT
4,105,Eve,Finance


In [35]:
### Concatenating DataFrames
# Concatenating along rows (default axis=0)
df_concat = pd.concat([df1, df2], ignore_index=True)
print("\nConcatenated DataFrame:")
display(df_concat)


Concatenated DataFrame:


Unnamed: 0,Employee ID,Name,Department
0,101,Alice,HR
1,102,Bob,Finance
2,103,Charlie,IT
3,104,David,IT
4,105,Eve,Finance


In [36]:
### Merging DataFrames
# Create another DataFrame with Salary information
df_salary = pd.DataFrame({
    'Employee ID': [101, 102, 103, 104, 105],
    'Salary': [50000, 60000, 70000, 80000, 55000]
})

print("\nSalary DataFrame:")
display(df_salary)


Salary DataFrame:


Unnamed: 0,Employee ID,Salary
0,101,50000
1,102,60000
2,103,70000
3,104,80000
4,105,55000


In [37]:
# Merge on 'Employee ID'
df_merged = pd.merge(df_concat, df_salary, on='Employee ID', how='inner')
print("\nMerged DataFrame:")
display(df_merged)


Merged DataFrame:


Unnamed: 0,Employee ID,Name,Department,Salary
0,101,Alice,HR,50000
1,102,Bob,Finance,60000
2,103,Charlie,IT,70000
3,104,David,IT,80000
4,105,Eve,Finance,55000


In [39]:
### Joining DataFrames
# Create another DataFrame with Bonus information
df_bonus = pd.DataFrame({
    'Employee ID': [101, 102, 103, 106],
    'Bonus': [5000, 6000, 7000, 4000]
})

print("\nBonus DataFrame:")
display(df_bonus)

# Left Join - Keep all records from df_merged
df_joined = df_merged.merge(df_bonus, on='Employee ID', how='left')
print("\nDataFrame after Left Join with Bonus:")
display(df_joined)


Bonus DataFrame:


Unnamed: 0,Employee ID,Bonus
0,101,5000
1,102,6000
2,103,7000
3,106,4000



DataFrame after Left Join with Bonus:


Unnamed: 0,Employee ID,Name,Department,Salary,Bonus
0,101,Alice,HR,50000,5000.0
1,102,Bob,Finance,60000,6000.0
2,103,Charlie,IT,70000,7000.0
3,104,David,IT,80000,
4,105,Eve,Finance,55000,


In [40]:
# Outer Join - Keep all records from both DataFrames
df_outer = df_merged.merge(df_bonus, on='Employee ID', how='outer')
print("\nDataFrame after Outer Join:")
display(df_outer)


DataFrame after Outer Join:


Unnamed: 0,Employee ID,Name,Department,Salary,Bonus
0,101,Alice,HR,50000.0,5000.0
1,102,Bob,Finance,60000.0,6000.0
2,103,Charlie,IT,70000.0,7000.0
3,104,David,IT,80000.0,
4,105,Eve,Finance,55000.0,
5,106,,,,4000.0


### Data Analysis Operations

In [41]:
# 1. Working with Numerical Data
print("\n### Numerical Data Analysis")
print("Mean Salary:", df['Monthly_Salary'].mean())
print("Total Bonus Paid:", df['Bonus'].sum())
print("Standard Deviation of Age:", df['Age'].std())


### Numerical Data Analysis
Mean Salary: 80892.86726960936
Total Bonus Paid: 4034063.3016957603
Standard Deviation of Age: 11.732798899796439


In [42]:
# 2. Working with Text Data
print("\n### Working with Text Data")
df['Employee_Name_Upper'] = df['Employee_Name'].str.upper()
print(df[['Employee_Name', 'Employee_Name_Upper']].head())

print("\nExtracting Email Domains")
df['Email_Domain'] = df['Email'].str.split('@').str[1]
print(df[['Email', 'Email_Domain']].head())


### Working with Text Data
              Employee_Name Employee_Name_Upper
Employee ID                                    
174               Uma Brown           UMA BROWN
475            Xander Davis        XANDER DAVIS
256          Yara Rodriguez      YARA RODRIGUEZ
205              Uma Miller          UMA MILLER
495            Quincy Brown        QUINCY BROWN

Extracting Email Domains
                                  Email Email_Domain
Employee ID                                         
174               uma.brown@company.com  company.com
475            xander.davis@company.com  company.com
256          yara.rodriguez@company.com  company.com
205              uma.miller@company.com  company.com
495            quincy.brown@company.com  company.com


In [43]:
# 3. Handling Datetime Data
print("\n### Handling Datetime Data")
df['Joining Date'] = pd.to_datetime(df['Joining Date'])
df['Year Joined'] = df['Joining Date'].dt.year
df['Month Joined'] = df['Joining Date'].dt.month
print(df[['Joining Date', 'Year Joined', 'Month Joined']].head())


### Handling Datetime Data
            Joining Date  Year Joined  Month Joined
Employee ID                                        
174           2011-05-27         2011             5
475           2017-03-03         2017             3
256           2012-12-21         2012            12
205           2011-12-30         2011            12
495           2017-07-21         2017             7


In [44]:
# 4. Pivot Tables and Cross-Tabulations
print("\n### Pivot Tables")
pivot_salary = df.pivot_table(values='Monthly_Salary', index='Department', aggfunc='mean')
print(pivot_salary)

print("\n### Cross Tabulation of Department and Salary Category")
crosstab_salary = pd.crosstab(df['Department'], df['Salary_Category'])
print(crosstab_salary)


### Pivot Tables
            Monthly_Salary
Department                
FINANCE       85820.566667
HR            77313.289855
IT            82402.478261
MARKETING     76758.360465
OPERATIONS    75461.719101
SALES         87790.859829

### Cross Tabulation of Department and Salary Category
Salary_Category  High  Low  Medium
Department                        
FINANCE            59    0      31
HR                 40    0      29
IT                 56    1      35
MARKETING          48    1      37
OPERATIONS         49    0      40
SALES              52    0      21


### Exploratory Data Analysis (EDA)

In [45]:
# 1. Understanding Descriptive Statistics
print("\n### Descriptive Statistics")
print(df.describe())


### Descriptive Statistics
       Monthly_Salary         Age         Bonus  Attendance (%)  \
count      499.000000  499.000000    499.000000      499.000000   
mean     80892.867270   40.991968   8084.295194        0.854622   
std      38569.360379   11.732799   4138.389653        0.115112   
min       2000.000000   15.000000    202.200000       -0.100000   
25%      59623.000000   31.000000   5760.005000        0.771050   
50%      78756.000000   42.000000   8079.372643        0.855700   
75%      97568.500000   51.000000   9239.410000        0.931200   
max     500000.000000   95.000000  49240.340000        2.000000   

       Performance_Rank  Salary_After_Bonus  Department_Code  Year Joined  \
count        499.000000          499.000000       161.000000   499.000000   
mean         250.000000        88977.162463         1.428571  2014.292585   
std          141.272199        42072.298040         0.496416     2.778027   
min           50.500000         2202.200000         1.000000

In [46]:
# 2. Detecting Outliers and Missing Values
print("\n### Checking for Missing Values")
print(df.isnull().sum())


### Checking for Missing Values
Employee_Name            0
Department               0
Monthly_Salary           0
Joining Date             0
Age                      0
Performance Rating       0
Bonus                    0
Email                    0
Attendance (%)           0
Data Insert Time         0
Performance_Rank         0
Salary_After_Bonus       0
Salary_Category          0
Department_Code        338
Employee_Name_Upper      0
Email_Domain             0
Year Joined              0
Month Joined             0
dtype: int64


In [47]:
print("\n### Identifying Outliers using IQR Method")
Q1 = df['Monthly_Salary'].quantile(0.25)
Q3 = df['Monthly_Salary'].quantile(0.75)
IQR = Q3 - Q1
outliers = df[(df['Monthly_Salary'] < (Q1 - 1.5 * IQR)) | (df['Monthly_Salary'] > (Q3 + 1.5 * IQR))]
print(outliers)


### Identifying Outliers using IQR Method
                Employee_Name Department  Monthly_Salary Joining Date   Age  \
Employee ID                                                                   
413              Wendy Miller      SALES        500000.0   2015-12-25  58.0   
366                 Eve Smith         IT        480000.0   2015-01-30  48.0   
428                 Leo Jones         IT          2000.0   2016-04-08  44.0   
150          Olivia Rodriguez    FINANCE        450000.0   2010-12-10  34.0   

            Performance Rating     Bonus                         Email  \
Employee ID                                                              
413                  Excellent  47902.45      wendy.miller@company.com   
366                       Good  49240.34         eve.smith@company.com   
428                    Average    202.20         leo.jones@company.com   
150                    Average  33113.21  olivia.rodriguez@company.com   

             Attendance (%)     Data 