# Data Analysis
### Introduction to Pandas

In [None]:
### 
# Importing Pandas
import pandas as pd
import numpy as np

# Checking the installed version of Pandas.
# Printing it up front records which release produced the outputs below.
print("Pandas version:", pd.__version__)

In [None]:
### Creating Pandas Series
# A Series is a one-dimensional labelled array. No index is supplied here,
# so pandas assigns the default integer labels 0..n-1.
data = [10, 20, 30, 40, 50]
series = pd.Series(data=data)

print("\nPandas Series:")
display(series)

In [None]:
# Customizing index in a Series
# `index=` replaces the default 0..n-1 labels; it needs exactly one label per value.
series_custom_index = pd.Series(
    data=data,
    index=['a', 'b', 'c', 'd', 'e'],
)

print("\nPandas Series with Custom Index:")
display(series_custom_index)

In [None]:
### Creating a Pandas DataFrame
# Each dictionary key becomes a column name and each list supplies that
# column's values (all lists must have the same length).
data_dict = {
    'Name': ['Lynn', 'Shane', 'Michael', 'Harvey', 'Ken'],
    'Age': [25, 30, 35, 40, 28],
    'Salary': [50000, 60000, 70000, 80000, 55000],
}
df = pd.DataFrame.from_dict(data_dict)

print("\nPandas DataFrame:")
display(df)

In [None]:
### Loading Data into Pandas
# Load the employee dataset from a local CSV file (the relative path is
# resolved against the notebook's working directory).
# NOTE: this rebinds `df`, replacing the small hand-built DataFrame created
# above; every later cell operates on the employee dataset.
path = "data_analysis_employee_dataset.csv"
df = pd.read_csv(path)
print("\nFirst 5 rows of Employee Dataset:")
display(df.head())


In [None]:
### Basic DataFrame Operations
# Displaying column names as a plain Python list
print("\nColumn Names:")
print(list(df.columns))

In [None]:
# Checking data types
# `dtypes` returns a Series with one entry per column: the column name is the
# label and the column's dtype is the value.
print("\nData Types:")
display(df.dtypes)

In [None]:
# Checking the shape of the dataset: `shape` is a (row_count, column_count) tuple
n_rows, n_cols = df.shape
print("\nShape of the Dataset (Rows, Columns):", (n_rows, n_cols))

# Summary statistics of numerical columns (count, mean, std, min, quartiles, max)
summary_stats = df.describe()
print("\nSummary Statistics:")
display(summary_stats)

### Working with DataFrames

In [None]:
# 1. Accessing Data in DataFrames
# Indexing with a single column label returns that column as a Series.
print("\n### Accessing Columns")
print(df['Name'])  # Accessing a single column

In [None]:
# Indexing with a *list* of labels (note the double brackets) returns a
# DataFrame containing just those columns, in the order listed.
print("\n### Accessing Multiple Columns")
print(df[['Name', 'Department']])  # Accessing multiple columns

In [None]:
# .loc selects by index *label*, not by position. Label 0 is the first row
# here only because read_csv produced the default 0..n-1 index.
print("\n### Accessing Rows using loc")
print(df.loc[0])  # Access the row labelled 0 (the first row with the default index)

In [None]:
# .iloc selects by integer *position*; the slice end is exclusive, so 0:5
# yields positions 0, 1, 2, 3 and 4 regardless of the index labels.
print("\n### Accessing Rows using iloc")
print(df.iloc[0:5])  # Access first five rows

In [None]:
# 2. Masking and Boolean Indexing
# Comparing a column to a value yields a boolean Series (the mask);
# indexing the frame with that mask keeps only the rows where it is True.
print("\n### Filtering: Employees in IT Department")
is_it_department = df['Department'] == 'IT'
it_employees = df[is_it_department]
print(it_employees)

In [None]:
# Same masking technique with a numeric comparison (strictly greater than).
print("\n### Filtering: Employees with Salary > 50000")
earns_over_50k = df['Salary'] > 50000
high_salary = df[earns_over_50k]
print(high_salary)

In [None]:
# 3. Modifying Data
# Assigning to a column label that does not exist yet creates the column.
print("\n### Adding a New Column")
df['Annual_Bonus'] = df['Salary'].mul(0.1)  # 10% of Salary, computed element-wise
print(df.head())

In [None]:
# NOTE: this overwrites the original casing of 'Department' in place, so any
# later comparison or lookup against this column must use upper-case values.
print("\n### Updating an Existing Column")
df['Department'] = df['Department'].str.upper()  # Converting department names to uppercase
print(df.head())

In [None]:
# Old-name -> new-name mapping; columns not listed keep their names.
# All later cells refer to 'Employee_Name' and 'Monthly_Salary'.
print("\n### Renaming Columns")
column_renames = {
    'Name': 'Employee_Name',
    'Salary': 'Monthly_Salary',
}
df.rename(columns=column_renames, inplace=True)
print(df.head())

In [None]:
# 4. Handling Missing Data
# isna() flags every missing cell; summing the flags column-wise gives the
# number of missing values per column.
print("\n### Checking for Missing Values")
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
# Only numeric columns have a mean, so only their gaps are filled here;
# missing values in non-numeric columns are left untouched.
print("\n### Filling Missing Values with Mean")
numeric_column_means = df.mean(numeric_only=True)
df.fillna(value=numeric_column_means, inplace=True)
print(df.head())

In [None]:
# Remove every row that still contains at least one missing value
# (axis=0 / how='any' are the defaults, spelled out for clarity).
print("\n### Dropping Rows with Missing Values")
df.dropna(axis=0, how='any', inplace=True)
print(df.head())

In [None]:
# 5. Dropping Rows and Columns
# DataFrame.drop raises KeyError when a requested label is absent. That can
# happen here in two ways:
#   * on a re-run, because 'Annual_Bonus' and row 0 were already removed, and
#   * on a fresh run, if the earlier dropna() step already removed the row
#     labelled 0.
# errors='ignore' makes both drops no-ops in those cases, so the cell is safe
# under Restart & Run All and under repeated execution.
print("\n### Dropping a Column")
df.drop(columns=['Annual_Bonus'], inplace=True, errors='ignore')
print(df.head())

print("\n### Dropping a Row")
# index=0 is an index *label*, not a position.
df.drop(index=0, inplace=True, errors='ignore')
print(df.head())

### Data Exploration & Transformation

In [None]:
# 1. Sorting and Ranking Data
# sort_values returns a new, reordered frame; `df` itself is left as it was.
print("\n### Sorting by Monthly Salary")
sorted_df = df.sort_values('Monthly_Salary', ascending=False)  # highest salary first
top_of_sorted = sorted_df.head()
print(top_of_sorted)

In [None]:
# Rank 1 goes to the highest rating (ascending=False). method='average' is the
# default tie rule: tied ratings share the mean of the positions they occupy.
print("\n### Ranking Employees by Performance")
performance_rating = df['Performance Rating']
df['Performance_Rank'] = performance_rating.rank(method='average', ascending=False)
rank_columns = ['Employee_Name', 'Performance Rating', 'Performance_Rank']
print(df[rank_columns].head())

In [None]:
# Small worked example showing how rank() treats ties.
df_sample = pd.DataFrame(
    [
        ('Ali', 90),
        ('Bella', 75),
        ('Chris', 90),
        ('Dina', 60),
    ],
    columns=['Name', 'Performance Rating'],
)

df_sample['Performance_Rank'] = (
    df_sample['Performance Rating'].rank(method='average', ascending=False)
)

# rank() defaults to method='average':
# tied values each receive the average of the positions they would occupy.
#
# The two 90s would occupy positions 1 and 2,
# so each gets (1 + 2) / 2 = 1.5.
# 75 then takes rank 3 and 60 takes rank 4.

In [None]:
# 2. Aggregation and Grouping Operations
# Split the rows by department, average the salary inside each group, then
# turn the group labels back into an ordinary column with reset_index().
print("\n### Grouping by Department and Aggregating Monthly Salary")
salary_by_department = df.groupby('Department')['Monthly_Salary']
department_salary = salary_by_department.mean().reset_index()
print(department_salary)

# A dict passed to agg() maps each column to the statistic(s) to compute for it;
# a list of statistics produces a two-level column header in the result.
print("\n### Aggregating Multiple Statistics")
stats_per_column = {
    'Monthly_Salary': ['mean', 'max', 'min'],
    'Age': 'median',
}
department_stats = df.groupby('Department').agg(stats_per_column)
print(department_stats)

In [None]:
# 3. Creating New Columns and Modifying Existing Ones
# Element-wise sum of two existing columns, stored under a new label.
print("\n### Creating a New Column: Salary after Bonus")
df['Salary_After_Bonus'] = df['Monthly_Salary'].add(df['Bonus'])
print(df.head())

# CAUTION: this overwrites the column with itself divided by 100, so running
# the cell a second time divides the values by 100 again.
print("\n### Modifying an Existing Column: Standardizing Attendance")
df['Attendance (%)'] = df['Attendance (%)'].div(100)  # Convert percentage to decimal
print(df.head())

In [None]:
# 4. Applying Functions with apply() and map()
print("\n### Categorizing Employees Based on Salary")
def salary_category(salary, high_threshold=70000, medium_threshold=40000):
    """Classify a salary as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    salary : number
        The salary to classify.
    high_threshold : number, default 70000
        Salaries strictly greater than this are 'High'.
    medium_threshold : number, default 40000
        Salaries strictly greater than this (and not 'High') are 'Medium'.

    Returns
    -------
    str
        'High', 'Medium' or 'Low'.

    Notes
    -----
    Both comparisons are strict, so a salary exactly equal to a threshold
    falls into the lower category. A NaN salary fails both comparisons and
    is therefore reported as 'Low'.
    """
    if salary > high_threshold:
        return 'High'
    if salary > medium_threshold:
        return 'Medium'
    return 'Low'

# Run salary_category once per salary value and store the resulting labels.
monthly_salary = df['Monthly_Salary']
df['Salary_Category'] = monthly_salary.map(salary_category)
# The last expression of the cell is rendered as its output.
df.loc[:, ['Employee_Name', 'Monthly_Salary', 'Salary_Category']].head()

In [None]:
print("\n### Mapping Department Names to Codes")
department_map = {'IT': 1, 'HR': 2, 'Finance': 3, 'Sales': 4, 'Marketing': 5}
# An earlier cell upper-cases df['Department'], so a direct .map(department_map)
# would never match the mixed-case keys ('Finance', 'Sales', 'Marketing') and
# would silently produce NaN for those departments. Compare case-insensitively
# by upper-casing both the lookup keys and the column values.
department_codes_by_upper_name = {
    name.upper(): code for name, code in department_map.items()
}
# Departments that are not listed in department_map still map to NaN.
df['Department_Code'] = df['Department'].str.upper().map(department_codes_by_upper_name)
df[['Department', 'Department_Code']].head()

### Combining DataFrames

In [None]:
# Load dataset
df_experience = pd.read_csv('data_analysis_employee_experience_dataset.csv')

# 1. Appending DataFrames
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so
# calling it raises AttributeError on current releases. pd.concat is the
# supported replacement and stacks the rows of both frames in the same way.
print("\n### Appending DataFrames")
# ignore_index=True drops old indexes and avoids duplicate index values
df_appended = pd.concat([df, df_experience], ignore_index=True)

df_appended.tail()

In [None]:
# 2. Concatenating DataFrames
# axis=0 (the default) stacks the frames vertically; ignore_index=True
# renumbers the combined rows 0..n-1.
print("\n### Concatenating DataFrames")
frames_to_stack = [df, df_experience]
df_concat = pd.concat(frames_to_stack, axis=0, ignore_index=True)

# Variation: pd.concat([df, df_experience], join='inner') keeps only the
# columns that both DataFrames have in common.

df_concat.tail()

In [None]:
# 3. Merging DataFrames on 'Employee ID'
# how='inner' keeps only the Employee IDs that appear in both frames.
print("\n### Merging DataFrames on 'Employee ID'")
df_merged = df.merge(df_experience, on='Employee ID', how='inner')
df_merged.head()

In [None]:
# 4. Joining DataFrames with Different Keys
print("\n### Joining DataFrames using 'Employee ID' as index")
# set_index(..., inplace=True) moves 'Employee ID' out of the columns, so an
# unguarded second execution of this cell raises KeyError. Only set the index
# when it is not already in place, which makes the cell safe to re-run.
if df.index.name != 'Employee ID':
    df.set_index('Employee ID', inplace=True)
if df_experience.index.name != 'Employee ID':
    df_experience.set_index('Employee ID', inplace=True)
# join() aligns on the index by default, so no join key is passed explicitly.
# how='left' keeps every row of df.
# NOTE(review): join() raises if both frames share a column name and no
# lsuffix/rsuffix is given - confirm the two datasets have no common columns
# besides 'Employee ID'.
df_joined = df.join(df_experience, how='left')
df_joined.head()

#### More Examples

In [None]:
# Create sample DataFrames
# Built row by row here: every dict is one record and its keys name the columns.
df1 = pd.DataFrame([
    {'Employee ID': 101, 'Name': 'Lynn', 'Department': 'HR'},
    {'Employee ID': 102, 'Name': 'Shane', 'Department': 'Finance'},
    {'Employee ID': 103, 'Name': 'Michael', 'Department': 'IT'},
])

df2 = pd.DataFrame([
    {'Employee ID': 104, 'Name': 'Talor', 'Department': 'IT'},
    {'Employee ID': 105, 'Name': 'Carol', 'Department': 'Finance'},
])

print("DataFrame 1:")
display(df1)
print("DataFrame 2:")
display(df2)

In [None]:
### Appending DataFrames
# Append df2 to df1.
# DataFrame.append was removed in pandas 2.0 (deprecated since 1.4);
# pd.concat is the supported way to stack the rows of one frame under another.
# ignore_index=True renumbers the combined rows 0..n-1.
df_appended = pd.concat([df1, df2], ignore_index=True)
print("\nAppended DataFrame:")
display(df_appended)


In [None]:
### Concatenating DataFrames
# Concatenating along rows: axis=0 is the default and is written out here
# to make the stacking direction explicit.
sample_frames = [df1, df2]
df_concat = pd.concat(sample_frames, axis=0, ignore_index=True)

print("\nConcatenated DataFrame:")
display(df_concat)

In [None]:
### Merging DataFrames
# Create another DataFrame with Salary information:
# one (Employee ID, Salary) pair per row.
df_salary = pd.DataFrame(
    [
        (101, 50000),
        (102, 60000),
        (103, 70000),
        (104, 80000),
        (105, 55000),
    ],
    columns=['Employee ID', 'Salary'],
)

print("\nSalary DataFrame:")
display(df_salary)

In [None]:
# Merge on 'Employee ID'
# Inner merge: a row is kept only when its Employee ID exists in both frames.
df_merged = df_concat.merge(df_salary, on='Employee ID', how='inner')

print("\nMerged DataFrame:")
display(df_merged)

In [None]:
### Joining DataFrames
# Create another DataFrame with Bonus information.
# Employee 106 has a bonus but is absent from df_merged, while 104 and 105
# have no bonus row - useful for contrasting the join types.
df_bonus = pd.DataFrame(
    [
        (101, 5000),
        (102, 6000),
        (103, 7000),
        (106, 4000),
    ],
    columns=['Employee ID', 'Bonus'],
)

print("\nBonus DataFrame:")
display(df_bonus)

# Left Join - Keep all records from df_merged; employees without a matching
# bonus row get NaN in 'Bonus', and the unmatched bonus row (106) is dropped.
df_joined = pd.merge(df_merged, df_bonus, on='Employee ID', how='left')
print("\nDataFrame after Left Join with Bonus:")
display(df_joined)

In [None]:
# Outer Join - Keep all records from both DataFrames.
# Rows that exist on only one side are retained, with NaN filling the
# columns that come from the other side.
df_outer = pd.merge(df_merged, df_bonus, on='Employee ID', how='outer')

print("\nDataFrame after Outer Join:")
display(df_outer)

### Data Analysis Operations

In [None]:
# 1. Working with Numerical Data
# Each reduction collapses one numeric column to a single number.
print("\n### Numerical Data Analysis")

mean_salary = df['Monthly_Salary'].mean()
print("Mean Salary:", mean_salary)

total_bonus = df['Bonus'].sum()
print("Total Bonus Paid:", total_bonus)

age_std = df['Age'].std()
print("Standard Deviation of Age:", age_std)

In [None]:
# 2. Working with Text Data
# The .str accessor applies string operations to every element of a column.
print("\n### Working with Text Data")
employee_names = df['Employee_Name']
df['Employee_Name_Upper'] = employee_names.str.upper()
print(df[['Employee_Name', 'Employee_Name_Upper']].head())

# Split each address on '@' and keep the piece at position 1, i.e. the text
# between the first and second '@' (the domain for a normal address).
print("\nExtracting Email Domains")
email_parts = df['Email'].str.split('@')
df['Email_Domain'] = email_parts.str.get(1)
print(df[['Email', 'Email_Domain']].head())

In [None]:
# 3. Handling Datetime Data
# Parse the column into real datetimes first; the .dt accessor then exposes
# the individual date components.
print("\n### Handling Datetime Data")
joining_dates = pd.to_datetime(df['Joining Date'])
df['Joining Date'] = joining_dates
df['Year Joined'] = joining_dates.dt.year
df['Month Joined'] = joining_dates.dt.month

date_columns = ['Joining Date', 'Year Joined', 'Month Joined']
print(df[date_columns].head())

In [None]:
# 4. Pivot Tables and Cross-Tabulations
# One row per department, holding the mean monthly salary of that department.
print("\n### Pivot Tables")
pivot_salary = pd.pivot_table(
    df,
    values='Monthly_Salary',
    index='Department',
    aggfunc='mean',
)
print(pivot_salary)

# crosstab counts how many rows fall into each (department, category) pair.
print("\n### Cross Tabulation of Department and Salary Category")
crosstab_salary = pd.crosstab(index=df['Department'], columns=df['Salary_Category'])
print(crosstab_salary)

### Exploratory Data Analysis (EDA)

In [None]:
# 1. Understanding Descriptive Statistics
# describe() summarises the numeric columns (count, mean, std, min, quartiles, max).
# By this point `df` also carries the columns added in earlier cells, so the
# table is wider than the summary shown right after loading the data.
print("\n### Descriptive Statistics")
print(df.describe())

In [None]:
# 2. Detecting Outliers and Missing Values
# Count the missing cells per column again, now that the frame has been
# cleaned and extended by the earlier cells.
print("\n### Checking for Missing Values")
missing_counts = df.isna().sum()
print(missing_counts)

In [None]:
# IQR rule: a value is an outlier when it lies more than 1.5 * IQR below the
# first quartile or more than 1.5 * IQR above the third quartile.
print("\n### Identifying Outliers using IQR Method")
monthly_salary = df['Monthly_Salary']
Q1 = monthly_salary.quantile(0.25)
Q3 = monthly_salary.quantile(0.75)
IQR = Q3 - Q1

is_below_lower_fence = monthly_salary < (Q1 - 1.5 * IQR)
is_above_upper_fence = monthly_salary > (Q3 + 1.5 * IQR)
outliers = df[is_below_lower_fence | is_above_upper_fence]
print(outliers)