# NumPy

In [None]:
%pip install numpy

In [None]:
# Importing NumPy

import numpy as np 

#### Creating NumPy Arrays

In [None]:
# Build a one-dimensional ndarray from an ordinary Python list

arr = np.array([1, 2, 3, 4, 5])

for label, value in (("NumPy Array:", arr), ("Type:", type(arr))):
    print(label, value)


Unlike Python lists, NumPy arrays are optimized for numerical operations.

In [None]:
# Inspect the metadata attributes of `arr`

for label, value in (
    ("Shape:", arr.shape),
    ("Size:", arr.size),
    ("Data Type:", arr.dtype),
    ("Dimensions:", arr.ndim),
    ("Item Size (Bytes):", arr.itemsize),  # bytes occupied by one element
):
    print(label, value)


In [None]:
# Build a two-dimensional array from a list of equal-length rows

nested_rows = [[1, 2, 3], [4, 5, 6]]
arr_2d = np.array(nested_rows)

print("2D Array:\n", arr_2d)
print("Shape:", arr_2d.shape)  # (number of rows, number of columns)


In [None]:
# Turn a flat 12-element array into a 3x4 matrix

arr = np.arange(1, 13)  # integers 1..12 (the stop value 13 is excluded)
reshaped_arr = np.reshape(arr, (3, 4))  # 3 rows x 4 columns, same 12 elements

print("Original Array:\n ", arr)
print("Reshaped 3x4 Array:\n", reshaped_arr)

In [None]:
# Let NumPy work out one dimension by passing -1

arr = np.arange(1, 13)

# Only the column count (4) is fixed; the row count is derived from the size
auto_reshape = np.reshape(arr, (-1, 4))
print("Auto Reshaped (-1,4):\n", auto_reshape)

`-1` tells NumPy to infer that dimension from the total number of elements.
Here the array has 12 elements and 4 columns were requested, so NumPy sets the number of rows to 3, giving shape (3, 4).

#### Checking Data Types in Arrays

In [None]:
# Inspect the dtype NumPy inferred for each array

arr_int = np.array([1, 2, 3, 4])        # integer inputs
arr_float = np.array([1.5, 2.5, 3.5])   # floating-point inputs

for label, inferred in (
    ("Integer Array Type:", arr_int.dtype),
    ("Float Array Type:", arr_float.dtype),
):
    print(label, inferred)

# The dtype is detected from the input values, but it can also be controlled manually.

In [None]:
# Convert an array to another dtype with astype()

arr = np.array([1.5, 2.7, 3.9])

# astype(new_type) returns a new array; `arr` itself keeps its float values
arr_int = arr.astype(int)

for label, values in (("Original Array:", arr), ("Converted to Integer:", arr_int)):
    print(label, values)

# Statistics and Probability
### Descriptive Statistics
Measures of Central Tendency

In [None]:
# Mean (Average): the sum of the values divided by how many there are
data = [80, 85, 90, 95, 100]
mean_value = np.asarray(data).mean()
print("Mean:", mean_value)

In [None]:
# Mean of a second data set (109 is much lower than the other values)
data = [780, 815, 922, 109, 712]

# Recompute the mean for the new data; without this line the cell would print
# the value left over from the previous cell.
mean_value = np.mean(data)
print("Mean:", mean_value)

In [None]:
# Median (Middle Value): the central element once the data is sorted
data = [10, 15, 20, 25, 30]
median_value = np.median(np.asarray(data))
print("Median:", median_value)


In [None]:
# Median of the second data set
data = [780, 815, 922, 109, 712]

# Recompute the median for the new data; without this line the cell would print
# the value left over from the previous cell.
median_value = np.median(data)
print("Median:", median_value)

In [None]:
# Mode (Most Frequent Value)
from scipy import stats

data = [1, 2, 2, 3, 4, 4, 4, 5]
mode_value = stats.mode(data)

# SciPy >= 1.11 returns a scalar `.mode` for 1-D input, while older releases
# return a length-1 array. np.atleast_1d makes the [0] lookup valid for both,
# instead of raising IndexError on the scalar.
most_frequent = np.atleast_1d(mode_value.mode)[0]
print("Mode:", most_frequent)

In [None]:
# Variance: the average squared distance of the values from their mean
data = [10, 20, 30]
variance_value = np.asarray(data).var(ddof=0)  # ddof=0 divides by N (population variance)
print("Variance:", variance_value)

In [None]:
# Variance again, this time with one extreme value (2000) in the data
data = [10, 2000, 30]
variance_value = np.var(data, ddof=0)  # Population variance
print("Variance:", variance_value)



In [None]:
# Standard Deviation
# Define the data inside this cell so the result does not depend on which
# cell happened to assign `data` last.
data = [10, 20, 30]
std_dev = np.std(data, ddof=0)  # Population standard deviation
print("Standard Deviation:", std_dev)

In [None]:
# Standard deviation again, this time with one extreme value (2000) in the data
data = [10, 2000, 30]
std_dev = np.std(data, ddof=0)  # Population standard deviation
print("Standard Deviation:", std_dev)




In [None]:
# Quartiles: Q1 is the 25th percentile, Q3 the 75th
data = [1, 2, 3, 4, 5, 6, 7, 8, 9]
q1, q3 = np.percentile(data, [25, 75])  # both percentiles in a single call
print("Q1:", q1, "Q3:", q3)

In [None]:
# Flag outliers with the 1.5 x IQR rule, using q1 / q3 / data from the quartile cell
iqr = q3 - q1  # interquartile range
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Keep every value that falls outside [lower_bound, upper_bound]
outliers = []
for x in data:
    if x < lower_bound or x > upper_bound:
        outliers.append(x)
print("Outliers:", outliers)

In [None]:
# Quartiles and IQR outliers for a data set with one extreme value (9268)
data = [122, 322, 3, 402, 125, 261, 317, 9268, 190]

# Recompute the quartiles for the new data; without this the cell would print
# the q1 / q3 values left over from the earlier quartile cell.
q1 = np.percentile(data, 25)
q3 = np.percentile(data, 75)
print("Q1:", q1, "Q3:", q3)

# Same 1.5 x IQR rule as before, applied to the new quartiles
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = [x for x in data if x < lower_bound or x > upper_bound]
print("Outliers:", outliers)

#### Special Arrays

In [None]:
# Arrays pre-filled with a constant: all zeros / all ones

zeros = np.zeros(shape=(3, 3))  # 3x3 array of 0s
ones = np.ones(shape=(2, 2))    # 2x2 array of 1s

for label, filled in (("Zeros Array:\n", zeros), ("Ones Array:\n", ones)):
    print(label, filled)

In [None]:
# Identity matrix: ones on the main diagonal, zeros elsewhere

identity_matrix = np.identity(4)  # 4x4
print("Identity Matrix:\n", identity_matrix)


In [None]:
# 3x3 array of uniform random floats in the half-open interval [0, 1)

random_array = np.random.random_sample((3, 3))
print("Random Array:\n", random_array)


In [None]:
# Convert to other ranges: low + rand * (high - low)
# (the 5-15 and -1 to 1 examples previously reused the "0-10" label)

# Example: 0–10
scaled_0_10 = np.random.rand(3, 3) * 10
print("Random Array 0-10: \n", scaled_0_10)

# Example: 5–15
scaled_5_15 = np.random.rand(3, 3) * 10 + 5
print("Random Array 5-15: \n", scaled_5_15)

# Example: -1 to 1
scaled_m1_1 = np.random.rand(3, 3) * 2 - 1
print("Random Array -1 to 1: \n", scaled_m1_1)


In [None]:
# Two ways to generate a numeric sequence

# arange: fixed step size; starts at 1, stops before 10, step 2
arr_range = np.arange(1, 10, 2)
print("Arange:", arr_range)

# linspace: fixed number of points; 10 evenly spaced values from 0 to 5
arr_linspace = np.linspace(0, 5, num=10)
print("Linspace:", arr_linspace)

In [None]:
# Arrays filled with an arbitrary constant via np.full(shape, fill_value)

# 3x3 matrix where every entry is 7
constant_array = np.full(shape=(3, 3), fill_value=7)
print("3x3 Constant Matrix (Filled with 7):\n", constant_array)

# 2x4 matrix where every entry is -1
negative_matrix = np.full(shape=(2, 4), fill_value=-1)
print("2x4 Constant Matrix (Filled with -1):\n", negative_matrix)


In [None]:
# Three common random generators

# Uniform floats in [0, 1), shape 3x4
random_array = np.random.rand(3, 4)
print("Random Array (0 to 1):\n", random_array)

# Standard normal samples (mean 0, variance 1), shape 4x3
random_normal = np.random.randn(4, 3)
print("Random Normal Distribution:\n", random_normal)

# Integers from 10 up to 49 (the high value 50 is excluded), shape 3x3
random_ints = np.random.randint(low=10, high=50, size=(3, 3))
print("Random Integer Matrix:\n", random_ints)

In [None]:
# Square matrix with the given values on the main diagonal

diagonal_values = [1, 2, 3]
diag_matrix = np.diag(diagonal_values)
print("Diagonal Matrix:\n", diag_matrix)

#### Comparing Lists vs. NumPy Arrays

In [None]:
# Performance Comparison: set up identical data as a list and as an ndarray

import time

size = 1_000_000              # one million elements
py_list = [*range(size)]      # plain Python list 0..size-1
np_array = np.arange(size)    # the same values as a NumPy array

In [None]:
# Time doubling every element with a pure-Python list comprehension
start_time = time.time()
py_list = [2 * x for x in py_list]
end_time = time.time()

elapsed_list = end_time - start_time  # wall-clock seconds
print("Python List Time:", elapsed_list)

In [None]:
# Time doubling every element with one vectorized NumPy expression
start_time = time.time()
np_array = np.multiply(np_array, 2)
end_time = time.time()

elapsed_numpy = end_time - start_time  # wall-clock seconds
print("NumPy Array Time:", elapsed_numpy)

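NumPy arrays are significantly faster than Python lists here because the multiplication is vectorized: the loop over the elements runs inside NumPy's compiled code instead of the Python interpreter, so no explicit Python loop is needed.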

### Array Operations
#### Basic Arithmetic with Arrays

In [None]:
# Element-wise arithmetic between two equal-length arrays
import numpy as np

arr1 = np.array([1, 2, 3, 4])
arr2 = np.array([5, 6, 7, 8])

# Each operator is applied position by position and yields a new array
for label, outcome in (
    ("Addition:", arr1 + arr2),
    ("Subtraction:", arr1 - arr2),
    ("Multiplication:", arr1 * arr2),
    ("Division:", arr1 / arr2),
    ("Power:", arr1 ** 2),
):
    print(label, outcome)


In [None]:
# Combining an array with a single number: the scalar is applied to every element

arr = np.array([1, 2, 3, 4])

for label, outcome in (
    ("Array * 2:", arr * 2),
    ("Array + 5:", arr + 5),
    ("Array ** 3:", arr ** 3),
):
    print(label, outcome)


#### Universal Functions (ufuncs)

In [None]:
# Universal functions (ufuncs) operate on every element of their input
arr = np.array([1, 4, 9, 16])

for label, outcome in (
    ("Square Root:", np.sqrt(arr)),
    ("Exponential:", np.exp(arr)),
    ("Logarithm:", np.log(arr)),          # natural logarithm
    ("Absolute Value:", np.abs([-3, -7, -1])),  # plain lists are accepted too
):
    print(label, outcome)


#### Aggregations and Summary

In [None]:
# Summary statistics of a 1D array, each reduced to a single number
arr = np.array([10, 20, 30, 40, 50])

for label, statistic in (
    ("Sum:", arr.sum()),
    ("Mean:", arr.mean()),
    ("Median:", np.median(arr)),
    ("Minimum:", arr.min()),
    ("Maximum:", arr.max()),
    ("Standard Deviation:", arr.std()),
    ("Variance:", arr.var()),
):
    print(label, statistic)

In [None]:
# Reductions along one axis of a 2D array
arr_2d = np.array([[1, 2, 3], [4, 5, 6]])

# axis=0 collapses the rows -> one value per column
print("Column-wise Sum:", arr_2d.sum(axis=0))
# axis=1 collapses the columns -> one value per row
print("Row-wise Mean:", arr_2d.mean(axis=1))


### Indexing and Slicing
#### Indexing in NumPy arrays

In [None]:
# Picking single elements out of a 1D array

import numpy as np

arr = np.array([10, 20, 30, 40, 50])

# Indices start at 0; negative indices count from the end
first, second, last = arr[0], arr[1], arr[-1]
print("First element:", first)
print("Last element:", last)
print("Second element:", second)

In [None]:
# Picking single elements out of a 2D array with [row, column]

arr_2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

top_row_second_col = arr_2d[0, 1]   # zero-based: index 0 is the 1st row, index 1 the 2nd column
bottom_right = arr_2d[-1, -1]       # negative indices count from the end
print("Element at row 1, column 2:", top_row_second_col)
print("Element at last row, last column:", bottom_right)


#### Slicing NumPy Arrays

In [None]:
# array[start : stop : step]  -- `stop` itself is NOT included

arr = np.array([10, 20, 30, 40, 50, 60])

# arr[1:4] returns indices 1, 2 and 3; the label now says so instead of "1 to 4"
print("Elements from index 1 to 3:", arr[1:4])
print("First three elements:", arr[:3])
print("Last three elements:", arr[-3:])
print("Every second element:", arr[::2])

In [None]:
# Slicing rows and columns of a 2D array with [row_slice, column_slice]

arr_2d = np.array([
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90],
])

top_two_rows = arr_2d[:2]         # rows 0-1, all columns
left_two_cols = arr_2d[:, :2]     # all rows, columns 0-1
bottom_right = arr_2d[1:, 1:]     # rows 1-2 and columns 1-2

print("First two rows:\n", top_two_rows)
print("First two columns:\n", left_two_cols)
print("Bottom-right 2x2 submatrix:\n", bottom_right)

#### Fancy Indexing (Selecting Specific Rows/Columns)

In [None]:
# Fancy indexing: pass a list of positions to pick several elements at once

arr = np.array([10, 20, 30, 40, 50])

indices = [0, 2, 4]  # positions to pick
selected = arr[indices]
print("Selected elements:", selected)

In [None]:
# Fancy indexing on a 2D array: choose whole rows or whole columns by position

arr_2d = np.array([
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90],
])

rows = [0, 2]  # first and last row
cols = [1, 2]  # second and third column

picked_rows = arr_2d[rows]
picked_cols = arr_2d[:, cols]  # ':' keeps every row

print("Selected rows:\n", picked_rows)

print("Selected columns:\n", picked_cols)



In [None]:
# A list inside the brackets selects whole rows; two plain integers select one element
row_selection = arr_2d[[0, 2]]   # rows 0 and 2
single_element = arr_2d[0, 2]    # row 0, column 2
print(row_selection)
print("\n", single_element)


#### Boolean Masking

In [None]:
# Boolean masking: keep only the elements that satisfy a condition
arr = np.array([10, 25, 30, 45, 50, 65])

mask = arr > 30          # boolean array, True where the value exceeds 30
kept_values = arr[mask]  # indexing with the mask keeps the True positions
print("Filtered values:", kept_values)


In [None]:
# Boolean masking on a 2D array; the selection comes back as a flat 1D array
arr_2d = np.array([
    [10, 20, 30],
    [40, 50, 60],
    [70, 80, 90],
])

mask = arr_2d > 50  # True where the value exceeds 50
kept_values = arr_2d[mask]
print("Filtered values:", kept_values)


### Broadcasting and Vectorization

In [None]:
# Broadcasting: a scalar is stretched to match every element of the array

import numpy as np

arr = np.array([1, 2, 3, 4])
scalar = 2

result = np.multiply(arr, scalar)  # same as arr * scalar
print("Broadcasted Result:", result)


In [None]:
# Broadcasting a row vector across every row of a matrix

arr_2d = np.array([[1, 2, 3], [4, 5, 6]])  # shape (2, 3)
arr_1d = np.array([10, 20, 30])            # shape (3,)

# The trailing dimensions match (3 and 3), so arr_1d is added to each row
result = np.add(arr_2d, arr_1d)
print("Broadcasted Addition:\n", result)


In [None]:
# (3,)  ≠  (3, 1): the same three numbers laid out in three different shapes

flat = np.array([10, 20, 30])            # (3,)   one dimension
as_row = np.array([[10, 20, 30]])        # (1, 3) one row, three columns
as_column = np.array([[10], [20], [30]]) # (3, 1) three rows, one column

print(flat)
print("\n", as_row)
print("\n", as_column)


In [None]:
# Broadcasting a column vector across every column of a matrix

arr_2d = np.array([[1, 2, 3],
                   [4, 5, 6]])   # shape (2, 3)

arr_col = np.array([[10],
                    [20]])       # shape (2, 1)

# The size-1 column axis is stretched to 3, so 10 is added to row 0 and 20 to row 1
result = np.add(arr_2d, arr_col)
print("Broadcasted Addition:\n", result)

In [None]:
# Broadcasting Error

arr1 = np.array([[1, 2, 3], [4, 5, 6]])  # Shape (2,3)
arr2 = np.array([10, 20])  # Shape (2,)

# Shapes are compared starting from the last dimension: 3 vs 2 do not match and
# neither is 1, so NumPy raises ValueError. The expected error is caught and
# printed so that "Run All" is not interrupted by this demonstration.
try:
    result = arr1 + arr2
except ValueError as err:
    print("Broadcasting Error:", err)

#### Vectorized operations
Performing calculations on entire arrays at once, instead of using loops.

NumPy applies the operation element-by-element automatically.

In [None]:
# Squaring every element: explicit Python loop vs. one vectorized expression

# 1) Loop version (slower): one Python-level iteration per element
arr = np.array([1, 2, 3, 4])
result = []
for element in arr:
    result.append(element ** 2)
print("Squared Values:", result)

# 2) Vectorized version (faster): the whole array is squared in one expression
arr = np.array([1, 2, 3, 4])
result = arr ** 2
print("Vectorized Squaring:", result)


In [None]:
# Mathematical functions applied to a whole array at once

arr = np.array([1, 2, 3, 4])

for label, outcome in (
    ("Sine Values:", np.sin(arr)),
    ("Logarithm:", np.log(arr)),
    ("Exponential:", np.exp(arr)),
):
    print(label, outcome)


### Linear Algebra with NumPy
#### Matrix Operations

In [None]:
# Adding and subtracting two matrices of the same shape

import numpy as np

A = np.array([[1, 2],
              [3, 4]])
B = np.array([[5, 6],
              [7, 8]])

for label, matrix in (
    ("Matrix A:\n", A),
    ("Matrix B:\n", B),
    ("A + B:\n", A + B),   # element-wise sum
    ("A - B:\n", A - B),   # element-wise difference
):
    print(label, matrix)

In [None]:
# Two different "multiplications" of the matrices A and B

# Element-wise: each entry of A times the entry of B at the same position
element_wise = np.multiply(A, B)
print("Element-wise Multiplication:\n", element_wise)

# Matrix product (rows of A times columns of B); same result as A @ B
dot_product = np.matmul(A, B)
print("Dot Product (Matrix Multiplication):\n", dot_product)

#### Determinants and Inverses

In [None]:
# Determinant of a 2x2 matrix

from numpy.linalg import det

A = np.array([[4, 2],
              [3, 1]])

# Computed in floating point, so the result can differ from the exact value
# 4*1 - 2*3 by a tiny rounding error
det_A = det(A)
print("Determinant of A:", det_A)

### Advanced NumPy
#### Sorting and Searching

In [None]:
# Sorting a 1D array (np.sort returns a sorted copy; `arr` is left untouched)

import numpy as np
arr = np.array([3, 1, 5, 2, 4])

# Ascending order
sorted_arr = np.sort(arr)
print("Sorted Array:", sorted_arr)

# Descending order: sort ascending, then reverse
sorted_desc = np.flip(np.sort(arr))
print("Sorted in Descending Order:", sorted_desc)

In [None]:
# Sorting a 2D array along one axis

arr_2d = np.array([[5, 2, 3], [8, 1, 4]])

sorted_within_rows = np.sort(arr_2d, axis=1)     # each row sorted left to right
sorted_within_columns = np.sort(arr_2d, axis=0)  # each column sorted top to bottom

print("Row-wise Sorting:\n", sorted_within_rows)
print("Column-wise Sorting:\n", sorted_within_columns)

In [None]:
# Locating elements: find the positions where a condition is True

arr = np.array([10, 20, 30, 40, 50])

# Position(s) of the value 30; the result is a tuple with one index array per dimension
index = np.nonzero(arr == 30)
print("Index of 30:", index)

# Positions of all values greater than 25
greater_than_25 = np.nonzero(arr > 25)
print("Indices where values > 25:", greater_than_25)


#### Stacking and Splitting Arrays

In [None]:
# Joining two 2x2 arrays vertically and horizontally
arr1 = np.array([[1, 2],
                 [3, 4]])

arr2 = np.array([[5, 6],
                 [7, 8]])

# Vertical: arr2's rows go underneath arr1's rows -> shape (4, 2)
vstacked = np.concatenate((arr1, arr2), axis=0)
print("Vertically Stacked:\n", vstacked)

# Horizontal: arr2's columns go to the right of arr1's columns -> shape (2, 4)
hstacked = np.concatenate((arr1, arr2), axis=1)
print("Horizontally Stacked:\n", hstacked)


In [None]:
# Cutting a 2x4 array into two equal pieces, first by columns and then by rows

arr = np.array([[1, 2, 3, 4],
                [5, 6, 7, 8]])

# Column-wise: the left two columns and the right two columns
split_arr = np.split(arr, 2, axis=1)
print("First Split:\n", split_arr[0])
print("Second Split:\n", split_arr[1])


# Row-wise (np.split's default axis is 0): the top row and the bottom row
split_arr2 = np.vsplit(arr, 2)
print("First Split:\n", split_arr2[0])
print("Second Split:\n", split_arr2[1])

### Population vs Sample

In [None]:
# Generating a Population and Sample in Python

import numpy as np
import random

# Seed both generators used below so the population and the sample are the
# same on every run (NumPy and the `random` module keep separate state).
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Create a population of 100,000 values (ages of people).
# randint's upper bound is exclusive, so the ages range from 18 to 79.
population = np.random.randint(18, 80, 100000)

# Take a random sample of 1000 from the population (without replacement).
# tolist() converts the array to plain Python ints in one step.
sample = random.sample(population.tolist(), 1000)

print("Population Size:", len(population))
print("Sample Size:", len(sample))

In [None]:
# Compare the mean of the whole population with the mean of the sample

population_mean = np.mean(population)  # uses every value
sample_mean = np.mean(sample)          # uses only the sampled values

for label, mean in (("Population Mean:", population_mean), ("Sample Mean:", sample_mean)):
    print(label, mean)

# Data Cleaning

### Data Types & Conversion
Understanding Data Types in Python

Python has various data types, including integers, floats, strings, and datetime objects.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import datetime

# Sample records. Age, Salary and Joining Date are deliberately stored as
# strings so that they can be converted to proper types afterwards.
data = {
    'Name': ['Jon', 'Lynn', 'Michael', 'Shane', 'Stephanie'],
    'Age': ['25', '30', '35', '40', '45'],
    'Salary': ['50000', '60000', '70000', '80000', '90000'],
    'Joining Date': ['2023-01-10', '2022-05-15', '2021-07-20', '2019-11-30', '2018-06-25'],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'Finance'],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)

print("Original DataFrame:")
display(df)


In [None]:
# Show the dtype pandas assigned to each column before any conversion

column_types = df.dtypes
print("\nData types before conversion:")
display(column_types)

In [None]:
### Converting Data Types

# Parse the 'Age' and 'Salary' columns into numbers
for numeric_col in ('Age', 'Salary'):
    df[numeric_col] = pd.to_numeric(df[numeric_col])

# Parse the 'Joining Date' column into datetime values
df['Joining Date'] = pd.to_datetime(df['Joining Date'])

# Show the dtypes again to confirm the conversion
column_types = df.dtypes
print("\nData types after conversion:")
display(column_types)

#### .loc
* label-based indexing
* It selects data by row label & column label


#### label vs position
* .loc → label-based (“Find the row named 2”)
 
* .iloc → position-based (“Find the row at position 2”)

In [None]:
### Mapping & Converting Categorical Data

# Store 'Department' with the categorical dtype
df['Department'] = df['Department'].astype('category')

# Look up a numeric code for every department name
dept_mapping = dict(HR=1, Finance=2, IT=3)
department_codes = df['Department'].map(dept_mapping)
df['Department_Code'] = department_codes

print("\nFinal DataFrame with Department mapped:")
display(df)

### Handling Missing & Incorrect Data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Sample records with gaps (np.nan) in several columns and one unparseable date
data = {
    'Name': ['Lynn', 'Shane', 'Michael', 'Jon', np.nan],
    'Age': [25, np.nan, 35, 40, 28],
    'Salary': [50000, 60000, np.nan, 80000, 90000],
    'Joining Date': ['2023-01-10', '2022-05-15', 'Invalid Date', '2019-11-30', '2018-06-25'],
    'Department': ['HR', 'Finance', 'IT', np.nan, 'Finance'],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)

print("Original DataFrame:")
display(df)

In [None]:
### Detecting Missing Data
# isna() marks each missing cell as True; summing the marks counts them per column
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
display(missing_per_column)

In [None]:
### Handling Missing Data

# Option 1: drop every row that has at least one missing value.
# dropna returns a new frame; `df` itself is not modified.
df_dropped = df.dropna(axis=0, how='any')
print("\nDataFrame after dropping missing values:")
display(df_dropped)

In [None]:
# Drop a row only when 'Age' or 'Salary' is missing; gaps in other columns are kept

required_columns = ['Age', 'Salary']
df_dropped_age = df.dropna(subset=required_columns)
display(df_dropped_age)

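`inplace=True`

Applies the change directly to the original object instead of returning a new one. Be careful when calling it on a column selection (for example `df['col'].fillna(..., inplace=True)`): in recent pandas versions this may only modify a temporary copy, whereas assigning the result back (`df['col'] = df['col'].fillna(...)`) always works.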

In [None]:
# Option 2: Filling missing values
# Each result is assigned back to its column. Calling fillna(..., inplace=True)
# on df['col'] is chained assignment: pandas 2.x emits a FutureWarning for it,
# and with Copy-on-Write enabled the original DataFrame is left unchanged.

# Fill 'Salary' with a default value
df['Salary'] = df['Salary'].fillna(0)

# Fill 'Age' with mean value (mean() skips the missing entries)
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Fill 'Department' with the most frequent value
df['Department'] = df['Department'].fillna(df['Department'].mode()[0])

print("\nDataFrame after filling missing values:")
display(df)

In [None]:
### Handling Incorrect Data

# Parse 'Joining Date'; errors='coerce' turns unparseable entries into NaT
# instead of raising
df['Joining Date'] = pd.to_datetime(df['Joining Date'], errors='coerce')

# Treat ages outside the range 20-60 as incorrect and overwrite them with the median age
age = df['Age']
outlier_condition = age.lt(20) | age.gt(60)
median_age = df['Age'].median()
df.loc[outlier_condition, 'Age'] = median_age

print("\nDataFrame after handling incorrect values:")
display(df)

### Dealing with Text Data

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re

# Sample records with messy text: mixed upper/lower case, stray spaces inside
# e-mail addresses, and punctuation in the feedback
data = {
    'Name': ['Alice Smith', 'BOB JOHNSON', 'Charlie Brown', 'DAVID WILLIAMS', 'Eve Clark'],
    'Email': ['alice @email.com', 'BOB@email.COM', 'charlie@email.com', 'david@email.com', 'eve@ email .com'],
    'Department': ['HR', 'finance', 'IT', 'it', 'Finance'],
    'Feedback': ['Great service!!!', 'average experience...', 'not good :(', 'EXCELLENT!!', 'bad support'],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)
print("Original DataFrame:")
display(df)

In [None]:
### Standardizing Text Data

def _lowercase_column(col):
    """Return the column cast to str with every character lower-cased."""
    return col.astype(str).str.lower()

# Apply the helper to every column of the frame
df = df.apply(_lowercase_column)
print("\nDataFrame after converting text to lowercase:")
display(df)

In [None]:
# Remove every whitespace character from the e-mail addresses

# \s matches any whitespace character (space, tab, newline); + means one or more
whitespace_run = r'\s+'
df['Email'] = df['Email'].str.replace(whitespace_run, '', regex=True)
print("\nDataFrame after cleaning email column:")
display(df[['Email']])

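`Series.replace()` swaps whole cell values by default (it only matches patterns when `regex=True` is passed), whereas `Series.str.replace()` works on the text inside each cell
* .str tells pandas to apply this string function to each element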

In [None]:
# Give every department name one canonical spelling

# Keys are the values to look for (whole-cell match); values are what they become
canonical_names = {'finance': 'Finance', 'it': 'IT', 'hr': 'HR'}
df['Department'] = df['Department'].replace(canonical_names)
print("\nDataFrame after standardizing department names:")
display(df[['Department']])

.replace() here performs value substitution and replaces entire cell values

In [None]:
### Handling Inconsistent and Noisy Text

# Strip special characters from the feedback text.
# [^a-zA-Z0-9 ] matches any character that is NOT a letter, a digit or a space.
not_alphanumeric = r'[^a-zA-Z0-9 ]'
df['Feedback'] = df['Feedback'].str.replace(not_alphanumeric, '', regex=True)

print("\nDataFrame after removing special characters from feedback:")
display(df[['Feedback']])

In [None]:
### Extracting Information from Text

# Capture what follows the '@': one or more (+) letters, digits, dots or hyphens.
# The parentheses mark the part that extract() returns.
domain_pattern = r'@([a-zA-Z0-9.-]+)'
df['Email Domain'] = df['Email'].str.extract(domain_pattern)
print("\nDataFrame after extracting email domains:")
display(df[['Email', 'Email Domain']])

In [None]:
### Finding and Replacing Specific Words

# Rewrite 'bad' and 'not good' as 'negative' wherever they occur inside the text.
# regex=True makes replace() match inside each string; without it only cells
# whose entire value equals a key would be replaced.
negative_terms = {'bad': 'negative', 'not good': 'negative'}
df['Feedback'] = df['Feedback'].replace(negative_terms, regex=True)

print("\nDataFrame after replacing words in feedback:")
display(df[['Feedback']])

### Detecting & Handling Outliers


In [None]:
# %pip install matplotlib seaborn

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sample records in which the last employee has a much higher salary (150000)
# and age (60) than everyone else
data = {
    'Employee': ['Lynn', 'Shane', 'Michael', 'Mei', 'Adeline', 'Stephanie', 'Carol', 'Vivian', 'Harvey', 'Ken'],
    'Salary': [50000, 52000, 51000, 53000, 49000, 60000, 62000, 65000, 70000, 150000],
    'Age': [25, 26, 24, 27, 25, 29, 30, 31, 32, 60],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)
print("Original DataFrame:")
display(df)

In [None]:
### Detecting Outliers
# Summary statistics: compare max/min with the quartiles to spot extreme values
summary = df.describe()
print("\nSummary statistics:")
display(summary)

In [None]:
# Visualizing Outliers with one box per numeric column
fig, ax = plt.subplots(figsize=(12, 5))
sns.boxplot(data=df[['Salary', 'Age']], ax=ax)
ax.set_title("Boxplot of Salary and Age")
plt.show()

In [None]:
### Detecting Outliers Using IQR (Interquartile Range)
salary_age = df[['Salary', 'Age']]

# Quartiles and IQR are computed per column
Q1 = salary_age.quantile(0.25)
Q3 = salary_age.quantile(0.75)
IQR = Q3 - Q1

# True wherever a value lies more than 1.5 x IQR outside the quartiles
below_fence = salary_age < (Q1 - 1.5 * IQR)
above_fence = salary_age > (Q3 + 1.5 * IQR)
outlier_condition = below_fence | above_fence

# Show the rows that have an outlier in at least one of the two columns
print("\nDetected Outliers:")
display(df[outlier_condition.any(axis=1)])

In [None]:
### Handling Outliers

# Option 1: Removing Outliers
# Keep only the rows in which no checked column is flagged
rows_with_outlier = outlier_condition.any(axis=1)
df_removed = df.loc[~rows_with_outlier]
print("\nDataFrame after removing outliers:")
display(df_removed)

In [None]:
# Option 2: Replacing Outliers with Median
numeric_cols = ['Salary', 'Age']
medians = df[numeric_cols].median()  # one median per column

df_replaced = df.copy()
# Replace only the flagged cells, each with the median of its own column.
# mask(..., axis=1) matches `medians` to the columns by name. Assigning the
# Series through .loc[row_mask, cols] would align it on the row index instead
# (its labels 'Salary'/'Age' match no row), writing NaN, and would overwrite
# both columns even when only one of them is an outlier.
df_replaced[numeric_cols] = df[numeric_cols].mask(outlier_condition, medians, axis=1)
print("\nDataFrame after replacing outliers with median:")
display(df_replaced)

In [None]:
# Option 3: Capping Outliers (Winsorization)
# Values beyond a bound are replaced by that bound
capped_cols = ['Salary', 'Age']
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_capped = df.copy()

# First raise everything below the lower bound ...
too_low = df_capped[capped_cols] < lower_bound
df_capped[capped_cols] = np.where(too_low, lower_bound, df_capped[capped_cols])
# ... then lower everything above the upper bound
too_high = df_capped[capped_cols] > upper_bound
df_capped[capped_cols] = np.where(too_high, upper_bound, df_capped[capped_cols])

print("\nDataFrame after capping outliers:")
display(df_capped)

### Data Deduplication

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Sample records in which employees 101 (Alice) and 102 (Bob) appear twice
data = {
    'Employee ID': [101, 102, 103, 104, 101, 105, 102, 106, 107, 108],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice', 'Eve', 'Bob', 'Grace', 'Hank', 'Ivy'],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'HR', 'Finance', 'Finance', 'IT', 'HR', 'Finance'],
    'Salary': [50000, 52000, 51000, 53000, 50000, 60000, 52000, 62000, 49000, 60000],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)
print("Original DataFrame:")
display(df)

In [None]:
### Detecting Duplicate Records

# duplicated() compares all columns and marks every repeat of an earlier row as True
duplicates = df.duplicated()
print("\nDuplicate Rows:")
display(df.loc[duplicates])

In [None]:
# Identifying rows that share both 'Employee ID' and 'Name' with another row.
# keep=False marks every member of a duplicate group, not only the repeats.
shares_id_and_name = df.duplicated(subset=['Employee ID', 'Name'], keep=False)
print("\nDuplicate Employee IDs:")
display(df[shares_id_and_name])

In [None]:
### Handling Duplicates

# Option 1: Removing exact duplicates
# Rows must match in every column; the first copy of each is kept
df_removed = df.drop_duplicates(keep='first')
print("\nDataFrame after removing exact duplicates:")
display(df_removed)

In [None]:
# Option 2: Keeping the first occurrence
# Rows count as duplicates when both 'Employee ID' and 'Name' match

identity_columns = ['Employee ID', 'Name']
df_first = df.drop_duplicates(subset=identity_columns, keep='first')
print("\nDataFrame after keeping first occurrence:")
display(df_first)

In [None]:
# Option 3: Keeping the last occurrence
# Rows count as duplicates when 'Employee ID' matches; earlier copies are dropped

id_column = ['Employee ID']
df_last = df.drop_duplicates(subset=id_column, keep='last')
print("\nDataFrame after keeping last occurrence:")
display(df_last)

### Creating New Columns

In [None]:
### Creating New Columns in Pandas
# Import necessary libraries
import pandas as pd
import numpy as np

# Sample records used as the starting point for the derived columns
data = {
    'Employee': ['Lynn', 'Shane', 'Michael', 'Harvey', 'Vivian'],
    'Salary': [50000, 52000, 51000, 53000, 49000],
    'Joining Date': ['2023-01-10', '2022-05-15', '2021-07-20', '2019-11-30', '2018-06-25'],
    'Department': ['HR', 'Finance', 'IT', 'IT', 'Finance'],
}

# Each dictionary key becomes a column
df = pd.DataFrame.from_dict(data)
print("Original DataFrame:")
display(df)

In [None]:
### Creating Numeric Columns

# New column 'Bonus': 10% of each employee's salary
bonus_rate = 0.10
df['Bonus'] = df['Salary'].mul(bonus_rate)
print("\nDataFrame after adding Bonus column:")
display(df)

In [None]:
# Helper that assigns a salary level to a single salary value

def categorize_salary(salary):
    """Bucket a salary into 'Low', 'Medium' or 'High'.

    Below 50,000 is 'Low'; from 50,000 up to (but not including) 55,000 is
    'Medium'; everything else is 'High'.
    """
    if salary < 50000:
        return 'Low'
    # Reaching this point means the 'Low' test failed, so only the upper limit is checked
    if salary < 55000:
        return 'Medium'
    return 'High'

# Run the helper on every salary and store the resulting label
df['Salary Category'] = df['Salary'].map(categorize_salary)
print("\nDataFrame after adding Salary Category:")
display(df)

In [None]:
### Creating DateTime Columns

# Parse 'Joining Date' into datetime values so the .dt accessor can be used
df['Joining Date'] = pd.to_datetime(df['Joining Date'])

# Pull the year, month and day out into separate columns
joined = df['Joining Date'].dt
df['Joining Year'] = joined.year
df['Joining Month'] = joined.month
df['Joining Day'] = joined.day

print("\nDataFrame after extracting DateTime features:")
display(df)

In [None]:
### Creating Text Columns

# Join the employee name and the department into one string, separated by ' - '
df['Employee Info'] = df['Employee'].str.cat(df['Department'], sep=' - ')
print("\nDataFrame after adding Employee Info column:")
display(df)


In [None]:
# Add a column holding each employee name in capital letters

upper_names = df['Employee'].str.upper()
df['Employee Uppercase'] = upper_names
print("\nDataFrame after adding Uppercase Employee column:")
display(df)
