Handling Missing Values

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Build a small frame with one gap per column: np.nan marks the missing
# Name (row 2) and the missing Age (row 3).
data = {
    "Name": ["John", "Mary", np.nan, "David"],
    "Age": [25, 31, 22, np.nan],
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age
0,John,25.0
1,Mary,31.0
2,,22.0
3,David,


In [None]:
# Drop every row that contains at least one missing value.
# The result is a new frame; nothing is assigned back, so df keeps all rows.
df.dropna(axis=0, how="any")

Unnamed: 0,Name,Age
0,John,25.0
1,Mary,31.0


In [None]:
# Fill missing values with a summary statistic (the mean here; median or
# mode can be substituted the same way). Only Age is listed in the mapping,
# so the missing Name is left as NaN.
df.fillna({"Age": df["Age"].mean()})

Unnamed: 0,Name,Age
0,John,25.0
1,Mary,31.0
2,,22.0
3,David,26.0


In [None]:
# Replace every NaN with a fixed placeholder. The same string is applied to
# all columns, so the missing Age also becomes "Unknown" next to the numbers.
df.fillna(value="Unknown")

Unnamed: 0,Name,Age
0,John,25.0
1,Mary,31.0
2,Unknown,22.0
3,David,Unknown


Data Normalization

In [2]:
import pandas as pd

from sklearn.preprocessing import MinMaxScaler

# Sample frame with inconsistently cased names and raw ages
data = {
    'Name': ['JOHN', 'mary', 'DAVID', 'jane'],
    'Age': [25, 31, 22, 35],
}
df = pd.DataFrame(data)

# Text normalisation: bring every name to lowercase
df['Name'] = df['Name'].str.lower()

# Numeric normalisation: min-max scale Age so the smallest value maps to 0
# and the largest to 1. The double brackets pass a one-column frame (2-D),
# which is the input shape the scaler works on.
scaler = MinMaxScaler()
df['Age'] = scaler.fit_transform(df[['Age']])


In [3]:
# Display the normalised frame: lower-cased names and min-max scaled Age.
df

Unnamed: 0,Name,Age
0,john,0.230769
1,mary,0.692308
2,david,0.0
3,jane,1.0


Data Transformation

In [4]:
import pandas as pd

# Sample sales records; Date starts out as plain 'YYYY-MM-DD' strings
data = {
    'Date': ['2022-01-01', '2022-01-02', '2022-01-03'],
    'Sales': [100, 200, 300],
}
df = pd.DataFrame(data)

# Parse the strings into datetimes so the .dt accessor can be used
df['Date'] = pd.to_datetime(df['Date'])

# Derive the calendar month and year from each parsed date
df = df.assign(
    Month=df['Date'].dt.month,
    Year=df['Date'].dt.year,
)


In [5]:
# Display the frame with the derived Month and Year columns.
df

Unnamed: 0,Date,Sales,Month,Year
0,2022-01-01,100,1,2022
1,2022-01-02,200,1,2022
2,2022-01-03,300,1,2022


Handling Outliers

In [8]:
import pandas as pd
import numpy as np

# Sample column: small values plus one value (100) far away from the rest
data = {
    'Values': [1, 2, 3, 100, 5, 6],
}
df = pd.DataFrame(data)
print(df)

   Values
0       1
1       2
2       3
3     100
4       5
5       6


In [10]:
from scipy import stats

# Standardise the column: a z-score expresses each value's distance from the
# column mean in units of the column's standard deviation.
z_scores = stats.zscore(df['Values'])

# Detect outliers: a row is kept only while |z| < 2, so anything at or beyond
# two standard deviations is treated as an outlier. The mask is built once
# and reused, instead of evaluating a filtered frame and discarding it.
is_inlier = np.abs(z_scores) < 2

# Remove outliers by keeping only the in-range rows
df = df[is_inlier]
print(df)

   Values
0       1
1       2
2       3
4       5
5       6


Merging DataFrames

In [15]:
import pandas as pd

# Two sample frames that share the same ID key column
df1 = pd.DataFrame({'ID': [1, 2, 3], 'Name': ['John', 'Mary', 'David']})
df2 = pd.DataFrame({'ID': [1, 2, 3], 'Age': [25, 31, 22]})

print("\nDF1\n", df1)
print("\nDF2\n", df2)

# Join the two frames on ID; with no `how` given this is an inner join,
# and every ID here appears in both frames, so all three rows survive.
df_merged = df1.merge(df2, on='ID')

print("\nDF Mergged\n", df_merged)


DF1
    ID   Name
0   1   John
1   2   Mary
2   3  David

DF2
    ID  Age
0   1   25
1   2   31
2   3   22

DF Mergged
    ID   Name  Age
0   1   John   25
1   2   Mary   31
2   3  David   22


Pivoting Data

In [17]:
import pandas as pd

# Long-format sales records: one row per (Name, Year) pair
data = {
    'Name': ['John', 'John', 'Mary', 'Mary'],
    'Year': [2020, 2021, 2020, 2021],
    'Sales': [100, 200, 50, 75],
}
df = pd.DataFrame(data)
print("\nOG DF\n", df)

# Reshape to wide format: one row per Name, one column per Year.
# Cells are aggregated with the mean (pivot_table's default), which is why
# the integer sales come back as floats.
df_pivot = pd.pivot_table(
    df,
    values='Sales',
    index='Name',
    columns='Year',
    aggfunc='mean',
)
print("\ndf_pivot\n", df_pivot)


OG DF
    Name  Year  Sales
0  John  2020    100
1  John  2021    200
2  Mary  2020     50
3  Mary  2021     75

df_pivot
 Year   2020   2021
Name              
John  100.0  200.0
Mary   50.0   75.0


Grouping Data

In [25]:
import pandas as pd

# Sales records with two rows per person
data = {
    'Name': ['John', 'John', 'Mary', 'Mary'],
    'Year': [2020, 2021, 2020, 2021],
    'Sales': [100, 200, 50, 75],
}
df = pd.DataFrame(data)
print("\nOG DF\n", df)

# Total the Sales of each person; the result is a Series indexed by Name
df_grouped = df.groupby('Name')['Sales'].agg('sum')
print("\ndf_grouped\n", df_grouped)


OG DF
    Name  Year  Sales
0  John  2020    100
1  John  2021    200
2  Mary  2020     50
3  Mary  2021     75

df_grouped
 Name
John    300
Mary    125
Name: Sales, dtype: int64
