In [None]:
# Importance of Data Cleaning

# 1. Missing Values: Missing data points in a dataset can lead to biased results.
#     Task 1: Load a dataset and identify which columns have missing values.
#     Task 2: Replace missing values in a dataset with the column mean or mode.
#     Task 3: Compare model performance with and without handling missing values.
    
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Task 1: Load a dataset and identify which columns have missing values
data = load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names)

# Introduce missing values artificially for demonstration.
# .loc slices by label and includes both endpoints, so each line blanks 6 rows.
df.loc[5:10, 'bmi'] = np.nan
df.loc[20:25, 'bp'] = np.nan

# Count NaNs per column and report only the columns that contain any
missing_info = df.isnull().sum()
print("Missing Values per Column:\n", missing_info[missing_info > 0])

# Task 2: Replace missing values in a dataset with the column mean.
# These are whole-dataset means: fine for demonstrating imputation, but they are
# NOT used for the model below, because they would include test-set rows.
df_cleaned = df.fillna(df.mean())

# Task 3: Compare model performance with and without handling missing values

# Original target (shares the default 0..n-1 index with df)
target = pd.Series(data.target)

# Strategy A: drop every row that has a missing value, keeping the target aligned by label
df_dropped = df.dropna()
target_dropped = target.loc[df_dropped.index]

# Train-test split for dropped data
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    df_dropped, target_dropped, test_size=0.2, random_state=42
)
model1 = LinearRegression().fit(X_train1, y_train1)
pred1 = model1.predict(X_test1)
print("\nR2 score without filling missing values:", r2_score(y_test1, pred1))

# Strategy B: mean imputation.
# Split the raw (still incomplete) frame first, then fill both partitions with the
# means of the TRAINING rows only. Computing the means before the split would let
# test-set values influence the imputed training data (data leakage).
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df, target, test_size=0.2, random_state=42
)
train_means = X_train2.mean()  # NaNs are skipped by default
X_train2 = X_train2.fillna(train_means)
X_test2 = X_test2.fillna(train_means)

model2 = LinearRegression().fit(X_train2, y_train2)
pred2 = model2.predict(X_test2)
# NOTE: df_dropped has fewer rows than df, so the two splits select different test
# rows; the two R2 values are indicative rather than a strict like-for-like comparison.
print("R2 score after filling missing values with mean:", r2_score(y_test2, pred2))





In [None]:
# 2. Duplicate Data: Repeated data points can skew analysis and model results.
#     Task 1: Identify and remove duplicate entries from a dataset using a programming language or tool.
#     Task 2: Document the before-and-after dataset shape to understand the impact of duplicates.
#     Task 3: Explain to a classmate how duplicate data can affect prediction accuracy.
    
import pandas as pd

# Toy table in which some records appear more than once
data = dict(
    id=[1, 2, 2, 3, 4, 4, 4, 5],
    name=['Alice', 'Bob', 'Bob', 'Charlie', 'David', 'David', 'David', 'Eve'],
    score=[85, 90, 90, 95, 80, 80, 80, 88],
)

df = pd.DataFrame(data)

# Task 1: flag each row that repeats an earlier row (the first occurrence stays False)
duplicates = df.duplicated(keep='first')
print("Duplicate rows (True indicates duplicate):\n", duplicates)

# Task 2: keep only the unflagged rows and report the shape before and after
shape_before = df.shape
print("\nOriginal dataset shape:", shape_before)
df_no_duplicates = df.loc[~duplicates]
shape_after = df_no_duplicates.shape
print("Dataset shape after removing duplicates:", shape_after)

# Task 3: Explanation (as comment)
# Repeated rows give some observations more weight than they deserve. A model
# counts every copy as independent evidence, so its fit is pulled toward the
# duplicated records and its predictions can become skewed or inaccurate.

    

In [None]:
# 3. Incorrect Data Types: Data stored in incorrect formats can lead to parsing errors or incorrect analysis.
#     Task 1: Convert a column of string numbers to integers in a dataset.
#     Task 2: Identify and correct columns with inconsistent data types in a dataset.
#     Task 3: Discuss why correct data types are critical for feature engineering.
import pandas as pd

# Sample dataset with incorrect data types
data = {
    'id': ['1', '2', '3', '4'],         # numbers as strings
    'age': [25, '30', '35', 40],        # mixed types: int and string
    'salary': ['50000', '60000', '70000', '80000']  # numbers as strings
}

df = pd.DataFrame(data)

# Task 1: Convert a column of string numbers to integers
df['id'] = df['id'].astype(int)
print("After converting 'id' to integers:\n", df['id'])

# Task 2: Identify columns with inconsistent data types.
# Listing the Python type of every element exposes columns that mix types
# (more than one type listed) or hold numbers as strings (only str listed).
for col in df.columns:
    types = df[col].apply(type).unique()
    print(f"Column '{col}' has data types: {types}")

# Correct inconsistent data types in 'age' column.
# errors='coerce' turns unparseable entries into missing values instead of raising;
# the nullable 'Int64' dtype keeps the column integer-typed even if that happens.
df['age'] = pd.to_numeric(df['age'], errors='coerce').astype('Int64')
print("\nAfter correcting 'age' column data types:\n", df['age'])

# 'salary' also holds numbers as strings; convert it the same way so arithmetic
# on it adds values instead of concatenating text.
df['salary'] = pd.to_numeric(df['salary'], errors='coerce').astype('Int64')
print("\nAfter correcting 'salary' column data types:\n", df['salary'])

# Task 3: Explanation (as comment)
# Correct data types are critical for feature engineering because many
# operations, like mathematical transformations, aggregations, or scaling,
# require numeric types. Incorrect types may cause errors or inaccurate features
# (for example, "summing" string numbers concatenates them).

    
    

In [None]:
# 4. Outliers & Inconsistencies: Irregularities in data can mislead statistical analysis and model predictions.
#     Task 1: Visualize a dataset and identify outliers using a boxplot.
#     Task 2: Remove or adjust outliers and re-analyze the dataset.
#     Task 3: Research and report on a technique for handling outliers effectively.
    
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Small age sample containing one extreme value (100)
data = {'age': [22, 25, 30, 24, 100, 28, 26, 27, 29, 24]}
df = pd.DataFrame(data)
ages = df['age']

# Task 1: draw a boxplot of the raw ages to inspect them for outliers
plt.boxplot(ages)
plt.title("Boxplot of Age")
plt.show()

# Task 2: IQR rule - keep only values within 1.5 * IQR of the quartiles
Q1, Q3 = ages.quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# between() includes both bounds, matching a >= lower / <= upper filter
df_no_outliers = df[ages.between(lower_bound, upper_bound)]

# Re-analyze: the same boxplot on the filtered data
plt.boxplot(df_no_outliers['age'])
plt.title("Boxplot of Age After Removing Outliers")
plt.show()

print("Dataset after removing outliers:\n", df_no_outliers)

# Task 3: Explanation (as comment)
# A common outlier technique is the IQR method: any value below Q1 - 1.5*IQR
# or above Q3 + 1.5*IQR is treated as an outlier. Depending on the use case,
# such values can be removed (as done here), capped at the bounds, or transformed.

    