In [None]:
# Steps in Data Preprocessing

# 1. Data Collection: Gathering raw data from various sources.
# Task 1: Collect data from two different sources and merge them.
# Task 2: Validate the integrity of the collected datasets.
# Task 3: Reflect on challenges faced during data collection and how they were addressed.


import pandas as pd

# Task 1: Collect data from two different sources and merge them
# For demonstration, two small in-memory DataFrames stand in for the sources.
# They share the same schema and overlap on one record (id 3).

data_source1 = {
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35]
}

data_source2 = {
    'id': [3, 4, 5],
    'name': ['Charlie', 'David', 'Eve'],
    'age': [35, 40, 28]
}

df1 = pd.DataFrame(data_source1)
df2 = pd.DataFrame(data_source2)

# Outer merge on *every* shared column (not just 'id'): this behaves like a
# union of the two sources. Records that are identical in both sources
# collapse into one row; every other record is kept.
merged_df = pd.merge(df1, df2, on=['id', 'name', 'age'], how='outer')

print("Merged DataFrame:")
print(merged_df)

# Task 2: Validate integrity of the merged dataset
# Check for fully identical rows
duplicates = merged_df.duplicated()
print("\nDuplicate rows in merged dataset:")
print(merged_df[duplicates])

# Check for conflicting records: because the merge keys include 'name' and
# 'age', two sources that disagree about the same id would produce two rows
# with that id. Full-row duplicated() cannot see this, so check the key alone.
conflicting_ids = merged_df[merged_df.duplicated(subset='id', keep=False)]
print("\nRecords sharing an id but differing in other fields:")
print(conflicting_ids)

# Check for missing values
missing_values = merged_df.isnull().sum()
print("\nMissing values in each column:")
print(missing_values)

# Task 3: Reflection (as comment)
# Challenges faced during data collection:
# - Data format inconsistencies between sources
# - Overlapping records requiring careful merging to avoid duplicates
# - Missing or inconsistent data fields
# Addressed by:
# - Standardizing column names and formats before merging
# - Using outer joins to avoid loss of records
# - Validating merged data for duplicate rows, duplicate ids and missing values


In [None]:
# 2. Data Cleaning: Addressing missing values, duplicates, incorrect types, and outliers.
# Task 1: Clean a given dataset and document the changes made.
# Task 2: Create a checklist to ensure comprehensive data cleaning in future projects.
# Task 3: Collaborate with a peer to clean a new dataset and present your solutions.

import pandas as pd
import numpy as np

# Sample dataset with deliberate quality problems: a repeated id, a missing
# name, ages stored as a mix of strings and ints (one non-numeric), a missing
# salary and one extreme salary.
data = {
    'id': [1, 2, 2, 4, 5],
    'name': ['Alice', 'Bob', 'Bob', 'David', None],
    'age': ['25', 'thirty', '30', 40, 35],
    'salary': [50000, 60000, 60000, 800000, np.nan]
}

df = pd.DataFrame(data)

# Remove exact duplicate rows. The explicit .copy() makes df_cleaned an
# independent frame, so the column assignments below never touch (or warn
# about) the original df.
# NOTE: the two id == 2 rows differ in 'age' ('thirty' vs '30'), so they are
# not exact duplicates and both are kept.
df_cleaned = df.drop_duplicates().copy()

# Coerce age to numeric; unparseable entries such as 'thirty' become NaN.
df_cleaned['age'] = pd.to_numeric(df_cleaned['age'], errors='coerce')

# Impute missing values with the column median. Assign the result back rather
# than calling fillna(inplace=True) on a column selection: that chained
# in-place form does not update the frame under pandas Copy-on-Write.
age_median = df_cleaned['age'].median()
df_cleaned['age'] = df_cleaned['age'].fillna(age_median)

salary_median = df_cleaned['salary'].median()
df_cleaned['salary'] = df_cleaned['salary'].fillna(salary_median)

# IQR fences for salary. Only the upper fence is applied below; lower_bound
# is computed but no low-side replacement is performed.
q1 = df_cleaned['salary'].quantile(0.25)
q3 = df_cleaned['salary'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# Replace high outliers with the median computed before outlier handling.
df_cleaned.loc[df_cleaned['salary'] > upper_bound, 'salary'] = salary_median

print("Original DataFrame:")
print(df)
print("\nCleaned DataFrame:")
print(df_cleaned)


In [None]:
# 3. Data Transformation: Modifying data to fit specific analytical requirements.
# Task 1: Transform a date column into separate 'day', 'month', and 'year' columns.
# Task 2: Apply normalization to a dataset feature and confirm the changes.
# Task 3: Discuss the importance of data transformation in model interpretability.

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sample records: date strings plus a single numeric feature.
data = {
    'date': ['2023-01-15', '2023-03-22', '2023-07-30'],
    'feature': [10, 15, 20]
}

df = pd.DataFrame(data)

# Task 1: parse the strings into datetimes, then pull each calendar
# component out into its own column via the .dt accessor.
df['date'] = pd.to_datetime(df['date'])
for part in ('day', 'month', 'year'):
    df[part] = getattr(df['date'].dt, part)

# Task 2: min-max scale the feature and store it alongside the original
# column so the two can be compared in the printed frame.
scaler = MinMaxScaler()
feature_scaled = scaler.fit_transform(df[['feature']])
df['feature_normalized'] = feature_scaled

print(df)



In [None]:
# 4. Feature Scaling: Adjusting data features to a common scale.
# Task 1: Apply Min-Max scaling to a dataset.
# Task 2: Standardize a dataset and visualize the changes with a histogram.
# Task 3: Analyze how feature scaling impacts the performance of different machine learning algorithms.
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import matplotlib.pyplot as plt

# Two features on different scales (feature2 is 10x feature1).
data = {
    'feature1': [10, 20, 30, 40, 50],
    'feature2': [100, 200, 300, 400, 500]
}

df = pd.DataFrame(data)

# Task 1: Min-Max scaling, wrapped back into a DataFrame with the original
# column names.
min_max_scaler = MinMaxScaler()
minmax_values = min_max_scaler.fit_transform(df)
df_minmax_scaled = pd.DataFrame(minmax_values, columns=df.columns)

# Task 2: standardization, wrapped the same way.
standard_scaler = StandardScaler()
standard_values = standard_scaler.fit_transform(df)
df_standard_scaled = pd.DataFrame(standard_values, columns=df.columns)

# Side-by-side histograms of feature1 under each scaling.
plt.figure(figsize=(12, 5))

panels = [
    (df_minmax_scaled, 'Min-Max Scaled feature1'),
    (df_standard_scaled, 'Standard Scaled feature1'),
]
for position, (scaled_df, panel_title) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.hist(scaled_df['feature1'], bins=5, alpha=0.7)
    plt.title(panel_title)

plt.show()





In [None]:
# 5. Feature Engineering: Creating new features from existing ones to improve model accuracy.
# Task 1: Create a new synthetic feature from existing dataset features.
# Task 2: Evaluate the impact of new features on model accuracy.
# Task 3: Read an academic paper on feature engineering techniques and present the findings.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Small labelled sample: two raw features and a binary purchase outcome.
data = {
    'age': [25, 45, 35, 50, 23, 40, 60, 30],
    'salary': [50000, 80000, 60000, 90000, 48000, 75000, 100000, 52000],
    'purchased': [0, 1, 0, 1, 0, 1, 1, 0]
}

df = pd.DataFrame(data)

# Task 1: synthetic feature derived from the two existing columns.
df['age_salary_ratio'] = df['age'] / df['salary']

base_features = ['age', 'salary']
X = df[base_features]
X_new = df[base_features + ['age_salary_ratio']]
y = df['purchased']

# Split ONCE on the widest feature set, then derive the baseline matrices by
# column selection. Both models are thereby guaranteed to train and test on
# exactly the same rows and labels, instead of relying on two separate
# train_test_split calls happening to share a random_state.
X_new_train, X_new_test, y_train, y_test = train_test_split(X_new, y, random_state=42)
X_train = X_new_train[base_features]
X_test = X_new_test[base_features]

# Task 2: baseline model on the original features only.
model_original = LogisticRegression()
model_original.fit(X_train, y_train)
y_pred_original = model_original.predict(X_test)
accuracy_original = accuracy_score(y_test, y_pred_original)

# Same estimator with the engineered feature added.
model_new = LogisticRegression()
model_new.fit(X_new_train, y_train)
y_pred_new = model_new.predict(X_new_test)
accuracy_new = accuracy_score(y_test, y_pred_new)

print(f'Accuracy without new feature: {accuracy_original:.2f}')
print(f'Accuracy with new feature: {accuracy_new:.2f}')


