In [4]:
# Question 1

import pandas as pd

# Load the dataset
data = 'Dataset_for_Python_Questions.csv'
dataset = pd.read_csv(data)

# Identify columns with missing values
missing_values = dataset.isnull().sum()
missing_columns = missing_values[missing_values > 0].index.tolist()

# Replace missing numerical values with the column mean
for column in dataset.select_dtypes(include=['float64', 'int64']).columns:
    dataset[column].fillna(dataset[column].mean(), inplace=True)

# Replace missing categorical values with the most frequent value
for column in dataset.select_dtypes(include=['object']).columns:
    dataset[column].fillna(dataset[column].mode()[0], inplace=True)

missing_columns, dataset

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset[column].fillna(dataset[column].mode()[0], inplace=True)


(['Age', 'Income'],
          Age   Income  Gender Region      City Purchased
 0  25.000000  50000.0    Male  North  New York       Yes
 1  30.000000  60000.0  Female  South    London        No
 2  35.000000  75000.0  Female   East     Tokyo       Yes
 3  40.000000  80000.0    Male   West     Paris        No
 4  33.888889  45000.0    Male  North  New York       Yes
 5  50.000000  59875.0  Female  South    London        No
 6  28.000000  52000.0  Female   East     Tokyo       Yes
 7  32.000000  62000.0    Male   West     Paris        No
 8  36.000000  59875.0    Male  North  New York       Yes
 9  29.000000  55000.0  Female   East     Tokyo        No)

In [5]:
# Question 2

# Simulating dataset for demonstration
data = { 'date_column': ['2024-01-01', '2024-02-15', '2024-03-20'] }
date_df = pd.DataFrame(data)

# Convert the date column into datetime objects
date_df['date_column'] = pd.to_datetime(date_df['date_column'])

# Extract day, month, and year into separate columns
date_df['day'] = date_df['date_column'].dt.day
date_df['month'] = date_df['date_column'].dt.month
date_df['year'] = date_df['date_column'].dt.year

date_df[['day', 'month', 'year']]

Unnamed: 0,day,month,year
0,1,1,2024
1,15,2,2024
2,20,3,2024


In [6]:
# Count the number of rows before removing duplicates
duplicates_before = dataset.shape[0]  # Get the total number of rows in the original dataset

# Remove duplicate rows from the dataset
dataset_cleaned = dataset.drop_duplicates()  # Create a new DataFrame without duplicates

# Count the number of rows after duplicates have been removed
duplicates_after = dataset_cleaned.shape[0]  # Get the total number of rows in the cleaned dataset

# Calculate how many duplicates were removed
duplicates_removed = duplicates_before - duplicates_after  # Subtract to find the count of removed duplicates

# Output the number of duplicates removed and the cleaned dataset
duplicates_removed, dataset_cleaned  # Return the count of duplicates removed and the cleaned DataFrame

(0,
          Age   Income  Gender Region      City Purchased
 0  25.000000  50000.0    Male  North  New York       Yes
 1  30.000000  60000.0  Female  South    London        No
 2  35.000000  75000.0  Female   East     Tokyo       Yes
 3  40.000000  80000.0    Male   West     Paris        No
 4  33.888889  45000.0    Male  North  New York       Yes
 5  50.000000  59875.0  Female  South    London        No
 6  28.000000  52000.0  Female   East     Tokyo       Yes
 7  32.000000  62000.0    Male   West     Paris        No
 8  36.000000  59875.0    Male  North  New York       Yes
 9  29.000000  55000.0  Female   East     Tokyo        No)

In [7]:
# Question 4

# Identify outliers using the IQR method for the 'Income' column
Q1 = dataset['Income'].quantile(0.25)
Q3 = dataset['Income'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
outliers_removed = dataset[(dataset['Income'] >= lower_bound) & (dataset['Income'] <= upper_bound)]
rows_removed = dataset.shape[0] - outliers_removed.shape[0]

rows_removed, outliers_removed

(2,
          Age   Income  Gender Region      City Purchased
 0  25.000000  50000.0    Male  North  New York       Yes
 1  30.000000  60000.0  Female  South    London        No
 4  33.888889  45000.0    Male  North  New York       Yes
 5  50.000000  59875.0  Female  South    London        No
 6  28.000000  52000.0  Female   East     Tokyo       Yes
 7  32.000000  62000.0    Male   West     Paris        No
 8  36.000000  59875.0    Male  North  New York       Yes
 9  29.000000  55000.0  Female   East     Tokyo        No)

In [8]:
# Question 5

from sklearn.preprocessing import LabelEncoder

# Sample data for the City column
cities = ['New York', 'London', 'Tokyo', 'Paris']

# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the City labels
encoded_cities = label_encoder.fit_transform(cities)
encoded_cities

array([1, 0, 3, 2])

In [9]:
# Question 6

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the dataset again for transformation
dataset = pd.read_csv(file_path)

# Initialize the ColumnTransformer
column_transformer = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['Age', 'Income']),
        ('cat', OneHotEncoder(), ['City'])
    ])

# Apply transformations
transformed_data = column_transformer.fit_transform(dataset)
transformed_data

array([[-1.24484409, -0.86490014,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.54461929,  0.0109481 ,  1.        ,  0.        ,  0.        ,
         0.        ],
       [ 0.15560551,  1.32472047,  0.        ,  0.        ,  0.        ,
         1.        ],
       [ 0.85583031,  1.7626446 ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [        nan, -1.30282427,  0.        ,  1.        ,  0.        ,
         0.        ],
       [ 2.25627991,         nan,  1.        ,  0.        ,  0.        ,
         0.        ],
       [-0.82470921, -0.68973049,  0.        ,  0.        ,  0.        ,
         1.        ],
       [-0.26452937,  0.18611775,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.29565047,         nan,  0.        ,  1.        ,  0.        ,
         0.        ],
       [-0.68466425, -0.42697602,  0.        ,  0.        ,  0.        ,
         1.        ]])

In [10]:
# Question 7

from sklearn.model_selection import train_test_split

# Split the dataset into features and target variable
features = dataset.drop(columns=['Purchased'])
target = dataset['Purchased']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

X_train, X_test, y_train, y_test

(    Age   Income  Gender Region      City
 5  50.0      NaN  Female  South    London
 0  25.0  50000.0    Male  North  New York
 7  32.0  62000.0    Male   West     Paris
 2  35.0  75000.0  Female   East     Tokyo
 9  29.0  55000.0  Female   East     Tokyo
 4   NaN  45000.0    Male  North  New York
 3  40.0  80000.0    Male   West     Paris
 6  28.0  52000.0  Female   East     Tokyo,
     Age   Income  Gender Region      City
 8  36.0      NaN    Male  North  New York
 1  30.0  60000.0  Female  South    London,
 5     No
 0    Yes
 7     No
 2    Yes
 9     No
 4    Yes
 3     No
 6    Yes
 Name: Purchased, dtype: object,
 8    Yes
 1     No
 Name: Purchased, dtype: object)

In [11]:
# Question 8

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Selecting numerical features for scaling
numerical_features = dataset[['Age', 'Income']]

# Scale using StandardScaler
standard_scaler = StandardScaler()
standard_scaled = standard_scaler.fit_transform(numerical_features)

# Scale using MinMaxScaler
minmax_scaler = MinMaxScaler()
minmax_scaled = minmax_scaler.fit_transform(numerical_features)

standard_scaled, minmax_scaled

(array([[-1.24484409, -0.86490014],
        [-0.54461929,  0.0109481 ],
        [ 0.15560551,  1.32472047],
        [ 0.85583031,  1.7626446 ],
        [        nan, -1.30282427],
        [ 2.25627991,         nan],
        [-0.82470921, -0.68973049],
        [-0.26452937,  0.18611775],
        [ 0.29565047,         nan],
        [-0.68466425, -0.42697602]]),
 array([[0.        , 0.14285714],
        [0.2       , 0.42857143],
        [0.4       , 0.85714286],
        [0.6       , 1.        ],
        [       nan, 0.        ],
        [1.        ,        nan],
        [0.12      , 0.2       ],
        [0.28      , 0.48571429],
        [0.44      ,        nan],
        [0.16      , 0.28571429]]))

In [13]:
import pandas as pd

# Load the dataset again for comprehensive processing
dataset = pd.read_csv(file_path)

# Handle missing values: replace Age with mean and Income with median.
dataset['Age'].fillna(dataset['Age'].mean(), inplace=True)
dataset['Income'].fillna(dataset['Income'].median(), inplace=True)

# Encode Gender using label encoding.
label_encoder = LabelEncoder()
dataset['Gender'] = label_encoder.fit_transform(dataset['Gender'])

# One-hot encode Region.
dataset = pd.get_dummies(dataset, columns=['Region'], drop_first=True)

# Split into features and target variable.
features = dataset.drop(columns=['Purchased'])
target = dataset['Purchased']

# Split into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Scale numerical features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[['Age', 'Income']])
X_test_scaled = scaler.transform(X_test[['Age', 'Income']])

X_train_scaled, X_test_scaled, y_train, y_test

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Age'].fillna(dataset['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  dataset['Income'].fillna(dataset['Income'].median(), inplace=True)


(array([[ 2.14510726, -0.18022565],
        [-1.2300615 , -0.83559163],
        [-0.28501425,  0.21299395],
        [ 0.120006  ,  1.34896166],
        [-0.6900345 , -0.39868098],
        [-0.0300015 , -1.27250229],
        [ 0.79503975,  1.78587232],
        [-0.82504125, -0.66082737]]),
 array([[ 0.25501275, -0.18022565],
        [-0.55502775,  0.03822968]]),
 5     No
 0    Yes
 7     No
 2    Yes
 9     No
 4    Yes
 3     No
 6    Yes
 Name: Purchased, dtype: object,
 8    Yes
 1     No
 Name: Purchased, dtype: object)