In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. Load the dataset
dataset = pd.read_csv("C:/vscodefolder/AML_Lab/Datasets/Data.csv")
print(f'Dataset shape: {dataset.shape}')
print('Subset of the dataset:\n', dataset.head())

# 2. Separate the dependent and independent variables
X = dataset.iloc[:, :-1].values  # Features (including categorical)
Y = dataset.iloc[:, -1].values   # Target
print('Subset of Features X:\n', X[:5])
print('Subset of Target Y:\n', Y[:5])

# 3. Handle categorical data first
X[:, 0] = LabelEncoder().fit_transform(X[:, 0])
X = OneHotEncoder(sparse_output=False).fit_transform(X)
print('Subset of Features X after encoding:\n', X[:5])

# Convert back to DataFrame to handle missing values in numerical columns
df = pd.DataFrame(X)

# 4. Handle missing values by replacing with mean or interpolation
df.fillna(df.mean(), inplace=True)
df.interpolate(inplace=True)
X = df.values
print('Subset of Features X after handling missing values:\n', X[:5])

# 5. Split the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

# 6. Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print('Subset of Features X_train after scaling:\n', X_train[:5])
print('Subset of Features X_test after scaling:\n', X_test[:5])

print('Data preprocessing completed.')

Dataset shape: (15, 4)
Subset of the dataset:
      Country   Age   Salary Purchased
0      India  34.0  92000.0       Yes
1  Sri lanka  22.0  25000.0       Yes
2      China  31.0  74000.0       Yes
3  Sri lanka  29.0      NaN        No
4      China  55.0  98000.0       Yes
Subset of Features X:
 [['India' 34.0 92000.0]
 ['Sri lanka' 22.0 25000.0]
 ['China' 31.0 74000.0]
 ['Sri lanka' 29.0 nan]
 ['China' 55.0 98000.0]]
Subset of Target Y:
 ['Yes' 'Yes' 'Yes' 'No' 'Yes']
Subset of Features X after encoding:
 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0

In [7]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. Load the dataset
dataset = pd.read_csv("C:/vscodefolder/AML_Lab/Datasets/Data.csv")
print(f'Dataset shape: {dataset.shape}')
print('Subset of the dataset:\n', dataset.head())

# 2. Check for missing values
print('Missing values in dataset before handling:\n', dataset.isnull().sum())

# 3. Handle missing values
# Fill missing 'Age' with the median age
dataset['Age'] = dataset['Age'].fillna(dataset['Age'].median())
# Fill missing 'Salary' with the median salary
dataset['Salary'] = dataset['Salary'].fillna(dataset['Salary'].median())

# Check for missing values again
print('Missing values in dataset after handling:\n', dataset.isnull().sum())

# 4. Encode categorical data
# Encode 'Country' using OneHotEncoder
X = dataset[['Country', 'Age', 'Salary']].values
Y = dataset['Purchased'].values

# Use LabelEncoder for 'Purchased' target variable
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

# OneHotEncoding for 'Country' feature
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
onehotencoder = OneHotEncoder(sparse_output=False, drop='first')  # drop='first' to avoid multicollinearity
X = onehotencoder.fit_transform(X)
print('Subset of Features X after encoding:\n', X[:1])

# 5. Split the dataset into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print(f'X_train shape: {X_train.shape}, X_test shape: {X_test.shape}')

# 6. Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print('Subset of Features X_train after scaling:\n', X_train[:1])
print('Subset of Features X_test after scaling:\n', X_test[:1])

print('Data preprocessing completed.')


Dataset shape: (15, 4)
Subset of the dataset:
      Country   Age   Salary Purchased
0      India  34.0  92000.0       Yes
1  Sri lanka  22.0  25000.0       Yes
2      China  31.0  74000.0       Yes
3  Sri lanka  29.0      NaN        No
4      China  55.0  98000.0       Yes
Missing values in dataset before handling:
 Country      0
Age          1
Salary       1
Purchased    0
dtype: int64
Missing values in dataset after handling:
 Country      0
Age          0
Salary       0
Purchased    0
dtype: int64
Subset of Features X after encoding:
 [[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0.]]
X_train shape: (12, 28), X_test shape: (3, 28)
Subset of Features X_train after scaling:
 [[ 1.         -0.57735027  0.         -0.30151134 -0.30151134  0.
  -0.30151134 -0.30151134 -0.30151134 -0.4472136  -0.30151134 -0.30151134
   3.31662479  0.         -0.30151134 -0.30151134  0.         -0.4472136
   0.         -0.30151134 -0.30151134 -0.30151134 -0.30151134

: 