'''

@Author: Jayesh Patil 

@Date: 11-11-24 

@Title: Data preprocessing

'''

Import libraries

In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import statistics as st
import warnings
warnings.filterwarnings('ignore')
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler



Import the dataset

In [9]:
df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


Analysing Dataset

In [15]:
print(df.describe())
print(df.shape)
print("Median Age:", st.median(df['Age']))
print("Mode Age:", st.mode(df['Age']))

             Age        Salary
count   9.000000      9.000000
mean   38.777778  63777.777778
std     7.693793  12265.579662
min    27.000000  48000.000000
25%    35.000000  54000.000000
50%    38.000000  61000.000000
75%    44.000000  72000.000000
max    50.000000  83000.000000
(10, 4)
Median Age: 39.0
Mode Age: 44.0


In [12]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [13]:
print(X)
print(y)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


Handling the missing values

In [16]:
null_values = df.isnull().sum()
print("null values in dataset :",null_values)

null values in dataset : Country      0
Age          1
Salary       1
Purchased    0
dtype: int64


In [17]:
# fill missing values
df['Age'].fillna(df['Age'].mean(),inplace=True)
df['Salary'].fillna(df['Salary'].median(),inplace=True)
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,61000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,38.777778,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [19]:
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X[:, 1:3])
X[:, 1:3] = imputer.transform(X[:, 1:3])
print(X)

[['France' 44.0 72000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63777.77777777778]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


Handling Categorical data

In [27]:
ct = ColumnTransformer(
    transformers=[
        ('country_encoder', OneHotEncoder(), ['Country']),
        ('Purchased_encoder', OneHotEncoder(), ['Purchased'])
    ],
    remainder='passthrough'  # This keeps the other columns unchanged
)

# Fit and transform the data
transformed = ct.fit_transform(df)

# Get feature names
country_names = ct.named_transformers_['country_encoder'].get_feature_names_out(['Country'])
purchased_names = ct.named_transformers_['Purchased_encoder'].get_feature_names_out(['Purchased'])

# Combine feature names with unchanged columns
all_feature_names = list(country_names) + list(purchased_names) + ['Age', 'Salary']

# Check shapes for debugging
print(f"Transformed shape: {transformed.shape}")  # Should print (10, 5)
print(f"Number of feature names: {len(all_feature_names)}")  # Should print 5

# Create a new DataFrame with transformed data
transformed_df = pd.DataFrame(transformed, columns=all_feature_names)

print(transformed_df)

Transformed shape: (10, 7)
Number of feature names: 7
   Country_France  Country_Germany  Country_Spain  Purchased_No  \
0             1.0              0.0            0.0           1.0   
1             0.0              0.0            1.0           0.0   
2             0.0              1.0            0.0           1.0   
3             0.0              0.0            1.0           1.0   
4             0.0              1.0            0.0           0.0   
5             1.0              0.0            0.0           0.0   
6             0.0              0.0            1.0           1.0   
7             1.0              0.0            0.0           0.0   
8             0.0              1.0            0.0           1.0   
9             1.0              0.0            0.0           0.0   

   Purchased_Yes        Age   Salary  
0            0.0  44.000000  72000.0  
1            1.0  27.000000  48000.0  
2            0.0  30.000000  54000.0  
3            0.0  38.000000  61000.0  
4            

In [24]:
## Using LabelEncoder
label_encoder = LabelEncoder()

transform = label_encoder.fit_transform(df['Country'])
transform_df = pd.DataFrame(transform, columns=['Country_Encoded'])

transform1 = label_encoder.fit_transform(df['Purchased'])
transform_df1 = pd.DataFrame(transform1, columns=['Purchased_Encoded'])

# Combine with the original DataFrame if necessary
df_transformed = pd.concat([df, transform_df,transform_df1], axis=1)
df_transformed.drop(['Country','Purchased'],axis=1,inplace=True)
df_transformed

Unnamed: 0,Age,Salary,Country_Encoded,Purchased_Encoded
0,44.0,72000.0,0,0
1,27.0,48000.0,2,1
2,30.0,54000.0,1,0
3,38.0,61000.0,2,0
4,40.0,61000.0,1,1
5,35.0,58000.0,0,1
6,38.777778,52000.0,2,0
7,48.0,79000.0,0,1
8,50.0,83000.0,1,0
9,37.0,67000.0,0,1


Splitting the dataset

In [25]:
# Splitting the dataset into training and test sets
# 80% training, 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [36]:
# Output to verify the splits
print("Training set (X_train):")
print(X_train)
print("\nTest set (X_test):")
print(X_test)

print("\nTraining labels (y_train):")
print(y_train)
print("\nTest labels (y_test):")
print(y_test)

Training set (X_train):
[[0.0 0.0 1.0 38.77777777777778 52000.0]
 [0.0 1.0 0.0 40.0 63777.77777777778]
 [1.0 0.0 0.0 44.0 72000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 35.0 58000.0]]

Test set (X_test):
[[0.0 1.0 0.0 30.0 54000.0]
 [1.0 0.0 0.0 37.0 67000.0]]

Training labels (y_train):
['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes']

Test labels (y_test):
['No' 'Yes']


Feature scaling

In [34]:
stsc = StandardScaler()
X_train_scaled = stsc.fit_transform(X_train)
X_test_scaled = stsc.fit_transform(X_test)  # Use 'transform' instead of 'fit_transform' for test data
print(X_train_scaled)
print(X_test_scaled)

[[-0.77459667 -0.57735027  1.29099445 -0.19159184 -1.07812594]
 [-0.77459667  1.73205081 -0.77459667 -0.01411729 -0.07013168]
 [ 1.29099445 -0.57735027 -0.77459667  0.56670851  0.63356243]
 [-0.77459667 -0.57735027  1.29099445 -0.30453019 -0.30786617]
 [-0.77459667 -0.57735027  1.29099445 -1.90180114 -1.42046362]
 [ 1.29099445 -0.57735027 -0.77459667  1.14753431  1.23265336]
 [-0.77459667  1.73205081 -0.77459667  1.43794721  1.57499104]
 [ 1.29099445 -0.57735027 -0.77459667 -0.74014954 -0.56461943]]
[[-1.  1.  0. -1. -1.]
 [ 1. -1.  0.  1.  1.]]


In [35]:
mima = MinMaxScaler()
stsc = StandardScaler()
X_train_scaled = mima.fit_transform(X_train)
X_test_scaled = mima.fit_transform(X_test)
print(X_train_scaled)
print(X_test_scaled)

[[0.         0.         1.         0.51207729 0.11428571]
 [0.         1.         0.         0.56521739 0.45079365]
 [1.         0.         0.         0.73913043 0.68571429]
 [0.         0.         1.         0.47826087 0.37142857]
 [0.         0.         1.         0.         0.        ]
 [1.         0.         0.         0.91304348 0.88571429]
 [0.         1.         0.         1.         1.        ]
 [1.         0.         0.         0.34782609 0.28571429]]
[[0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 1.]]
