## Data preprocessing - Handling missing values 

In [39]:
import pandas as pd

# Read the placement records from the CSV that sits next to the notebook and
# echo the resulting frame so its columns can be inspected.
dataset = pd.read_csv(filepath_or_buffer="Placement.csv")
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


In [40]:
# Step 1: locate the missing values.
# isnull() flags every missing entry; summing down the rows (axis=0) gives the
# number of missing entries in each column.
dataset.isnull().sum(axis=0)

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [41]:
# Equivalent spelling: isna() is an alias of isnull(), so the per-column
# counts are the same as in the previous cell.
dataset.isna().sum(axis="index")

sl_no              0
gender             0
ssc_p              0
ssc_b              0
hsc_p              0
hsc_b              0
hsc_s              0
degree_p           0
degree_t           0
workex             0
etest_p            0
specialisation     0
mba_p              0
status             0
salary            67
dtype: int64

In [42]:
# 2. Find the cause of the missing values.
# NOTE: the explanation below is a bare string expression, not a comment; as the
# last expression in the cell it is also echoed back as the cell's output.
# NOTE(review): the sample figures quoted in the text (23000; 20,000-40,000) appear
# to be illustrative only - the rows displayed from the CSV show salaries in the
# 200000-425000 range.
"""
status      salary 
Placed      23000
Not Placed	NaN


In the MBA placement dataset, students with a “Not Placed” status have missing salary values, while placed students have salary values such as 20,000,
30,000, or 40,000.

There are four common methods to handle missing values:
1.Replace using central tendency (mean, median, or mode).
2.Delete the entire column.
3.Replace with a value based on the problem statement or domain knowledge.
4.Use a predictive model (semi-supervised approach) to estimate missing values.

For this dataset, the salary column falls under the third option. According to the problem context, students who are not placed do 
not receive any salary. Therefore, replacing missing salary values with 0 is the most appropriate approach.
Using mean, median, or mode would produce incorrect values, and deleting the salary column is not suitable because it is an important feature. 
Hence, replacing missing salary values with zero for not-placed students is the correct and logical solution.
"""

'\nstatus      salary \nPlaced      23000\nNot Placed\tNaN\n\n\nIn the MBA placement dataset, students with a “Not Placed” status have missing salary values, while placed students have salary values such as 20,000,\n30,000, or 40,000.\n\nThere are four common methods to handle missing values:\n1.Replace using central tendency (mean, median, or mode).\n2.Delete the entire column.\n3.Replace with a value based on the problem statement or domain knowledge.\n4.Use a predictive model (semi-supervised approach) to estimate missing values.\n\nFor this dataset, the salary column falls under the third option. According to the problem context, students who are not placed do \nnot receive any salary. Therefore, replacing missing salary values with 0 is the most appropriate approach.\nUsing mean, median, or mode would produce incorrect values, and deleting the salary column is not suitable because it is an important feature. \nHence, replacing missing salary values with zero for not-placed student

In [43]:
# Step 3: replace the missing values.
# Only the salary column has gaps; write 0 into those gaps and store the
# filled column back on the same frame.
dataset["salary"] = dataset["salary"].fillna(0)

In [44]:
# Re-display the frame to confirm the fill: rows that had no salary should now show 0.0.
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


## Other methods (for practice)

In [45]:
# Other methods - for practice.
# Central tendency on a single column: fill the missing salaries with the column mean.
# The raw CSV is re-read first because the earlier zero fill already replaced every NaN
# in `dataset`; without a fresh copy this fill would have nothing to replace, and the
# mean itself would be pulled down by the zeros that were written in.
dataset = pd.read_csv("Placement.csv")
salary_mean = dataset["salary"].mean()  # NaN entries are skipped by default
dataset = dataset.fillna({"salary": salary_mean})

In [12]:
# Inspect the frame after the mean-based fillna in the previous cell.
dataset

In [34]:
# Deleting entire columns (axis=1): drop every column that contains at least one null.
# Start from a freshly loaded frame: by this point `dataset` has already had its salary
# nulls filled in, so dropna would otherwise find nothing to remove.
dataset = pd.read_csv("Placement.csv").dropna(axis=1)

In [36]:
dataset     # salary, the only column that contained nulls, has been dropped

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed


In [None]:
# Use the project's Univariate helper to split the columns into two groups.
# Presumably `quan` holds the quantitative (numeric) column names and `qual` the
# qualitative ones - confirm against Univariate_analysis.quanQual.
from Univariate_analysis import Univariate
# NOTE(review): this rebinds the name `Univariate` from the class to an instance, so the
# class itself is no longer reachable under that name until the import runs again.
Univariate=Univariate(dataset)
quan, qual = Univariate.quanQual()

In [None]:
# Preview the first rows of the columns listed in `quan`.
dataset[quan].head()

In [None]:
# Central tendency across several columns at once, using scikit-learn.
from sklearn.impute import SimpleImputer
import numpy as np

# Learn the mean of every column listed in `quan` and substitute it for that column's
# NaN entries in a single fit-and-transform pass.
# Note that `dataset` is rebound to the imputer's raw output, not a labelled DataFrame.
numeric_part = dataset[quan]
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
dataset = imp_mean.fit_transform(numeric_part)

In [None]:
# After the previous cell `dataset` is the imputer's transform output (a plain NumPy
# array under scikit-learn's default settings - confirm), so no column labels are shown.
dataset

In [None]:
# Convert the imputed values back into a labelled table (DataFrame).
# The imputer was fitted on dataset[quan], so its output carries one column per name in
# `quan` (assuming no column was entirely NaN, which SimpleImputer would drop by default).
# Label it with the full list: slicing with quan[1:] supplies one name too few and
# pandas rejects the shape mismatch.
dataset = pd.DataFrame(dataset, columns=quan)
dataset