In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("claimants sample.csv")
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,,0.35,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   10 non-null     int64  
 1   CLMSEX    10 non-null     int64  
 2   CLMINSUR  8 non-null      float64
 3   SEATBELT  10 non-null     int64  
 4   CLMAGE    9 non-null      float64
 5   LOSS      9 non-null      float64
 6   ATTORNEY  10 non-null     int64  
dtypes: float64(3), int64(4)
memory usage: 688.0 bytes


# Checking for Missing Values

In [4]:
df.isnull().sum()

CASENUM     0
CLMSEX      0
CLMINSUR    2
SEATBELT    0
CLMAGE      1
LOSS        1
ATTORNEY    0
dtype: int64

###### So from above clminsur have two missing values clmage one and loss.

#### Note: 
- Empty cells can potentially give you a wrong result while analyzing the data. 

### Dealing with missing values.

#### +++++---------------------+++++

### Option 1. Remove the rows that contain missing values.

Note:: The method should only be considered when. 
- The drop data is less than 5% of over all original data.
- Because droping is easy compare to collecting the data.

##### In the below example we dropped 4 records out of 10, so its 40% of data is dropped and its not good approch. 

In [5]:
df1 = df.dropna()
df1

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
5,97,1,1.0,0,35.0,0.309,0
8,51,1,1.0,0,60.0,0.874,1


#### +++++---------------------+++++
### Option 2:: Replace the nan values.
- Mean
- Median
- Mode
- Fill with some value (appropriate).
- - Continous Variables --> AGE,LOSS --> Replace with either mean or median.
  - Mean when there are no outliers, median when there are outliers. 
  - Discrete Variables --> INSUR --> Mode is used for discrete data.
 
#### 1. We can replace the missing values by using fillna() within pandas.

In [9]:
 # replacing the age column with mean(Check outliers first, for this column there are no outliers)
df["CLMAGE"].fillna(df["CLMAGE"].mean(),inplace=True) #inplace = True will replace in original data. 
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [10]:
# replacing the loss column with median. 
df["LOSS"].fillna(df["LOSS"].median(),inplace = True)
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,0.874,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [14]:
# replacing the insur colum with mode. 
df["CLMINSUR"].fillna(df["CLMINSUR"].mode()[0],inplace=True)
df

#here we write mode()[0] because there can be more than one mode. 

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,0.874,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,1.0,0,9.0,3.538,0
7,36,1,1.0,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   10 non-null     int64  
 1   CLMSEX    10 non-null     int64  
 2   CLMINSUR  10 non-null     float64
 3   SEATBELT  10 non-null     int64  
 4   CLMAGE    10 non-null     float64
 5   LOSS      10 non-null     float64
 6   ATTORNEY  10 non-null     int64  
dtypes: float64(3), int64(4)
memory usage: 688.0 bytes


#### 2. We can replace the missing values by using SimpleImputer() within Sklearn

In [16]:
df = pd.read_csv("claimants sample.csv")
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,,0.35,1


In [17]:
from sklearn.impute import SimpleImputer

In [19]:
mean_imputer = SimpleImputer(strategy = "mean")
df["CLMAGE"] = mean_imputer.fit_transform(df[["CLMAGE"]])
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [20]:
median_imputer = SimpleImputer(strategy = "median")
df["LOSS"] = mean_imputer.fit_transform(df[["LOSS"]])
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,5.127778,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,,0,9.0,3.538,0
7,36,1,,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [21]:
mode_imputer = SimpleImputer(strategy = "mode")
df["CLMINSUR"] = mean_imputer.fit_transform(df[["CLMINSUR"]])
df

Unnamed: 0,CASENUM,CLMSEX,CLMINSUR,SEATBELT,CLMAGE,LOSS,ATTORNEY
0,5,0,1.0,0,50.0,34.94,0
1,3,1,0.0,0,18.0,0.891,1
2,66,0,1.0,0,5.0,0.33,1
3,70,1,1.0,1,31.0,0.037,0
4,96,0,1.0,0,30.0,5.127778,1
5,97,1,1.0,0,35.0,0.309,0
6,10,0,0.875,0,9.0,3.538,0
7,36,1,0.875,0,34.0,4.881,0
8,51,1,1.0,0,60.0,0.874,1
9,55,1,1.0,0,30.222222,0.35,1


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   CASENUM   10 non-null     int64  
 1   CLMSEX    10 non-null     int64  
 2   CLMINSUR  10 non-null     float64
 3   SEATBELT  10 non-null     int64  
 4   CLMAGE    10 non-null     float64
 5   LOSS      10 non-null     float64
 6   ATTORNEY  10 non-null     int64  
dtypes: float64(3), int64(4)
memory usage: 688.0 bytes
