In [1]:
import pandas as pd
import numpy as np

In [2]:
# read datasdet using pandas
df = pd.read_csv('/content/employees.csv')
df.head(5)

Unnamed: 0,First Name,Gender,Start Date,Last Login Time,Salary,Bonus %,Senior Management,Team
0,Douglas,Male,8/6/1993,12:42 PM,97308,6.945,True,Marketing
1,Thomas,Male,3/31/1996,6:53 AM,61933,4.17,True,
2,Maria,Female,4/23/1993,11:17 AM,130590,11.858,False,Finance
3,Jerry,Male,3/4/2005,1:00 PM,138705,9.34,True,Finance
4,Larry,Male,1/24/1998,4:47 PM,101004,1.389,True,Client Services


In [3]:
df.shape

(1000, 8)

In [4]:
df.describe()

Unnamed: 0,Salary,Bonus %
count,1000.0,1000.0
mean,90662.181,10.207555
std,32923.693342,5.528481
min,35013.0,1.015
25%,62613.0,5.40175
50%,90428.0,9.8385
75%,118740.25,14.838
max,149908.0,19.944


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   First Name         933 non-null    object 
 1   Gender             855 non-null    object 
 2   Start Date         1000 non-null   object 
 3   Last Login Time    1000 non-null   object 
 4   Salary             1000 non-null   int64  
 5   Bonus %            1000 non-null   float64
 6   Senior Management  933 non-null    object 
 7   Team               957 non-null    object 
dtypes: float64(1), int64(1), object(6)
memory usage: 62.6+ KB


In [6]:
df.nunique()

Unnamed: 0,0
First Name,200
Gender,2
Start Date,972
Last Login Time,720
Salary,995
Bonus %,971
Senior Management,2
Team,10


In [7]:
df.isnull().sum()

Unnamed: 0,0
First Name,67
Gender,145
Start Date,0
Last Login Time,0
Salary,0
Bonus %,0
Senior Management,67
Team,43


####  let’s try to fill in the missing values of gender with the string “No Gender”.

In [8]:
df['Gender'].fillna("No_gender", inplace=True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna("No_gender", inplace=True)


Unnamed: 0,0
First Name,67
Gender,0
Start Date,0
Last Login Time,0
Salary,0
Bonus %,0
Senior Management,67
Team,43


#### We can see that now there is no null value for the gender column. Now, Let’s fill the senior management with the mode value.

In [9]:
mode = df['Senior Management'].mode().values[0]
df['Senior Management']= df['Senior Management'].replace(np.nan, mode)
df.isnull().sum()

  df['Senior Management']= df['Senior Management'].replace(np.nan, mode)


Unnamed: 0,0
First Name,67
Gender,0
Start Date,0
Last Login Time,0
Salary,0
Bonus %,0
Senior Management,0
Team,43


#### Now for the first name and team, we cannot fill the missing values with arbitrary data, so, let’s drop all the rows containing these missing values.

In [10]:
df = df.dropna(axis = 0, how ='any')
print(df.isnull().sum())

First Name           0
Gender               0
Start Date           0
Last Login Time      0
Salary               0
Bonus %              0
Senior Management    0
Team                 0
dtype: int64


In [11]:
df.shape

(899, 8)

#### We can see that our dataset is now free of all the missing values and after dropping the data the number of rows also reduced from 1000 to 899.
  