In [None]:
# Working with Missing Data in Pandas
# Missing data occurs when no information is provided for one or more
# items, or for a whole unit. It is a very common problem in real-life scenarios.
# In pandas, missing data is also referred to as NA (Not Available) values.
# Many datasets simply arrive with missing data,
# either because the data exists but was not collected, or because it never existed.
# For example, some users being surveyed may choose not to share their income,
# and others may choose not to share their address; this is how values end up missing from a dataset.

# Dataset : https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv

In [None]:
import pandas as pd

In [None]:
# Read the UFO CSV directly from its GitHub URL into a DataFrame,
# then preview the first rows to see the column layout.
ufo = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/ufo.csv')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [None]:
# Summarise the columns: non-null count and dtype for each one.
ufo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18241 entries, 0 to 18240
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   City             18215 non-null  object
 1   Colors Reported  2882 non-null   object
 2   Shape Reported   15597 non-null  object
 3   State            18241 non-null  object
 4   Time             18241 non-null  object
dtypes: object(5)
memory usage: 712.7+ KB


In [None]:
# Count the missing (NaN) values in each column.
ufo.isnull().sum()

Unnamed: 0,0
City,26
Colors Reported,15359
Shape Reported,2644
State,0
Time,0


In [None]:
# (rows, columns) of the full frame, for comparison with the dropna() result.
ufo.shape

(18241, 5)

In [None]:
# Handle missing values by dropping every row that contains at least one NaN.
# This is only reasonable when few rows are affected (e.g. 50-60 rows).
# dropna() returns a new frame, so `ufo` itself is unchanged here; to make the
# drop permanent, reassign the result (ufo = ufo.dropna()) or pass inplace=True.
ufo.dropna().shape

(2486, 5)

In [None]:
# fillna(): replace NA/NaN values with the given value.
# Assign the result back to the column to make the change permanent.
# Calling fillna(..., inplace=True) on ufo['City'] operates on an intermediate
# object; pandas warns that this pattern will stop working in pandas 3.0.
ufo['City'] = ufo['City'].fillna(value='CityNotKnown')

In [None]:
# Count rows per city and show up to the 60 most frequent values.
ufo['City'].value_counts().head(60)

Unnamed: 0_level_0,count
City,Unnamed: 1_level_1
Seattle,187
New York City,161
Phoenix,137
Houston,108
Las Vegas,105
Portland,102
San Diego,101
Los Angeles,98
Chicago,73
Austin,62


In [None]:
# Count rows per reported colour. value_counts() leaves out missing values
# by default; pass dropna=False to include them in the counts.
ufo['Colors Reported'].value_counts()

Unnamed: 0_level_0,count
Colors Reported,Unnamed: 1_level_1
RED,780
GREEN,531
ORANGE,528
BLUE,450
YELLOW,169
RED GREEN,89
RED BLUE,78
RED ORANGE,44
GREEN BLUE,34
RED GREEN BLUE,33


In [None]:
# Replace missing colours with a placeholder label. The result is assigned
# back to the column because fillna(..., inplace=True) on a selected column
# acts on an intermediate object and is flagged by pandas as not working in 3.0.
ufo['Colors Reported'] = ufo['Colors Reported'].fillna(value='ColorsNotKnown')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufo['Colors Reported'].fillna(value='ColorsNotKnown',inplace = True)


In [None]:
# Replace missing shapes with a placeholder label. The result is assigned
# back to the column because fillna(..., inplace=True) on a selected column
# acts on an intermediate object and is flagged by pandas as not working in 3.0.
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='ShapeNotKnown')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ufo['Shape Reported'].fillna(value='ShapeNotKnown',inplace = True)


In [None]:
# Re-check the per-column missing-value counts after filling the columns.
ufo.isnull().sum()

Unnamed: 0,0
City,0
Colors Reported,0
Shape Reported,0
State,0
Time,0


In [None]:
# Load the small sample dataset (Id, Name, Marks, Percentage) from GitHub
# and display it in full; it has gaps in 'Marks' and 'Percentage'.
data = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv')
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# Count the missing values in each column of the sample frame.
data.isnull().sum()

Unnamed: 0,0
Id,0
Name,0
Marks,2
Percentage,4


In [None]:
# Forward fill: each missing value takes the last valid value above it.
# Series.ffill() replaces the deprecated fillna(method='ffill'), and the result
# is assigned back instead of using inplace=True on the selected column,
# which pandas warns will stop working in 3.0.
data['Marks'] = data['Marks'].ffill()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Marks'].fillna(method='ffill',inplace=True)
  data['Marks'].fillna(method='ffill',inplace=True)


In [None]:
# Inspect the frame after forward-filling 'Marks'.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,23.0,67.0
3,4,Alex,12.0,
4,5,Alex,12.0,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# Backward fill: each missing value takes the next valid value below it.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the result
# is assigned back instead of using inplace=True on the selected column,
# which pandas warns will stop working in 3.0.
data['Percentage'] = data['Percentage'].bfill()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Percentage'].fillna(method='bfill',inplace=True)
  data['Percentage'].fillna(method='bfill',inplace=True)


In [None]:
# Inspect the frame after backward-filling 'Percentage'.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,67.0
2,3,Alex,23.0,67.0
3,4,Alex,12.0,66.0
4,5,Alex,12.0,66.0
5,6,Alex,54.0,66.0
6,7,Alex,65.0,66.0


In [None]:
# Reload the sample dataset from the same URL so `data` again contains the
# original missing values before the next fill strategy is applied.
data = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv')
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# Fill the missing 'Percentage' values with the column mean, rounded to one
# decimal place. mean() ignores NaN, so it averages only the known values.
mean_val = round(data['Percentage'].mean(), 1)
# Assign back rather than using inplace=True on the selected column, which
# acts on an intermediate object and is flagged by pandas as not working in 3.0.
data['Percentage'] = data['Percentage'].fillna(value=mean_val)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Percentage'].fillna(value=mean_val,inplace=True)


In [None]:
# Fill the missing 'Marks' values with the mean of 'Marks', rounded to one
# decimal place. The rounding must be applied to the 'Marks' mean itself;
# rounding a mean computed from another column would fill 'Marks' with the
# wrong statistic.
mean_val2 = round(data['Marks'].mean(), 1)
# Assign back rather than using inplace=True on the selected column, which
# acts on an intermediate object and is flagged by pandas as not working in 3.0.
data['Marks'] = data['Marks'].fillna(value=mean_val2)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Marks'].fillna(value=mean_val2,inplace=True)


In [None]:
# Inspect the frame after the mean fills on 'Percentage' and 'Marks'.
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,70.3
2,3,Alex,70.3,67.0
3,4,Alex,12.0,70.3
4,5,Alex,70.3,70.3
5,6,Alex,54.0,70.3
6,7,Alex,65.0,66.0


In [None]:
# Reload the sample dataset from the same URL so `data` again contains the
# original missing values before interpolation is demonstrated.
data = pd.read_csv('https://raw.githubusercontent.com/yashy1626/ds_dataset/refs/heads/main/sample11.csv')
data

Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,
2,3,Alex,,67.0
3,4,Alex,12.0,
4,5,Alex,,
5,6,Alex,54.0,
6,7,Alex,65.0,66.0


In [None]:
# interpolate()
# The pandas interpolate() method fills NaN values in a DataFrame or Series
# using an interpolation technique instead of a hard-coded replacement value.
# Interpolation estimates unknown data points that lie between two known
# data points (the default method is linear).
#
# Only the numeric columns are interpolated: calling interpolate() on the whole
# frame also touches the object-dtype 'Name' column, which makes pandas emit a
# warning. `data` itself is left unchanged; the result lives in a copy.
data_interpolated = data.copy()
numeric_cols = data_interpolated.select_dtypes(include='number').columns
data_interpolated[numeric_cols] = data_interpolated[numeric_cols].interpolate()
data_interpolated

  data.interpolate()


Unnamed: 0,Id,Name,Marks,Percentage
0,1,Alex,78.0,78.0
1,2,Alex,23.0,72.5
2,3,Alex,17.5,67.0
3,4,Alex,12.0,66.75
4,5,Alex,33.0,66.5
5,6,Alex,54.0,66.25
6,7,Alex,65.0,66.0


In [None]:
# Dealing with duplicate data
# Build a small frame in which the ('Mark', 45) row appears twice.
data = dict(
    StudentName=['Mark', 'Ali', 'Bob', 'John', 'Johny', 'Mark'],
    Score=[45, 65, 76, 44, 39, 45],
)
df = pd.DataFrame(data)
df

Unnamed: 0,StudentName,Score
0,Mark,45
1,Ali,65
2,Bob,76
3,John,44
4,Johny,39
5,Mark,45


In [None]:
# Drop fully duplicated rows, keeping the first occurrence (the default).
# This returns a new frame for display; `df` is not modified by this call.
df.drop_duplicates()

Unnamed: 0,StudentName,Score
0,Mark,45
1,Ali,65
2,Bob,76
3,John,44
4,Johny,39


In [None]:
# Make the de-duplication permanent by rebinding `df` to the result
# (the first occurrence of each duplicated row is kept).
df = df.drop_duplicates()

In [None]:
# Inspect the frame after duplicates were dropped (first occurrence kept).
df

Unnamed: 0,StudentName,Score
0,Mark,45
1,Ali,65
2,Bob,76
3,John,44
4,Johny,39


In [None]:
# Recreate the original six-row frame (including the repeated 'Mark' row)
# so a different `keep` strategy can be demonstrated on fresh data.
data = dict([
    ('StudentName', ['Mark', 'Ali', 'Bob', 'John', 'Johny', 'Mark']),
    ('Score', [45, 65, 76, 44, 39, 45]),
])
df = pd.DataFrame(data)
df

Unnamed: 0,StudentName,Score
0,Mark,45
1,Ali,65
2,Bob,76
3,John,44
4,Johny,39
5,Mark,45


In [None]:
# Drop duplicated rows but keep the LAST occurrence of each one, rebinding
# `df` to the result so the change is permanent.
df = df.drop_duplicates(keep='last')

In [None]:
# Inspect the frame after duplicates were dropped with keep='last'.
df

Unnamed: 0,StudentName,Score
1,Ali,65
2,Bob,76
3,John,44
4,Johny,39
5,Mark,45
