# Create a simple dataset with intentional missing values

In [1]:
import pandas as pd
import numpy as np

# Toy table: three numeric columns, each with exactly one gap.
# pandas stores the None entries as NaN, which makes every column float.
data = {
    'A': [1, 2, None, 4],
    'B': [None, 2, 3, 4],
    'C': [1, 2, None, 4],
}
df = pd.DataFrame.from_dict(data)
print(df)

     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  2.0
2  NaN  3.0  NaN
3  4.0  4.0  4.0


# Checking for Missing Data

In [2]:
# Count the NaN entries in each column (isna is the same check as isnull)
missing_per_column = df.isna().sum()
print(missing_per_column)

A    1
B    1
C    1
dtype: int64


# Filling Missing Data

In [3]:
# Impute every gap with the mean of the observed values in its own column
column_means = df.mean()
df_filled = df.fillna(value=column_means)
print(df_filled)

          A    B         C
0  1.000000  3.0  1.000000
1  2.000000  2.0  2.000000
2  2.333333  3.0  2.333333
3  4.000000  4.0  4.000000


# Dropping Missing Data

In [4]:
# Keep only the rows in which every column has a value;
# df itself is left untouched because dropna returns a new frame.
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)

     A    B    C
1  2.0  2.0  2.0
3  4.0  4.0  4.0


# Identifying Outliers Using Z-Score

In [5]:
from scipy import stats

# Small sample: five values between 10 and 13 plus one extreme value (120)
data = [10, 12, 12, 13, 12, 120]

# Standardize each point as (x - mean) / std, using the population std (ddof=0)
z_scores = stats.zscore(data, axis=0, ddof=0)
print(z_scores)

[-0.49173134 -0.44214499 -0.44214499 -0.41735181 -0.44214499  2.2355181 ]


# Identifying Outliers Using IQR

In [6]:
# First and third quartiles of the sample; the IQR is the distance between them
Q1, Q3 = np.percentile(data, [25, 75])
IQR = Q3 - Q1

# Tukey fences: values more than 1.5 * IQR beyond a quartile count as outliers
fence_width = 1.5 * IQR
lower_bound = Q1 - fence_width
upper_bound = Q3 + fence_width
print(lower_bound, upper_bound)

10.875 13.875


# Removing Outliers Using Z-Score

In [7]:
# With only six points, the largest |z| any value can reach under the population
# std (the stats.zscore default) is sqrt(n - 1) = sqrt(5), about 2.236. The
# conventional cutoff of 3 therefore can never flag anything in this sample,
# so a cutoff of 2 is used for this tiny dataset instead.
Z_THRESHOLD = 2

# Keep only the values whose z-score magnitude is below the cutoff
filtered_data = [d for d, z in zip(data, z_scores) if np.abs(z) < Z_THRESHOLD]
print(filtered_data)

[10, 12, 12, 13, 12]


# Real-World Example: Financial Data

In [8]:
import yfinance as yf
import pandas as pd
from scipy import stats

# Step 1: Download stock data for Tesla (TSLA)
# NOTE(review): the yfinance notice in the cell output says auto_adjust now
# defaults to True, so 'Close' is presumably the adjusted close — confirm for
# the installed yfinance version.
stock_data = yf.download('TSLA', start="2020-01-01", end="2024-01-01")

# Step 2: Handle missing data by forward-filling previous values.
# Reassign rather than mutate in place so the step reads as an explicit transformation.
stock_data = stock_data.ffill()

# Step 3: Calculate daily returns (the first row has no previous close, so it is NaN)
stock_data['Returns'] = stock_data['Close'].pct_change()

# Step 4: Compute z-scores of returns, omitting NaNs explicitly
stock_data['z_score'] = stats.zscore(stock_data['Returns'], nan_policy='omit')

# Step 5: Remove outliers (returns with |z_score| >= 3 are extreme outliers).
# A NaN z-score also fails the `< 3` comparison, so the first row (NaN return)
# is dropped here as well, not only the extreme days.
clean_data = stock_data[stock_data['z_score'].abs() < 3]

print(clean_data.head())
print(clean_data.tail())



YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Price           Close       High        Low       Open     Volume   Returns  \
Ticker           TSLA       TSLA       TSLA       TSLA       TSLA             
Date                                                                          
2020-01-03  29.534000  30.266666  29.128000  29.366667  266677500  0.029633   
2020-01-06  30.102667  30.104000  29.333332  29.364668  151995000  0.019255   
2020-01-07  31.270666  31.441999  30.224001  30.760000  268231500  0.038801   
2020-01-08  32.809334  33.232666  31.215334  31.580000  467164500  0.049205   
2020-01-09  32.089333  33.253334  31.524668  33.139999  426606000 -0.021945   

Price        z_score  
Ticker                
Date                  
2020-01-03  0.619480  
2020-01-06  0.377443  
2020-01-07  0.833267  
2020-01-08  1.075902  
2020-01-09 -0.583362  
Price            Close        High         Low        Open     Volume  \
Ticker            TSLA        TSLA        TSLA        TSLA       TSLA   
Date                                 


