In [21]:
import pandas as pd
import numpy as np

dates = pd.date_range("2025-01-01", periods=10, freq="d")
tickers = ['NVDA','AAPL','MSFT']
df = pd.MultiIndex.from_product([dates,tickers], names=["date", "ticker"]).to_frame()
df['close'] = df.apply(lambda r : np.random.randint(50,500), axis=1)

df.reset_index(drop=True,inplace=True)
df.loc[(df['close'] < 75) | (df['close'] > 400),'close'] = np.nan
##################  Inspecting + Diagnosing ##################

df.info()
df.head()
df.tail()
df.shape
df.describe()
df.columns
df.columns.tolist()
df.isnull().sum()
print(df['close'].size)
df.count()

##################  Handling Missing Data ##################
df2 = df.dropna()
print(df2['close'].size)
df3 = df.interpolate()
df3 = df.ffill()
df3 = df.bfill()
df3 = df.interpolate()
df3 = df3.fillna(0)
#Always check .isna().sum() before and after any fill/drop .that’s your sanity check.
df3.isna().sum()

##################  Dropping / Renaming / Reindexing ##################
df2 = df.drop(['close'], axis=1)
df2.columns
df.drop(index=[0])
df.reset_index(drop=True)
df.set_index('ticker')
df.rename(columns={'ticker':'symbol'})

##################  Detecting Duplicates & Inconsistencies ##################
print(df[df.duplicated()])
df.drop_duplicates(subset=['close'])
df['close'].unique()
df['ticker'].value_counts(normalize=True)

##################  Type Conversion & Normalization ##################

df['close'] = df['close'].astype(float)
df['date'] = pd.to_datetime(df['date'])
df['ticker'] = df['ticker'].astype('category')
df['name'] = df['ticker']
df['name'] = df['name'].str.strip().str.lower()
df['name']= df['name'].replace({'aapl': 'Apple'})
print(df)

##################  Index alignment + sanity checks ##################

df.index.isnull().any()
df.reindex(sorted(df.index))
df.sort_index()

#################### Minimal Data Cleaning Routine (for every dataset)
df.columns = df.columns.str.strip().str.lower()
df = df.drop_duplicates()
df = df.dropna(how='all') 
df = df.reset_index(drop=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    30 non-null     datetime64[ns]
 1   ticker  30 non-null     object        
 2   close   26 non-null     float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 852.0+ bytes
30
26
Empty DataFrame
Columns: [date, ticker, close]
Index: []
         date ticker  close   name
0  2025-01-01   NVDA   76.0   nvda
1  2025-01-01   AAPL  351.0  Apple
2  2025-01-01   MSFT   75.0   msft
3  2025-01-02   NVDA  307.0   nvda
4  2025-01-02   AAPL    NaN  Apple
5  2025-01-02   MSFT  269.0   msft
6  2025-01-03   NVDA  332.0   nvda
7  2025-01-03   AAPL  278.0  Apple
8  2025-01-03   MSFT   76.0   msft
9  2025-01-04   NVDA    NaN   nvda
10 2025-01-04   AAPL    NaN  Apple
11 2025-01-04   MSFT  176.0   msft
12 2025-01-05   NVDA  173.0   nvda
13 2025-01-05   AAPL  193.0  Apple
14 2025-01-05 

Unnamed: 0,date,ticker,close,name
0,2025-01-01,NVDA,76.0,nvda
1,2025-01-01,AAPL,351.0,Apple
2,2025-01-01,MSFT,75.0,msft
3,2025-01-02,NVDA,307.0,nvda
4,2025-01-02,AAPL,,Apple
5,2025-01-02,MSFT,269.0,msft
6,2025-01-03,NVDA,332.0,nvda
7,2025-01-03,AAPL,278.0,Apple
8,2025-01-03,MSFT,76.0,msft
9,2025-01-04,NVDA,,nvda
