In [1]:
import pandas as pd

In [2]:
# Load the processed bike-sharing CSV and show a preview as the cell output
source_csv = '../data/processed/bike_sharing_cleaned.csv'
df = pd.read_csv(source_csv)
df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1.0,2011-01-01,1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0000,3.0,13.0,16.0,702.0
1,2.0,2011-01-01,1.0,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.80,0.0000,8.0,32.0,40.0,831.0
2,3.0,2011-01-01,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.80,0.0000,5.0,27.0,32.0,175.0
3,4.0,2011-01-01,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0000,3.0,10.0,13.0,581.0
4,5.0,2011-01-01,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0000,0.0,1.0,1.0,659.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17721,13401.0,2012-07-17,3.0,1.0,7.0,13.0,0.0,2.0,1.0,1.0,0.92,0.8182,0.31,0.1940,53.0,168.0,221.0,
17722,17216.0,2012-12-25,1.0,1.0,12.0,4.0,1.0,2.0,0.0,2.0,0.24,0.2576,0.87,0.0896,0.0,1.0,1.0,27.0
17723,9341.0,2012-01-30,1.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.24,0.2121,0.48,0.3582,1.0,6.0,7.0,
17724,6268.0,2011-09-23,4.0,0.0,9.0,17.0,0.0,5.0,1.0,2.0,0.60,0.5000,1.00,0.0000,13.0,86.0,,602.0


In [3]:
# Delete mixed_type_col.
# errors='ignore' keeps this cell re-runnable: the notebook writes its result back to
# the same CSV it loads, so after one full run the column is already absent and a
# plain drop would raise KeyError.
df = df.drop(columns='mixed_type_col', errors='ignore')

In [4]:
# Count missing values in casual, registered, and cnt with a single pass
ride_count_cols = ['casual', 'registered', 'cnt']
missing_by_col = df[ride_count_cols].isna().sum()

casual_nan = missing_by_col['casual']
registered_nan = missing_by_col['registered']
cnt_nan = missing_by_col['cnt']

# Denominator for every percentage reported below
total_rows = len(df)

print(f"NaN values in 'casual': {casual_nan} ({casual_nan/total_rows*100:.2f}%)")
print(f"NaN values in 'registered': {registered_nan} ({registered_nan/total_rows*100:.2f}%)")
print(f"NaN values in 'cnt': {cnt_nan} ({cnt_nan/total_rows*100:.2f}%)")

# Combined number of missing cells across the three columns
total_nan = casual_nan + registered_nan + cnt_nan
print(f"\nTotal NaN values in casual, registered, and cnt: {total_nan} ({total_nan/total_rows*100:.2f}%)")

NaN values in 'casual': 272 (1.53%)
NaN values in 'registered': 256 (1.44%)
NaN values in 'cnt': 247 (1.39%)

Total NaN values in casual, registered, and cnt: 775 (4.37%)


In [5]:
# Rebuild missing counts from the identity: casual + registered = cnt.
# The three fills run in this order so each one can use values restored by the previous one.

# cnt missing -> sum of the two components (stays NaN if either component is NaN)
components_sum = df['casual'] + df['registered']
df['cnt'] = df['cnt'].fillna(components_sum)

# casual missing -> what remains of cnt after removing registered
cnt_minus_registered = df['cnt'] - df['registered']
df['casual'] = df['casual'].fillna(cnt_minus_registered)

# registered missing -> what remains of cnt after removing casual
cnt_minus_casual = df['cnt'] - df['casual']
df['registered'] = df['registered'].fillna(cnt_minus_casual)

# Re-count the gaps that could not be reconstructed
remaining_nan = df[['casual', 'registered', 'cnt']].isna().sum()
casual_nan_after = remaining_nan['casual']
registered_nan_after = remaining_nan['registered']
cnt_nan_after = remaining_nan['cnt']

print("After filling missing values:")
print(f"NaN values in 'casual': {casual_nan_after} ({casual_nan_after/total_rows*100:.2f}%)")
print(f"NaN values in 'registered': {registered_nan_after} ({registered_nan_after/total_rows*100:.2f}%)")
print(f"NaN values in 'cnt': {cnt_nan_after} ({cnt_nan_after/total_rows*100:.2f}%)")

total_nan_after = casual_nan_after + registered_nan_after + cnt_nan_after
print(f"\nTotal NaN values after filling: {total_nan_after} ({total_nan_after/total_rows*100:.2f}%)")

After filling missing values:
NaN values in 'casual': 12 (0.07%)
NaN values in 'registered': 10 (0.06%)
NaN values in 'cnt': 8 (0.05%)

Total NaN values after filling: 30 (0.17%)


In [6]:
# Discard every row that still lacks a value in casual, registered, or cnt
required_cols = ['casual', 'registered', 'cnt']

rows_before = len(df)
df = df.dropna(axis=0, subset=required_cols)
rows_after = len(df)
rows_deleted = rows_before - rows_after

print(f"Rows before deletion: {rows_before}")
print(f"Rows after deletion: {rows_after}")
print(f"Rows deleted: {rows_deleted} ({rows_deleted/rows_before*100:.2f}%)")

Rows before deletion: 17726
Rows after deletion: 17711
Rows deleted: 15 (0.08%)


In [7]:
# Flag rows that violate casual + registered = cnt.
# The comparison allows a small tolerance because the columns are stored as floats.
tolerance = 0.01
sum_minus_cnt = df['casual'] + df['registered'] - df['cnt']
inconsistent_mask = sum_minus_cnt.abs() > tolerance

inconsistent_count = inconsistent_mask.sum()
print(f"Number of rows where casual + registered != cnt: {inconsistent_count}")
print(f"Percentage: {inconsistent_count/len(df)*100:.2f}%")

if inconsistent_count == 0:
    print("\nAll rows are consistent: casual + registered = cnt")
else:
    print("\n" + "="*80)
    print("Rows where casual + registered != cnt:")
    print("="*80)

    # Work on a separate frame that carries the recomputed sum and its gap to cnt
    inconsistent_df = df[inconsistent_mask].assign(
        calculated_sum=lambda rows: rows['casual'] + rows['registered'],
        difference=lambda rows: rows['calculated_sum'] - rows['cnt'],
    )

    # Display relevant columns
    display_cols = ['instant', 'dteday', 'casual', 'registered', 'cnt', 'calculated_sum', 'difference']
    display(inconsistent_df[display_cols])

Number of rows where casual + registered != cnt: 477
Percentage: 2.69%

Rows where casual + registered != cnt:


Unnamed: 0,instant,dteday,casual,registered,cnt,calculated_sum,difference
16,17.0,2011-01-01,41.0,1041.0,93.0,1082.0,989.0
23,24.0,2011-01-01,15.0,24.0,415.0,39.0,-376.0
39,40.0,2011-01-02,108.0,67.0,76.0,175.0,99.0
78,79.0,2011-01-04,55.0,37.0,42.0,92.0,50.0
100,101.0,2011-01-05,6.0,109.0,5405.0,115.0,-5290.0
...,...,...,...,...,...,...,...
17624,9861.0,2012-02-20,9.0,513.0,126.0,522.0,396.0
17657,5193.0,2011-08-09,6.0,95.0,2424.0,101.0,-2323.0
17671,15536.0,2012-10-14,8600.0,362.0,562.0,8962.0,8400.0
17708,4725.0,2011-07-20,1023.0,429.0,497.0,1452.0,955.0


In [8]:
# Function to detect outliers using IQR method
def detect_outliers(df, column_name, iqr_multiplier=1.5):
    """
    Detect outliers in a column using the IQR method and print a short summary.

    A value is flagged when it lies strictly below Q1 - iqr_multiplier * IQR or
    strictly above Q3 + iqr_multiplier * IQR, where Q1/Q3 are the 25th/75th
    percentiles of the column and IQR = Q3 - Q1.

    Parameters:
    - df: DataFrame
    - column_name: Column to analyze
    - iqr_multiplier: Multiplier for IQR (default=1.5)

    Returns:
    - Index holding the labels of the outlier rows (empty if none are found,
      including when df has no rows)
    """
    values = df[column_name]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    outlier_mask = (values < lower_bound) | (values > upper_bound)
    outlier_indices = df[outlier_mask].index

    # Guard the percentage: an empty frame would otherwise raise ZeroDivisionError
    total_rows = len(df)
    outlier_pct = len(outlier_indices) / total_rows * 100 if total_rows else 0.0

    print(f"\nColumn: {column_name} (IQR multiplier: {iqr_multiplier})")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Number of outliers: {len(outlier_indices)} ({outlier_pct:.2f}%)")

    return outlier_indices


# Threshold shared by all three detection runs below
IQR_MULTIPLIER = 2.5

section_rule = "="*80
print(section_rule)
print(f"OUTLIER DETECTION (IQR Multiplier: {IQR_MULTIPLIER})")
print(section_rule)

# Run the detector once per count column, in the order casual -> registered -> cnt
casual_outliers, registered_outliers, cnt_outliers = (
    detect_outliers(df, column, IQR_MULTIPLIER)
    for column in ('casual', 'registered', 'cnt')
)

OUTLIER DETECTION (IQR Multiplier: 2.5)

Column: casual (IQR multiplier: 2.5)
  Q1: 4.00, Q3: 50.00, IQR: 46.00
  Lower bound: -111.00, Upper bound: 165.00
  Number of outliers: 783 (4.42%)

Column: registered (IQR multiplier: 2.5)
  Q1: 35.00, Q3: 224.00, IQR: 189.00
  Lower bound: -437.50, Upper bound: 696.50
  Number of outliers: 297 (1.68%)

Column: cnt (IQR multiplier: 2.5)
  Q1: 41.00, Q3: 285.00, IQR: 244.00
  Lower bound: -569.00, Upper bound: 895.00
  Number of outliers: 130 (0.73%)


## Outlier Detection Strategy

I am using an IQR multiplier of **2.5** instead of the standard 1.5 so that only the more extreme values are flagged as outliers.

**Rationale:**
- Outliers in bike sharing data could be legitimate due to holidays, special events, or favorable weather conditions
- Using a higher threshold (2.5) helps avoid flagging legitimate high-demand periods as outliers
- This approach focuses on identifying truly anomalous data points that are likely data entry errors rather than natural variations in demand

In [9]:
# Identify rows where outliers appear in only one column
casual_flagged = set(casual_outliers)
registered_flagged = set(registered_outliers)
cnt_flagged = set(cnt_outliers)

casual_only = casual_flagged - registered_flagged - cnt_flagged
registered_only = registered_flagged - casual_flagged - cnt_flagged
cnt_only = cnt_flagged - casual_flagged - registered_flagged

print("\n" + "="*80)
print("OUTLIERS IN SINGLE COLUMNS")
print("="*80)
print(f"Outliers only in 'casual': {len(casual_only)}")
print(f"Outliers only in 'registered': {len(registered_only)}")
print(f"Outliers only in 'cnt': {len(cnt_only)}")

# Recalculate values for rows with outliers in only one column
print("\n" + "="*80)
print("RECALCULATING VALUES FOR SINGLE-COLUMN OUTLIERS")
print("="*80)

# A recalculated value is reported as "changed" only if it moved by more than this
CHANGE_TOLERANCE = 0.01

# (column to overwrite, rows flagged only in that column, replacement derived from the
# other two columns via casual + registered = cnt).
# The three row sets are mutually disjoint, so every repair reads columns that no other
# repair modifies for the same row; each one can therefore be applied as a single
# vectorised assignment instead of a per-row loop.
single_column_repairs = [
    ('casual', casual_only, lambda rows: rows['cnt'] - rows['registered']),
    ('registered', registered_only, lambda rows: rows['cnt'] - rows['casual']),
    ('cnt', cnt_only, lambda rows: rows['casual'] + rows['registered']),
]

values_changed = 0
for column, flagged_rows, recompute in single_column_repairs:
    if not flagged_rows:
        continue

    # Sorted labels give a deterministic log order (set iteration order is arbitrary)
    row_labels = sorted(flagged_rows)
    old_values = df.loc[row_labels, column].copy()
    new_values = recompute(df.loc[row_labels])
    df.loc[row_labels, column] = new_values

    changed = (old_values - new_values).abs() > CHANGE_TOLERANCE
    values_changed += int(changed.sum())

    changed_old = old_values[changed]
    changed_new = new_values[changed]
    for idx, old, new in zip(changed_old.index, changed_old, changed_new):
        print(f"Row {idx}: {column} changed from {old:.2f} to {new:.2f}")

# The first total counts every flagged row; rows that already satisfied the identity are
# recalculated to the same value, so the number of real modifications is reported too.
print(f"\nTotal values recalculated: {len(casual_only) + len(registered_only) + len(cnt_only)}")
print(f"Values actually changed: {values_changed}")


OUTLIERS IN SINGLE COLUMNS
Outliers only in 'casual': 759
Outliers only in 'registered': 255
Outliers only in 'cnt': 94

RECALCULATING VALUES FOR SINGLE-COLUMN OUTLIERS
Row 4096: casual changed from 3953.00 to 67.00
Row 16386: casual changed from 5950.00 to 70.00
Row 10253: casual changed from 272.00 to 8.00
Row 6166: casual changed from 4437.00 to 51.00
Row 6168: casual changed from 4185.00 to 45.00
Row 12319: casual changed from 1025.00 to 250.00
Row 8235: casual changed from 504.00 to 9.00
Row 2093: casual changed from 546.00 to 1.00
Row 6210: casual changed from 287.00 to 18.00
Row 10316: casual changed from 272.00 to 19.00
Row 16464: casual changed from 1150.00 to 50.00
Row 10326: casual changed from 539.00 to 20.00
Row 14433: casual changed from 6873.00 to 87.00
Row 112: casual changed from 264.00 to 3.00
Row 130: casual changed from 528.00 to 12.00
Row 14487: casual changed from 450.00 to 102.00
Row 8362: casual changed from 615.00 to 1.00
Row 6318: casual changed from 406.00 t

In [10]:
# Collect every row flagged in at least two of the three count columns
all_indices = set(casual_outliers) | set(registered_outliers) | set(cnt_outliers)

# Column name paired with the outlier labels found for it, in reporting order
flagged_by_column = [
    ('casual', casual_outliers),
    ('registered', registered_outliers),
    ('cnt', cnt_outliers),
]

multi_outlier_rows = []
for idx in all_indices:
    outlier_cols = [name for name, flagged in flagged_by_column if idx in flagged]
    if len(outlier_cols) >= 2:
        multi_outlier_rows.append((idx, len(outlier_cols), outlier_cols))

print("="*80)
print("ROWS WITH OUTLIERS IN 2 OR MORE COLUMNS")
print("="*80)

if not multi_outlier_rows:
    print("No rows found with outliers in 2 or more columns.")
else:
    # Rows flagged in the most columns come first
    multi_outlier_rows.sort(key=lambda entry: entry[1], reverse=True)

    flagged_counts = [count for _, count, _ in multi_outlier_rows]
    outliers_in_3 = flagged_counts.count(3)
    outliers_in_2 = flagged_counts.count(2)
    total_multi_outliers = outliers_in_3 + outliers_in_2

    print(f"Total rows with outliers in 2 or more columns: {total_multi_outliers} ({total_multi_outliers/len(df)*100:.2f}%)")
    print("\nBreakdown:")
    print(f"  Outliers in all 3 columns: {outliers_in_3} ({outliers_in_3/len(df)*100:.2f}%)")
    print(f"  Outliers in exactly 2 columns: {outliers_in_2} ({outliers_in_2/len(df)*100:.2f}%)")

    # Show the actual rows
    print("\n" + "="*80)
    print("DETAILED VIEW OF ROWS WITH MULTIPLE OUTLIERS")
    print("="*80)

    multi_outlier_indices = [entry[0] for entry in multi_outlier_rows]
    result_df = df.loc[multi_outlier_indices].assign(
        outlier_columns=[', '.join(entry[2]) for entry in multi_outlier_rows],
        outlier_count=flagged_counts,
    )

    # Display relevant columns
    display_cols = ['instant', 'dteday', 'casual', 'registered', 'cnt', 'outlier_count', 'outlier_columns']
    display(result_df[display_cols].head())

ROWS WITH OUTLIERS IN 2 OR MORE COLUMNS
Total rows with outliers in 2 or more columns: 50 (0.28%)

Breakdown:
  Outliers in all 3 columns: 2 (0.01%)
  Outliers in exactly 2 columns: 48 (0.27%)

DETAILED VIEW OF ROWS WITH MULTIPLE OUTLIERS


Unnamed: 0,instant,dteday,casual,registered,cnt,outlier_count,outlier_columns
14748,14749.0,2012-09-11,168.0,802.0,970.0,3,"casual, registered, cnt"
15780,15781.0,2012-10-24,1065.0,876.0,963.0,3,"casual, registered, cnt"
12323,12324.0,2012-06-02,275.0,842.0,644.0,2,"casual, registered"
4188,4189.0,2011-06-28,1845.0,112.0,1957.0,2,"casual, cnt"
12487,12488.0,2012-06-09,196.0,1018.0,531.0,2,"casual, registered"


In [11]:
# Remove every row that was flagged as an outlier in two or more columns

# Row labels are the first element of each (label, count, columns) entry
multi_outlier_indices = [row_label for row_label, *_ in multi_outlier_rows]

rows_before_delete = len(df)
df = df.drop(multi_outlier_indices, axis=0)
rows_after_delete = len(df)
rows_deleted_outliers = rows_before_delete - rows_after_delete

print("="*80)
print("DELETING ROWS WITH OUTLIERS IN 2 OR MORE COLUMNS")
print("="*80)
print(f"Rows before deletion: {rows_before_delete}")
print(f"Rows after deletion: {rows_after_delete}")
print(f"Rows deleted: {rows_deleted_outliers} ({rows_deleted_outliers/rows_before_delete*100:.2f}%)")
print(f"\nRemaining rows: {rows_after_delete}")

DELETING ROWS WITH OUTLIERS IN 2 OR MORE COLUMNS
Rows before deletion: 17711
Rows after deletion: 17661
Rows deleted: 50 (0.28%)

Remaining rows: 17661


In [12]:
# Persist the cleaned frame without the index column.
# NOTE(review): this is the same path the notebook loads from, so the input file is overwritten.
cleaned_csv_path = "../data/processed/bike_sharing_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)
print("Data saved")

Data saved
