In [1]:
import pandas as pd

In [2]:
# Load the bike-sharing table that the rest of the notebook cleans.
# NOTE(review): the displayed frame carries an 'Unnamed: 0' column - presumably an
# index written by an earlier to_csv call; confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer='../data/processed/bike_sharing_cleaned.csv')
df

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,mixed_type_col
0,1.0,2011-01-01,1.0,0.0,1.0,0.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.81,0.0000,3.0,13.0,16.0,702.0
1,2.0,2011-01-01,1.0,0.0,1.0,1.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.80,0.0000,8.0,32.0,40.0,831.0
2,3.0,2011-01-01,1.0,0.0,1.0,2.0,0.0,6.0,0.0,1.0,0.22,0.2727,0.80,0.0000,5.0,27.0,32.0,175.0
3,4.0,2011-01-01,1.0,0.0,1.0,3.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0000,3.0,10.0,13.0,581.0
4,5.0,2011-01-01,1.0,0.0,1.0,4.0,0.0,6.0,0.0,1.0,0.24,0.2879,0.75,0.0000,0.0,1.0,1.0,659.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17721,13401.0,2012-07-17,3.0,1.0,7.0,13.0,0.0,2.0,1.0,1.0,0.92,0.8182,0.31,0.1940,53.0,168.0,221.0,
17722,17216.0,2012-12-25,1.0,1.0,12.0,4.0,1.0,2.0,0.0,2.0,0.24,0.2576,0.87,0.0896,0.0,1.0,1.0,27.0
17723,9341.0,2012-01-30,1.0,1.0,1.0,2.0,0.0,1.0,1.0,1.0,0.24,0.2121,0.48,0.3582,1.0,6.0,7.0,
17724,6268.0,2011-09-23,4.0,0.0,9.0,17.0,0.0,5.0,1.0,2.0,0.60,0.5000,1.00,0.0000,13.0,86.0,,602.0


In [3]:
# Delete mixed_type_col.
# errors='ignore' keeps this cell re-runnable: the notebook saves its result back to
# the same CSV it loads, so on a second run the column is already gone and a plain
# drop would raise KeyError.
df = df.drop(columns=['mixed_type_col'], errors='ignore')

In [4]:
# Round values to integers
separator = "=" * 80
print(separator)
print("ROUNDING VALUES TO INTEGERS")
print(separator)

# The nullable 'Int64' dtype is used so that missing entries survive the cast.
for count_col in ('casual', 'registered', 'cnt'):
    df[count_col] = df[count_col].round().astype('Int64')

print("All values in casual, registered, and cnt have been rounded to integers")

ROUNDING VALUES TO INTEGERS
All values in casual, registered, and cnt have been rounded to integers


In [5]:
# Use the absolute value to avoid negative values
separator = "=" * 80
print(separator)
print("USING THE ABSOLUTE VALUE TO AVOID NEGATIVE VALUES")
print(separator)

# Each column is replaced by its element-wise magnitude.
for count_col in ('cnt', 'casual', 'registered'):
    df[count_col] = df[count_col].abs()

print("All values in casual, registered, and cnt have been converted to absolute values")

USING THE ABSOLUTE VALUE TO AVOID NEGATIVE VALUES
All values in casual, registered, and cnt have been converted to absolute values


In [6]:
# Function to detect outliers using IQR method
def detect_outliers(df, column_name, iqr_multiplier=1.5):
    """
    Detect outliers in a column using the IQR method.

    A value counts as an outlier when it lies strictly below
    Q1 - iqr_multiplier * IQR or strictly above Q3 + iqr_multiplier * IQR.
    Missing values are never flagged. A short report (quartiles, bounds and
    outlier count) is printed as a side effect.

    Parameters:
    - df: DataFrame
    - column_name: Column to analyze
    - iqr_multiplier: Multiplier for IQR (default=1.5)

    Returns:
    - Index with the labels of the outlier rows (empty when nothing is flagged,
      including when df has no rows).
    """
    values = df[column_name]

    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    # Comparisons on nullable dtypes (e.g. Int64) yield <NA> for missing entries;
    # turn those into an explicit False so the mask is a plain boolean array.
    outlier_mask = ((values < lower_bound) | (values > upper_bound)).fillna(False).astype(bool)
    # Select labels straight from the index instead of copying the whole frame.
    outlier_indices = df.index[outlier_mask.to_numpy()]

    # Guard the percentage against an empty frame (would divide by zero).
    total_rows = len(df)
    outlier_share = len(outlier_indices) / total_rows * 100 if total_rows else 0.0

    print(f"\nColumn: {column_name} (IQR multiplier: {iqr_multiplier})")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Number of outliers: {len(outlier_indices)} ({outlier_share:.2f}%)")

    return outlier_indices


# Multiplier applied to the IQR when computing the outlier bounds below.
IQR_MULTIPLIER = 2.5

separator = "=" * 80
print(separator)
print(f"OUTLIER DETECTION (IQR Multiplier: {IQR_MULTIPLIER})")
print(separator)

# Run the detector once per count column, in the order casual, registered, cnt.
casual_outliers, registered_outliers, cnt_outliers = [
    detect_outliers(df, count_col, IQR_MULTIPLIER)
    for count_col in ('casual', 'registered', 'cnt')
]

OUTLIER DETECTION (IQR Multiplier: 2.5)

Column: casual (IQR multiplier: 2.5)
  Q1: 4.00, Q3: 49.75, IQR: 45.75
  Lower bound: -110.38, Upper bound: 164.12
  Number of outliers: 773 (4.36%)

Column: registered (IQR multiplier: 2.5)
  Q1: 35.00, Q3: 224.00, IQR: 189.00
  Lower bound: -437.50, Upper bound: 696.50
  Number of outliers: 290 (1.64%)

Column: cnt (IQR multiplier: 2.5)
  Q1: 41.00, Q3: 285.00, IQR: 244.00
  Lower bound: -569.00, Upper bound: 895.00
  Number of outliers: 127 (0.72%)


In [7]:
# Identify rows where outliers appear in only one column
flagged_casual = set(casual_outliers)
flagged_registered = set(registered_outliers)
flagged_cnt = set(cnt_outliers)

# A row is "only" in one column when neither of the other two columns flagged it.
casual_only = flagged_casual - (flagged_registered | flagged_cnt)
registered_only = flagged_registered - (flagged_casual | flagged_cnt)
cnt_only = flagged_cnt - (flagged_casual | flagged_registered)

separator = "=" * 80

print("\n" + separator)
print("OUTLIERS IN SINGLE COLUMNS")
print(separator)
print(f"Outliers only in 'casual': {len(casual_only)}")
print(f"Outliers only in 'registered': {len(registered_only)}")
print(f"Outliers only in 'cnt': {len(cnt_only)}")

# Calculate medians (excluding NaN); all three are taken before any value is replaced
casual_median, registered_median, cnt_median = [
    df[count_col].median() for count_col in ('casual', 'registered', 'cnt')
]

print("\n" + separator)
print("REPLACING SINGLE-COLUMN OUTLIERS WITH MEDIAN")
print(separator)
print(f"Median values:")
print(f"  casual: {casual_median}")
print(f"  registered: {registered_median}")
print(f"  cnt: {cnt_median}")

# Replace casual outliers with median
if casual_only:
    print(f"\nReplacing {len(casual_only)} outliers in 'casual' with median ({casual_median})")
    df.loc[[*casual_only], 'casual'] = casual_median

# Replace registered outliers with median
if registered_only:
    print(f"Replacing {len(registered_only)} outliers in 'registered' with median ({registered_median})")
    df.loc[[*registered_only], 'registered'] = registered_median

# Replace cnt outliers with median
if cnt_only:
    print(f"Replacing {len(cnt_only)} outliers in 'cnt' with median ({cnt_median})")
    df.loc[[*cnt_only], 'cnt'] = cnt_median

print(f"\nTotal outliers replaced: {len(casual_only) + len(registered_only) + len(cnt_only)}")


OUTLIERS IN SINGLE COLUMNS
Outliers only in 'casual': 759
Outliers only in 'registered': 256
Outliers only in 'cnt': 95

REPLACING SINGLE-COLUMN OUTLIERS WITH MEDIAN
Median values:
  casual: 17.0
  registered: 118.0
  cnt: 144.0

Replacing 759 outliers in 'casual' with median (17.0)
Replacing 256 outliers in 'registered' with median (118.0)
Replacing 95 outliers in 'cnt' with median (144.0)

Total outliers replaced: 1110


In [8]:
# Calculate missing values using the relationship: casual + registered = cnt
separator = "=" * 80
print(separator)
print("CALCULATING MISSING VALUES USING casual + registered = cnt")
print(separator)

count_cols = ['casual', 'registered', 'cnt']

# Check NaN values before filling (one count per column, in count_cols order)
casual_nan_before, registered_nan_before, cnt_nan_before = df[count_cols].isna().sum()

print(f"Missing values before calculation:")
print(f"  casual: {casual_nan_before}")
print(f"  registered: {registered_nan_before}")
print(f"  cnt: {cnt_nan_before}")

# The three fills run in a fixed order: each later fill reads columns that an
# earlier fill may already have completed.

# Fill missing cnt values where casual and registered are available
cnt_from_parts = df['casual'] + df['registered']
df['cnt'] = df['cnt'].fillna(cnt_from_parts)

# Fill missing casual values where registered and cnt are available
casual_from_total = df['cnt'] - df['registered']
df['casual'] = df['casual'].fillna(casual_from_total)

# Fill missing registered values where casual and cnt are available
registered_from_total = df['cnt'] - df['casual']
df['registered'] = df['registered'].fillna(registered_from_total)

# Check NaN values after filling
casual_nan_after, registered_nan_after, cnt_nan_after = df[count_cols].isna().sum()

print(f"\nMissing values after calculation:")
print(f"  casual: {casual_nan_after}")
print(f"  registered: {registered_nan_after}")
print(f"  cnt: {cnt_nan_after}")

# Number of cells filled = total missing before minus total missing after
total_filled = (
    (casual_nan_before + registered_nan_before + cnt_nan_before)
    - (casual_nan_after + registered_nan_after + cnt_nan_after)
)
print(f"\nTotal missing values filled: {total_filled}")

CALCULATING MISSING VALUES USING casual + registered = cnt
Missing values before calculation:
  casual: 272
  registered: 256
  cnt: 247

Missing values after calculation:
  casual: 12
  registered: 10
  cnt: 8

Total missing values filled: 745


In [9]:
# Delete rows with NaN values in casual, registered, or cnt columns
required_cols = ['casual', 'registered', 'cnt']

rows_before = df.shape[0]
# A row is removed as soon as any one of the required columns is missing.
df = df.dropna(axis=0, how='any', subset=required_cols)
rows_after = df.shape[0]
rows_deleted = rows_before - rows_after

print(f"Rows before deletion: {rows_before}")
print(f"Rows after deletion: {rows_after}")
print(f"Rows deleted: {rows_deleted} ({rows_deleted/rows_before*100:.2f}%)")

Rows before deletion: 17726
Rows after deletion: 17711
Rows deleted: 15 (0.08%)


In [10]:
# Recalculate rows where casual + registered != cnt
separator = "=" * 80
print(separator)
print("FIXING INCONSISTENT ROWS (casual + registered != cnt)")
print(separator)

# Find inconsistent rows (exact comparison, no tolerance)
parts_total = df['casual'] + df['registered']
inconsistent_mask = parts_total != df['cnt']

# Show the offending rows as they look before any change
display(df[inconsistent_mask][['casual', 'registered', 'cnt']])
inconsistent_count = inconsistent_mask.sum()

print(f"Rows where casual + registered != cnt: {inconsistent_count} ({inconsistent_count/len(df)*100:.2f}%)")

if inconsistent_count == 0:
    print("\nAll rows already follow the relationship: casual + registered = cnt")
else:
    print(f"\nRecalculating the biggest value in each inconsistent row\n")

    # Track what was recalculated
    recalc_casual = recalc_registered = recalc_cnt = 0

    for idx in df[inconsistent_mask].index:
        casual_val = df.loc[idx, 'casual']
        registered_val = df.loc[idx, 'registered']
        cnt_val = df.loc[idx, 'cnt']

        # The largest of the three values is the one that gets overwritten;
        # on ties casual wins over registered, and registered over cnt.
        max_val = max(casual_val, registered_val, cnt_val)

        if casual_val == max_val:
            # casual = cnt - registered
            df.loc[idx, 'casual'] = cnt_val - registered_val
            recalc_casual += 1
        elif registered_val == max_val:
            # registered = cnt - casual
            df.loc[idx, 'registered'] = cnt_val - casual_val
            recalc_registered += 1
        else:
            # cnt = casual + registered
            df.loc[idx, 'cnt'] = casual_val + registered_val
            recalc_cnt += 1

    print(f"Recalculation summary:")
    print(f"  casual recalculated: {recalc_casual}")
    print(f"  registered recalculated: {recalc_registered}")
    print(f"  cnt recalculated: {recalc_cnt}")

    # Verify all rows are now consistent
    still_inconsistent = df['casual'].add(df['registered']).ne(df['cnt']).sum()

    print(f"\nVerification after recalculation:")
    print(f"  Inconsistent rows remaining: {still_inconsistent}")

    if still_inconsistent != 0:
        print("\nWarning: Some inconsistent rows remain!")
    else:
        print("\nAll rows now follow the relationship: casual + registered = cnt")


# Same rows as displayed above (mask computed before the fix), now with updated values
display(df[inconsistent_mask][['casual', 'registered', 'cnt']])

FIXING INCONSISTENT ROWS (casual + registered != cnt)


Unnamed: 0,casual,registered,cnt
16,41,118,93
23,15,24,415
39,108,67,76
78,55,37,42
100,6,109,144
...,...,...,...
17671,17,362,562
17679,17,348,606
17706,17,221,386
17708,17,429,497


Rows where casual + registered != cnt: 1215 (6.86%)

Recalculating the biggest value in each inconsistent row

Recalculation summary:
  casual recalculated: 27
  registered recalculated: 153
  cnt recalculated: 1035

Verification after recalculation:
  Inconsistent rows remaining: 0

All rows now follow the relationship: casual + registered = cnt


Unnamed: 0,casual,registered,cnt
16,41,52,93
23,15,24,39
39,9,67,76
78,5,37,42
100,6,109,115
...,...,...,...
17671,17,362,379
17679,17,348,365
17706,17,221,238
17708,17,429,446


In [11]:
# Search for negative values
separator = "=" * 80
print(separator)
print("CHECKING FOR NEGATIVE VALUES")
print(separator)

# Per-column count of entries below zero, in the order casual, registered, cnt
negative_casual, negative_registered, negative_cnt = [
    (df[count_col] < 0).sum() for count_col in ('casual', 'registered', 'cnt')
]

print(f"Negative values found:")
print(f"  casual: {negative_casual}")
print(f"  registered: {negative_registered}")
print(f"  cnt: {negative_cnt}")

total_negatives = negative_casual + negative_registered + negative_cnt
print(f"\nTotal negative values: {total_negatives}")

if total_negatives == 0:
    print("\nNo negative values found!")
else:
    # A row is listed when at least one of its three counts is negative
    negative_mask = (df['casual'] < 0) | (df['registered'] < 0) | (df['cnt'] < 0)
    print(f"\nRows with negative values:")
    display(df[negative_mask][['instant', 'dteday', 'casual', 'registered', 'cnt']])

CHECKING FOR NEGATIVE VALUES
Negative values found:
  casual: 4
  registered: 1
  cnt: 0

Total negative values: 5

Rows with negative values:


Unnamed: 0,instant,dteday,casual,registered,cnt
839,840.0,2011-02-07,-63,118,55
920,921.0,2011-02-10,-46,118,72
3675,3676.0,2011-06-07,66,-62,4
5834,5835.0,2011-09-05,-91,235,144
9773,9774.0,2012-02-17,-270,271,1


In [12]:
# Delete rows with at least one negative value
separator = "=" * 80
print(separator)
print("DELETING ROWS WITH NEGATIVE VALUES")
print(separator)

rows_before = df.shape[0]

# True for every row that has a negative casual, registered, or cnt value
negative_mask = (df['casual'] < 0) | (df['registered'] < 0) | (df['cnt'] < 0)
rows_with_negatives = negative_mask.sum()

# Keep only the rows that were not flagged
df = df.loc[~negative_mask]

rows_after = df.shape[0]
rows_deleted = rows_before - rows_after

print(f"Rows before deletion: {rows_before}")
print(f"Rows after deletion: {rows_after}")
print(f"Rows deleted: {rows_deleted} ({rows_deleted/rows_before*100:.2f}%)")


DELETING ROWS WITH NEGATIVE VALUES
Rows before deletion: 17711
Rows after deletion: 17706
Rows deleted: 5 (0.03%)


In [14]:
# Write the cleaned frame to disk without the row index.
# NOTE(review): this is the same path the notebook loads from, so the input file is
# overwritten - confirm that is intended.
df.to_csv(path_or_buf="../data/processed/bike_sharing_cleaned.csv", index=False)
print("Data saved")

Data saved
