In [None]:
# Let's compare the cleaned data with the original data and output a similarity score

import pandas as pd
# Load the three variants of the bike-sharing dataset from the shared data directory.
df = pd.read_csv('../data/bike_sharing_cleaned.csv')
df_original = pd.read_csv('../data/bike_sharing_original.csv')
df_modified = pd.read_csv('../data/bike_sharing_modified.csv')

# Print the same two-part summary (schema/dtypes, then per-column missing counts)
# for each dataset, in the order cleaned -> original -> modified.
for label, frame in (("cleaned", df), ("original", df_original), ("modified", df_modified)):
    # The first section starts flush; later sections are separated by a blank line.
    separator = "" if label == "cleaned" else "\n"
    print(f"{separator}General info of {label} data:")
    # DataFrame.info() writes its report to stdout and returns None, so it must
    # not be wrapped in print() -- doing so appends a stray "None" line.
    frame.info()
    print(f"Missing values in each column of {label} data:")
    print(frame.isna().sum())

General info of cleaned data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17726 entries, 0 to 17725
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   instant         17726 non-null  int64  
 1   dteday          17531 non-null  object 
 2   season          17294 non-null  float64
 3   yr              17316 non-null  float64
 4   mnth            17267 non-null  float64
 5   hr              17259 non-null  float64
 6   holiday         17381 non-null  object 
 7   weekday         17322 non-null  float64
 8   workingday      17344 non-null  object 
 9   weathersit      17277 non-null  float64
 10  temp            17304 non-null  float64
 11  atemp           17278 non-null  float64
 12  hum             17282 non-null  float64
 13  windspeed       17319 non-null  float64
 14  casual          17454 non-null  float64
 15  registered      17470 non-null  float64
 16  cnt             17479 non-null  float64
 17  m

In [2]:
# Per-column completeness ratio: the mean of the non-missing counts in the cleaned
# and modified frames, divided by the non-missing count in the original frame.
# Only the *number* of populated cells is compared -- the values themselves are not.
# NOTE(review): the printed banner mentions only "cleaned and original", while the
# formula also averages in the modified frame -- confirm which one is intended.
similarity_scores = {}
for name in df.columns:
    present_cleaned = df[name].notna().sum()
    present_original = df_original[name].notna().sum()
    present_modified = df_modified[name].notna().sum()
    if present_original > 0:
        similarity_scores[name] = (present_cleaned + present_modified) / (2 * present_original)
    else:
        # Nothing populated in the original column: fall back to a score of 1.
        similarity_scores[name] = 1

print("\nSimilarity scores between cleaned and original data:")
for name in similarity_scores:
    print(f"{name}: {similarity_scores[name]:.2f}")


Similarity scores between cleaned and original data:
instant: 1.01
dteday: 1.00
season: 0.99
yr: 0.99
mnth: 0.99
hr: 0.99
holiday: 1.00
weekday: 0.99
workingday: 0.99
weathersit: 0.99
temp: 0.99
atemp: 0.99
hum: 0.99
windspeed: 0.99
casual: 1.00
registered: 1.00
cnt: 1.00
mixed_type_col: 0.89
