# Data Profiling & Quality Assessment

In [2]:
import hashlib
import os

import pandas
import pandas as pd

In [2]:
# Load the two raw inputs: the combined 2024 Divvy trips (CSV) and the
# 2024 weather records (JSON).
divvydf = pd.read_csv(filepath_or_buffer="combined_2024_divvy_data.csv")
weatherdf = pd.read_json(path_or_buf="weather_2024.json")

In [3]:
# Number of rows in the raw Divvy frame
divvydf.shape[0]

5860568

In [4]:
# Number of rows in the weather frame
weatherdf.shape[0]

366

## Weather Data Profiling & Quality Assessment

In [5]:
# Compute every weather profiling artefact up front; the following cells
# display them one at a time.
weather_summary_stats = weatherdf.describe()                     # descriptive statistics
weather_missing_values = weatherdf.isna().sum()                  # null count per column
weather_duplicates = weatherdf.duplicated(keep="first").sum()    # fully duplicated rows
weather_unique_counts = weatherdf.nunique(dropna=True)           # distinct values per column
weather_data_types = weatherdf.dtypes                            # dtype per column

In [6]:
# Weather summary statistics.
# Ending the cell with the bare expression lets Jupyter render the frame as a
# rich table instead of the wrapped plain text that print() produces.
weather_summary_stats

           LATITUDE     LONGITUDE     ELEVATION        PRCP        SNOW  \
count  3.660000e+02  3.660000e+02  3.660000e+02  366.000000  366.000000   
mean   4.196017e+01 -8.793164e+01  2.048000e+02    0.095410    0.064481   
std    1.921092e-13  4.126789e-13  1.394570e-12    0.229101    0.397658   
min    4.196017e+01 -8.793164e+01  2.048000e+02    0.000000    0.000000   
25%    4.196017e+01 -8.793164e+01  2.048000e+02    0.000000    0.000000   
50%    4.196017e+01 -8.793164e+01  2.048000e+02    0.000000    0.000000   
75%    4.196017e+01 -8.793164e+01  2.048000e+02    0.047500    0.000000   
max    4.196017e+01 -8.793164e+01  2.048000e+02    1.640000    4.700000   

             SNWD        TAVG        TMAX        TMIN  
count  366.000000  366.000000  366.000000  366.000000  
mean     0.193169   54.926230   63.497268   46.456284  
std      0.788814   18.535343   20.118791   17.636730  
min      0.000000   -7.000000    2.000000  -10.000000  
25%      0.000000   40.000000   47.000000   

In [7]:
# Weather missing values: null count per column.
# Bare last expression so Jupyter displays the Series as the cell result.
weather_missing_values

STATION      0
NAME         0
LATITUDE     0
LONGITUDE    0
ELEVATION    0
DATE         0
PRCP         0
SNOW         0
SNWD         0
TAVG         0
TMAX         0
TMIN         0
dtype: int64


In [8]:
# Weather duplicates: number of fully duplicated rows
print(f"{weather_duplicates}")

0


In [9]:
# Weather unique counts: number of distinct values in each column.
# Bare last expression so Jupyter displays the Series as the cell result.
weather_unique_counts

STATION        1
NAME           1
LATITUDE       1
LONGITUDE      1
ELEVATION      1
DATE         366
PRCP          60
SNOW          16
SNWD           6
TAVG          73
TMAX          78
TMIN          71
dtype: int64


In [10]:
# Weather data types: dtype of each column.
# Bare last expression so Jupyter displays the Series as the cell result.
weather_data_types

STATION              object
NAME                 object
LATITUDE            float64
LONGITUDE           float64
ELEVATION           float64
DATE         datetime64[ns]
PRCP                float64
SNOW                float64
SNWD                float64
TAVG                  int64
TMAX                  int64
TMIN                  int64
dtype: object


## Divvy Data Profiling & Quality Assessment

In [11]:
# Compute every Divvy profiling artefact up front; the following cells
# display them one at a time.
div_summary_stats = divvydf.describe()                     # descriptive statistics
div_missing_values = divvydf.isna().sum()                  # null count per column
div_duplicates = divvydf.duplicated(keep="first").sum()    # fully duplicated rows
div_unique_counts = divvydf.nunique(dropna=True)           # distinct values per column
div_data_types = divvydf.dtypes                            # dtype per column

In [12]:
# Basic summary statistics for the Divvy data.
# Ending the cell with the bare expression lets Jupyter render the frame as a
# rich table instead of the wrapped plain text that print() produces.
div_summary_stats

                 ride_id  rideable_type  start_date start_time    end_date  \
count            5860568        5860568     5860568    5860568     5860568   
unique           5860357              3         366      86289         366   
top     43F5E06516957BF9  electric_bike  2024-09-21   17:11:03  2024-09-21   
freq                   2        2980595       34698        226       34764   

        end_time  
count    5860568  
unique     86296  
top     17:22:55  
freq         216  


In [13]:
# Missing values: null count per Divvy column.
# Bare last expression so Jupyter displays the Series as the cell result.
div_missing_values

ride_id          0
rideable_type    0
start_date       0
start_time       0
end_date         0
end_time         0
dtype: int64


In [14]:
# Duplicates: number of fully duplicated rows in the Divvy data
print(f"{div_duplicates}")

211


In [15]:
# Unique counts: number of distinct values within each Divvy column.
# Bare last expression so Jupyter displays the Series as the cell result.
div_unique_counts

ride_id          5860357
rideable_type          3
start_date           366
start_time         86289
end_date             366
end_time           86296
dtype: int64


In [16]:
# Data type of each Divvy column (one dtype per column, not a list of
# distinct dtypes).
# Bare last expression so Jupyter displays the Series as the cell result.
div_data_types

ride_id          object
rideable_type    object
start_date       object
start_time       object
end_date         object
end_time         object
dtype: object


## Combined 2024 Divvy Data Cleaning

In [17]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence of each.
# NOTE: this rebinds `divvydf`, so re-running the profiling cells above after
# this cell will profile the de-duplicated frame rather than the raw one.
divvydf = divvydf.drop_duplicates(subset=None, keep="first")

In [18]:
# Persist the de-duplicated trips; the row index is not written to the file.
divvydf.to_csv(path_or_buf="cleaned_2024_divvy_data.csv", index=False)

In [19]:
# Read the cleaned file back from disk and show its row count.
cleaned_divvydf = pd.read_csv(filepath_or_buffer="cleaned_2024_divvy_data.csv")
cleaned_divvydf.shape[0]

5860357

In [20]:
# Re-check the reloaded file: no fully duplicated rows should remain.
c_div_duplicates = cleaned_divvydf.duplicated().sum()
# Enforce the invariant so a Run All fails loudly instead of relying on a
# reader noticing a non-zero number in the output.
assert c_div_duplicates == 0, f"{c_div_duplicates} duplicate rows remain after cleaning"
print(c_div_duplicates)

0


## Integrity Checks (SHA-256 hashes to detect unexpected file changes)


In [21]:
# Fingerprint the cleaned Divvy CSV so later runs can detect unexpected changes.
# The file is hashed in fixed-size chunks so the multi-million-row CSV never
# has to be held in memory all at once.
divvy_digest = hashlib.sha256()
with open("cleaned_2024_divvy_data.csv", "rb") as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b""):
        divvy_digest.update(chunk)
sha256hash = divvy_digest.hexdigest()

# open(..., "w") does not create missing directories, so make sure the
# output folder exists before writing the digest.
os.makedirs("hashes", exist_ok=True)
with open("hashes/divvy.sha", "w") as f:
    f.write(sha256hash)

In [22]:
# Fingerprint the raw weather JSON so later runs can detect unexpected changes.
# Hashed in fixed-size chunks so memory use does not depend on the file size.
weather_digest = hashlib.sha256()
with open("weather_2024.json", "rb") as f:
    for chunk in iter(lambda: f.read(1024 * 1024), b""):
        weather_digest.update(chunk)
sha256hash = weather_digest.hexdigest()

# open(..., "w") does not create missing directories, so make sure the
# output folder exists before writing the digest.
os.makedirs("hashes", exist_ok=True)
with open("hashes/weather.sha", "w") as f:
    f.write(sha256hash)