# Session 5C - Coding Co-Lab

[session link](https://eds-217-essential-python.github.io/course-materials/coding-colabs/5c_cleaning_data.html)

In [62]:
import pandas as pd
import numpy as np
import re

url = 'https://bit.ly/messy_csv'
messy_df = pd.read_csv(url)

Your task is to clean this dataframe by

In [12]:
#Removing duplicates
df_no_duplicates = messy_df.drop_duplicates().copy()
print(df_no_duplicates.head())

     site  temperature_c   pH  dissolved_oxygen_mg_L collection date
0  SITE_A           25.5  6.8                    8.5      2023-06-15
1  site_b           30.0  7.2                    7.8      2023-06-16
2   SiteC           22.8  6.5                    9.2      2023-06-15
4  site_b            NaN  7.0                    7.5      2023-06-17


In [63]:
# Handling missing values (either fill or dropna to remove rows with missing data)
print(messy_df.isnull().sum())

# Fill missing values with a specific value
messy_df['temperature_c'] = messy_df['temperature_c'].fillna('unknown')

print(messy_df)
print(messy_df.isnull().sum())

site                     0
temperature_c            1
pH                       0
dissolved_oxygen_mg_L    0
collection date          0
dtype: int64
     site temperature_c   pH  dissolved_oxygen_mg_L collection date
0  SITE_A          25.5  6.8                    8.5      2023-06-15
1  site_b          30.0  7.2                    7.8      2023-06-16
2   SiteC          22.8  6.5                    9.2      2023-06-15
3  SITE_A          25.5  6.8                    8.5      2023-06-15
4  site_b       unknown  7.0                    7.5      2023-06-17
site                     0
temperature_c            0
pH                       0
dissolved_oxygen_mg_L    0
collection date          0
dtype: int64


In [64]:
# Ensuring consistent data types (dates, strings)
messy_df.dtypes

messy_df['site'] = messy_df['site'].astype('string')
messy_df['collection date'] = pd.to_datetime(messy_df['collection date'])

messy_df.dtypes

site                     string[python]
temperature_c                    object
pH                              float64
dissolved_oxygen_mg_L           float64
collection date          datetime64[ns]
dtype: object

In [65]:
# Formatting the ‘site’ column for consistency
messy_df['site'] = messy_df['site'].str.lower()

messy_df.head()

Unnamed: 0,site,temperature_c,pH,dissolved_oxygen_mg_L,collection date
0,site_a,25.5,6.8,8.5,2023-06-15
1,site_b,30.0,7.2,7.8,2023-06-16
2,sitec,22.8,6.5,9.2,2023-06-15
3,site_a,25.5,6.8,8.5,2023-06-15
4,site_b,unknown,7.0,7.5,2023-06-17


In [66]:
# Making sure all column names are lower case, without whitespace.
messy_df['site'] = messy_df['site'].str.replace("sitec", "site_c")
messy_df.head()

Unnamed: 0,site,temperature_c,pH,dissolved_oxygen_mg_L,collection date
0,site_a,25.5,6.8,8.5,2023-06-15
1,site_b,30.0,7.2,7.8,2023-06-16
2,site_c,22.8,6.5,9.2,2023-06-15
3,site_a,25.5,6.8,8.5,2023-06-15
4,site_b,unknown,7.0,7.5,2023-06-17


In [68]:
# rename columns
messy_df = messy_df.rename(columns={'collection date': 'collection_date'})
messy_df.head()

Unnamed: 0,site,temperature_c,pH,dissolved_oxygen_mg_L,collection_date
0,site_a,25.5,6.8,8.5,2023-06-15
1,site_b,30.0,7.2,7.8,2023-06-16
2,site_c,22.8,6.5,9.2,2023-06-15
3,site_a,25.5,6.8,8.5,2023-06-15
4,site_b,unknown,7.0,7.5,2023-06-17


In [61]:
messy_df['site'].str.replace("[ec]", "_")

In [61]:
messy_df['site'] = re.sub(r'\w+(?=c)', r'\1_', messy_df['site'])

AttributeError: Can only use .str accessor with string values!

In [56]:
messy_df['site'] = messy_df['site']\w+(?=c)

TypeError: expected string or bytes-like object