In [5]:
import os
import pandas as pd

# Folder holding the semicolon-separated monthly sensor archives.
csv_folder = r"C:\Users\HYH\Desktop\archive\Data"


def summarize_missing(df):
    """Build a per-column missing-data summary for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with four columns: the NaN count, the
        count of cells equal to the empty string, the total number of rows,
        and the combined share of both kinds of gaps as a percentage rounded
        to two decimals. A frame with zero rows reports 0.0 percent instead
        of the NaN that a 0/0 division would produce.
    """
    total_rows = len(df)
    missing_info = df.isnull().sum()
    empty_string_counts = (df == "").sum()
    missing_total = missing_info + empty_string_counts

    if total_rows:
        missing_percent = (missing_total / total_rows * 100).round(2)
    else:
        # Header-only file: nothing can be missing, and dividing by zero
        # would fill the column with NaN.
        missing_percent = pd.Series(0.0, index=missing_total.index)

    return pd.DataFrame({
        "Missing Values (NaN): ": missing_info,
        "NaN(\"\")": empty_string_counts,
        "Total records": total_rows,
        "Missing Percent": missing_percent,
    })


if not os.path.isdir(csv_folder):
    # Report a bad path in the same printed style as per-file errors below
    # instead of stopping with a raw traceback.
    print(f"ERROR Read: {csv_folder}, ERROR: folder not found")
else:
    # os.listdir returns names in arbitrary order; sort so the report is
    # reproducible from run to run.
    for filename in sorted(os.listdir(csv_folder)):
        if not filename.endswith(".csv"):
            continue
        filepath = os.path.join(csv_folder, filename)
        try:
            print(f"\nanalysis: {filename}")
            df = pd.read_csv(filepath, sep=";", low_memory=False)

            # Non-numeric readings become NaN so they are counted as missing.
            df["value"] = pd.to_numeric(df["value"], errors="coerce")

            result = summarize_missing(df)
            print(result)

        except Exception as e:
            # Best effort: one unreadable file (or one without a "value"
            # column) must not stop the scan of the remaining files.
            print(f"ERROR Read: {filename}, ERROR: {e}")



analysis: april_2018_sensor_data_archive.csv
             Missing Values (NaN):   NaN("")  Total records  Missing Percent
sensor_id                         0        0         165156              0.0
sensor_type                       0        0         165156              0.0
location                          0        0         165156              0.0
lat                               0        0         165156              0.0
lon                               0        0         165156              0.0
timestamp                         0        0         165156              0.0
value_type                        0        0         165156              0.0
value                             0        0         165156              0.0

analysis: april_2019_sensor_data_archive.csv
             Missing Values (NaN):   NaN("")  Total records  Missing Percent
sensor_id                         0        0         668165              0.0
sensor_type                       0        0         668165  

In [7]:
import pandas as pd

# Single monthly archive to inspect row by row.
filepath = r"C:\Users\HYH\Desktop\archive\Data\june_2019_sensor_data_archive.csv"

# Read the semicolon-delimited file and coerce "value" to numeric in one
# chain; entries that cannot be parsed as numbers become NaN.
df = pd.read_csv(filepath, sep=";", low_memory=False).assign(
    value=lambda frame: pd.to_numeric(frame["value"], errors="coerce")
)

# Boolean mask: True for every row that has a NaN in at least one column.
has_gap = df.isnull().any(axis=1)
missing_rows = df.loc[has_gap]

print(f"Totaly {len(missing_rows)} have missing values: ")

# Temporarily widen the display limits so all columns are shown and up to
# 100 rows are rendered before pandas truncates the table.
with pd.option_context("display.max_rows", 100, "display.max_columns", None):
    display(missing_rows)


Totaly 418 have missing values: 


Unnamed: 0,sensor_id,sensor_type,location,lat,lon,timestamp,value_type,value
394364,134,SDS011,7,-1.298,36.791,2019-06-21T08:24:07.032170+00:00,timestamp,
394372,134,SDS011,7,-1.298,36.791,2019-06-21T08:24:27.547823+00:00,timestamp,
394384,134,SDS011,7,-1.298,36.791,2019-06-21T08:25:09.322161+00:00,timestamp,
394416,134,SDS011,7,-1.298,36.791,2019-06-21T08:28:08.073958+00:00,timestamp,
394420,134,SDS011,7,-1.298,36.791,2019-06-21T08:28:17.319902+00:00,timestamp,
...,...,...,...,...,...,...,...,...
486168,135,UltimateGPS,7,-1.298,36.791,2019-06-26T08:19:13.982435+00:00,timestamp,
486182,135,UltimateGPS,7,-1.298,36.791,2019-06-26T08:19:46.814764+00:00,timestamp,
486198,135,UltimateGPS,7,-1.298,36.791,2019-06-26T08:20:19.698593+00:00,timestamp,
486212,135,UltimateGPS,7,-1.298,36.791,2019-06-26T08:20:52.828044+00:00,timestamp,


In [10]:
import os
import re
import pandas as pd
from pymongo import MongoClient

# Source folder with the monthly CSV archives and the target MongoDB
# collection the cleaned rows are loaded into.
csv_folder = r"C:\Users\HYH\Desktop\archive\Data"
client = MongoClient("mongodb://localhost:27017")
db = client["airquality"]
collection = db["nairobi_data"]

# English month name (as used in the file names) -> two-digit month number.
month_map = {
    "january": "01", "february": "02", "march": "03",
    "april": "04", "may": "05", "june": "06",
    "july": "07", "august": "08", "september": "09",
    "october": "10", "november": "11", "december": "12"
}

# List the folder BEFORE wiping the collection: if the path is wrong,
# os.listdir raises here and the existing data is left intact.
# Sorting makes the import order reproducible (os.listdir order is arbitrary).
csv_files = sorted(name for name in os.listdir(csv_folder) if name.endswith(".csv"))

# Start from an empty collection so re-running the cell does not duplicate rows.
collection.delete_many({})

skipped_files = []  # file name did not yield a "<month>_<year>" prefix
failed_files = []   # reading or inserting raised an exception

for filename in csv_files:
    filepath = os.path.join(csv_folder, filename)

    # File names are expected to start with "<month>_<yyyy>".
    match = re.match(r"([a-z]+)_(\d{4})", filename.lower())
    if not match:
        print(f"None File: {filename}")
        skipped_files.append(filename)
        continue

    month_name, year = match.groups()
    month = month_map.get(month_name)
    if not month:
        print(f"None month: {filename}")
        skipped_files.append(filename)
        continue

    source_month = f"{year}-{month}"

    try:
        df = pd.read_csv(filepath, sep=";", low_memory=False)

        # Non-numeric readings become NaN ...
        df["value"] = pd.to_numeric(df["value"], errors="coerce")

        # ... and every row with a NaN in any column is discarded.
        df = df.dropna()

        # Provenance columns so each document can be traced to its file.
        df["source_file"] = filename
        df["source_month"] = source_month

        records = df.to_dict(orient="records")
        if records:
            collection.insert_many(records)
            print(f"Successfully import: {filename} (Totally {len(records)} records)")
        else:
            print(f"File {filename} have None records! ")

    except Exception as e:
        # Best effort: keep importing the remaining files, but remember the
        # failure so the final summary does not claim full success.
        print(f"Error import {filename}, Error: {e}")
        failed_files.append(filename)

# Only claim success when every CSV was actually processed.
if failed_files or skipped_files:
    print(f"Import finished with problems. Failed: {failed_files}; skipped: {skipped_files}")
else:
    print("All files imported successfully! ")


Successfully import: april_2018_sensor_data_archive.csv (Totally 165156 records)
Successfully import: april_2019_sensor_data_archive.csv (Totally 668165 records)
Successfully import: august_2017_sensor_data_archive.csv (Totally 10344 records)
Successfully import: august_2018_sensor_data_archive.csv (Totally 555708 records)
Successfully import: august_2019_sensor_data_archive.csv (Totally 632330 records)
Successfully import: december_2017_sensor_data_archive.csv (Totally 129184 records)
Successfully import: december_2018_sensor_data_archive.csv (Totally 726511 records)
Successfully import: february_2018_sensor_data_archive.csv (Totally 140408 records)
Successfully import: february_2019_sensor_data_archive.csv (Totally 657024 records)
Successfully import: january_2018_sensor_data_archive.csv (Totally 103042 records)
Successfully import: january_2019_sensor_data_archive.csv (Totally 699772 records)
Successfully import: july_2017_sensor_data_archive.csv (Totally 378 records)
Successfully i

In [11]:
from pymongo import MongoClient

# Connect to the local MongoDB instance and select the target collection.
client = MongoClient("mongodb://localhost:27017/")
collection = client["airquality"]["nairobi_data"]
db = collection.database


def _range_violation(kind, *bounds):
    """Return a filter matching documents of ``kind`` that break any bound.

    ``kind`` is used verbatim as the ``value_type`` condition; each item of
    ``bounds`` is an ``(operator, limit)`` pair applied to ``value``, and the
    pairs are OR-ed together in the order given.
    """
    return {
        "value_type": kind,
        "$or": [{"value": {operator: limit}} for operator, limit in bounds],
    }


# A document is treated as abnormal when it matches any of these rules:
#   P1 / P2 / P0 : value above 1000 or below 0
#   temperature  : value below 0 or above 40
#   humidity     : value below 0 or above 100
abnormal_conditions = {
    "$or": [
        _range_violation({"$in": ["P1", "P2", "P0"]}, ("$gt", 1000), ("$lt", 0)),
        _range_violation("temperature", ("$lt", 0), ("$gt", 40)),
        _range_violation("humidity", ("$lt", 0), ("$gt", 100)),
    ]
}

# Remove every matching document and report how many were deleted.
result = collection.delete_many(abnormal_conditions)

print(f"Delete {result.deleted_count} records.")


Delete 10746 records.
