# Exploratory Data Analysis (EDA)

### Package Imports

In [None]:
import polars as pl
import plotly.express as px
import matplotlib.pyplot as plt



pl.Config.set_tbl_rows(-1)
%matplotlib inline

## Data Preparation

In [None]:
# Set up a scan of the full pipe-separated CSV; the result is materialised
# later via .collect().
df_pl = pl.scan_csv("../data/IoT Data.csv", separator="|")

In [None]:
# Run the scan defined above and keep the resulting frame as `df`.
df = df_pl.collect()

In [None]:
# Number of rows in the frame as loaded (still including the row dropped below).
length_computed = df.height

# Skip row 0, which is irrelevant, and keep every row after it.
df = df.slice(offset=1, length=length_computed - 1)
df.head()

#### What is the total available data ?

In [None]:
# Total number of rows currently in the frame.
df.height

#### What is the total number of devices ?

In [None]:
# Count of distinct device identifiers across the whole frame.
df.get_column('DEVICE_ID').n_unique()

#### What are some descriptive statistics about the data ?

In [None]:
# Summary statistics for every column of the frame.
df.describe()

#### Filter out Water Consumption Data

In [None]:
# Keep only rows that satisfy all three conditions: measure type 'Con',
# display unit "L", and device type "WATER METER".
is_consumption = pl.col('OGI_MEASURE_TYPE') == 'Con'
is_litres = pl.col('TAG_DISPLAY_UNIT') == "L"
is_water_meter = pl.col('DEVICE_TYPE_NAME') == "WATER METER"

df = df.filter(is_consumption & is_litres & is_water_meter)
df.head()

#### What are the data types ?

In [None]:
# Column names with their current dtypes, used to decide which casts are needed.
df.schema

Based on the schema, we need to convert some columns to the right data type.

#### Convert longitude and latitude to floats

Since the latitude and longitude are currently stored as strings in the schema, convert them to floats to prepare them for future analysis.

In [None]:
# Latitude/longitude arrive as strings using a comma; replace it with a dot
# and cast both columns to Float32 in a single pass.
spatial_columns = ['OGI_LAT', 'OGI_LONG']
df = df.with_columns(
    [pl.col(name).str.replace(",", ".").cast(pl.Float32) for name in spatial_columns]
)

#### Convert strings datatype to the right datatype

In [None]:
# Raw reading: comma replaced by a dot, then cast to Float64.
reading_as_float = pl.col('TAG_VALUE_RAW').str.replace(',', '.').cast(pl.Float64)

# Device identifier as a 32-bit integer.
device_id_as_int = pl.col('DEVICE_ID').cast(pl.Int32)

# Timestamp parsed with fractional seconds; parsing is non-strict
# (NOTE(review): values that do not match the format presumably become null — confirm).
timestamp_parsed = pl.col("TAG_VALUE_DATE").str.strptime(
    pl.Datetime, "%Y-%m-%d %H:%M:%S.%f", strict=False
)

df = df.with_columns([reading_as_float, device_id_as_int, timestamp_parsed])

In [None]:
# Cross-check the schema again to confirm the dtype conversions took effect.

df.schema

## EDA

#### What are the different levels available in the data ?

In [None]:
# Frequency of each OGI_LEVEL value in the filtered data.
value_counts = df.get_column('OGI_LEVEL').value_counts()
value_counts

In [None]:
# Bar chart of the number of rows per level (pandas plotting backend).
value_counts.to_pandas().plot.bar(x='OGI_LEVEL', y='count')

#### How many devices are in each level ?

In [None]:
# One frame per level value; OGI_LEVEL is compared as a string here.
level_1_devices, level_2_devices, level_3_devices = (
    df.filter(pl.col('OGI_LEVEL') == level) for level in ("1", "2", "3")
)

# Distinct device count for levels 1, 2 and 3, in that order.
tuple(
    level_frame['DEVICE_ID'].n_unique()
    for level_frame in (level_1_devices, level_2_devices, level_3_devices)
)

#### What are the names of the different camps?

In [None]:
# Frequency of each camp name (this rebinds `value_counts`).
value_counts = df.get_column('DEVICE_LOCATION_NAME').value_counts()
value_counts

#### How many devices are available in total?

In [None]:
# Distinct devices remaining after the water-consumption filter.
df.get_column('DEVICE_ID').n_unique()

#### What is the device distribution across the camps?

In [None]:
# Number of distinct devices reporting from each camp.
unique_devices = pl.col("DEVICE_ID").n_unique().alias("device_count")
camp_device_counts = df.group_by("DEVICE_LOCATION_NAME").agg(unique_devices)

camp_device_counts

#### Which camp has the most data to work with?

In [None]:
# Number of records per camp, largest first. It is computed in this cell
# because no earlier cell creates `camp_data_counts`; without it the cell
# fails with a NameError.
camp_data_counts = (
    df['DEVICE_LOCATION_NAME']
    .value_counts(sort=True)
    .rename({"count": "data_count"})
)
camp_data_counts_pd = camp_data_counts.to_pandas()

# Bar chart of how many data points each camp contributes.
fig = px.bar(camp_data_counts_pd, x="DEVICE_LOCATION_NAME", y="data_count",
             labels={"DEVICE_LOCATION_NAME": "Camp Name", "data_count": "Number of Data Points"},
             title="Distribution of Data Points Across Camps")
fig.show()

# Row of the camp with the largest record count.
highest_data_camp = camp_data_counts_pd.loc[camp_data_counts_pd['data_count'].idxmax()]
print(f"The camp with the highest data count is: {highest_data_camp['DEVICE_LOCATION_NAME']} with {highest_data_camp['data_count']} data points.")

In [None]:
# Display the per-camp record counts as a table.
camp_data_counts

In [None]:
# Total rows across all camps, for comparison with the per-camp counts.
df.height

Abyei has the most data, so we will use Abyei as the study focus.

#### Filter out Abyei

In [None]:
# Restrict the data to records whose location name is exactly "Abyei".
is_abyei = pl.col("DEVICE_LOCATION_NAME") == "Abyei"
abyei_df = df.filter(is_abyei)

### Resample to daily consumption
Also select relevant columns

In [None]:
# Calendar day of each reading: timestamp truncated to the day, stored as a Date.
reading_day = pl.col("TAG_VALUE_DATE").dt.truncate("1d").cast(pl.Date).alias("DATE")
device_day = ["DEVICE_ID", "DATE"]

# Abyei readings tagged with their day and ordered by device, then day.
abyei_by_day = abyei_df.with_columns(reading_day).sort(device_day)

# One row per device per day: the maximum raw reading of that day plus the
# first longitude/latitude seen in the group, ordered by device then day.
cummulative_daily_consumption = (
    abyei_by_day
    .group_by(device_day)
    .agg(
        [
            pl.col("TAG_VALUE_RAW").max().alias("CUMMULATIVE_CONSUMPTION"),
            pl.col("OGI_LONG").first(),
            pl.col("OGI_LAT").first(),
        ]
    )
    .sort(device_day)
)


In [None]:
# Preview the first rows of the per-device daily table.
cummulative_daily_consumption.head()

In [None]:
# Number of device-day rows after resampling.
cummulative_daily_consumption.height

### Visualization

In [None]:
# --- TIME SERIES PLOT (Before & After Resampling for Most Reported Device) ---
# pandas copy of the daily table, reused by the plotting cell that follows.
df_pandas = cummulative_daily_consumption.to_pandas()

# Pick the device that has the most rows in the daily table.
rows_per_device = df_pandas["DEVICE_ID"].value_counts()
device_id = rows_per_device.idxmax()

print(device_id)

In [None]:
# Raw readings of the selected device, ordered by timestamp so the line is
# drawn chronologically regardless of the row order in the source file.
before_resampling_df = (
    df.filter(pl.col('DEVICE_ID') == device_id)
    .sort("TAG_VALUE_DATE")
    .to_pandas()
)
# Daily rows of the same device (already ordered by date in the daily table).
df_device = df_pandas[df_pandas["DEVICE_ID"] == device_id]

plt.figure(figsize=(12, 5))

# Plot Before Resampling (raw data)
plt.plot(before_resampling_df["TAG_VALUE_DATE"], before_resampling_df["TAG_VALUE_RAW"],
         marker="o", linestyle="-", color="red", alpha=0.5, markersize=4,
         label="Raw Data (Before Resampling)")

# Plot After Resampling (daily cumulative max)
plt.plot(df_device["DATE"], df_device["CUMMULATIVE_CONSUMPTION"],
         marker="s", linestyle="--", color="green", linewidth=2, markersize=5,
         label="Daily Max (After Resampling)")

# Formatting
plt.xlabel("Time")
plt.ylabel("Cumulative Water Consumption (L)")
plt.title(f"Comparison of Cumulative Water Consumption Before and After Resampling (Device {device_id})")
plt.legend()
plt.grid(True)
# Save the figure as a PNG file (must happen before plt.show()).
plt.savefig("../visualizations/plots/resampling_comparison.png", dpi=300, bbox_inches="tight")
plt.show()


## Time Series Visualizations

#### Time series chart for level 1 devices

In [None]:
# Column names shared by the time-series plotting cells.
datetime_column = 'DATE'
value_column = 'CUMMULATIVE_CONSUMPTION'

# Unique IDs of the devices whose OGI_LEVEL is "1".
device_tags = level_1_devices['DEVICE_ID'].unique().to_list()

# A dedicated loop variable is used so the notebook-level `device_id`
# (the device chosen for the resampling comparison) is not overwritten.
for level_1_device_id in device_tags:
    # Daily rows for this device only.
    device_data = cummulative_daily_consumption.filter(
        pl.col("DEVICE_ID") == level_1_device_id
    )

    # Level-1 devices with no rows in the daily table are skipped.
    if device_data.is_empty():
        continue

    # Convert to pandas for Plotly
    device_data_pd = device_data.to_pandas()

    fig = px.line(
        device_data_pd,
        x=datetime_column,
        y=value_column,
        labels={datetime_column: 'Timestamp', value_column: 'Water Consumption'},
        title=f'Water Consumption for Device {level_1_device_id}',
    )

    # Add a range slider and use the same y-axis wording as the
    # before/after resampling comparison plot.
    fig.update_layout(
        xaxis=dict(
            rangeslider=dict(visible=True),
            type="date",
        ),
        yaxis_title='Cumulative Water Consumption (L)',
        xaxis_title='Timestamp',
    )

    # Show the interactive plot
    fig.show()

#### Time series chart for level 2 devices

In [None]:
# Device IDs singled out for inspection; they are removed from the daily
# table in a later cell.
bad_devices = [1307, 2049, 2048, 4759]

for d in bad_devices:
    device_data = cummulative_daily_consumption.filter(pl.col('DEVICE_ID') == d)

    # Nothing to draw for a device that has no rows in the daily table.
    if device_data.is_empty():
        continue

    # Convert to pandas for Plotly, matching the level-1 plotting cell.
    device_data_pd = device_data.to_pandas()

    fig = px.line(
        device_data_pd,
        x=datetime_column,
        y=value_column,
        labels={datetime_column: 'Timestamp', value_column: 'Water Consumption'},
        title=f'Water Consumption for Device {d}',
    )

    # Add a range slider and use the same y-axis wording as the
    # before/after resampling comparison plot.
    fig.update_layout(
        xaxis=dict(
            rangeslider=dict(visible=True),
            type="date",
        ),
        yaxis_title='Cumulative Water Consumption (L)',
        xaxis_title='Timestamp',
    )

    # Show the interactive plot
    fig.show()

In [None]:
# The last one has fewer than 60 records, which makes it not useful.

In [None]:
# Drop every device listed in `bad_devices` (1307, 2049, 2048, 4759) in one pass.
keep_device = ~pl.col('DEVICE_ID').is_in(bad_devices)
cummulative_daily_consumption = cummulative_daily_consumption.filter(keep_device)

### Export The Water Consumption Data for Abyei Assessment

In [None]:
# Export the filtered water-consumption records; `df` was never narrowed to
# Abyei, so this file covers every camp.
df.write_csv('../exports/all_water_consumption.csv')

In [None]:
# Export the Abyei per-device daily table in its current state.
cummulative_daily_consumption.write_csv('../exports/cummulative_daily_consumption.csv')