In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
# Lookup table mapping a calendar month number (1-12) to its English name.
# The names are listed in calendar order and numbered starting from 1.
months = dict(
    enumerate(
        (
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
        ),
        start=1,
    )
)

In [8]:
# Relative path to the raw CSV export (Benin / Malanville, going by the file name).
file_path = "../data/raw/benin-malanville.csv"

# Load the whole file into a DataFrame with pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=file_path)

In [9]:
# Preview the first five rows to check the columns and value formats.
df.head(n=5)

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,WSstdev,WD,WDstdev,BP,Cleaning,Precipitation,TModA,TModB,Comments
0,2021-08-09 00:01,-1.2,-0.2,-1.1,0.0,0.0,26.2,93.4,0.0,0.4,0.1,122.1,0.0,998,0,0.0,26.3,26.2,
1,2021-08-09 00:02,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.6,0.0,0.0,0.0,0.0,0.0,998,0,0.0,26.3,26.2,
2,2021-08-09 00:03,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.7,0.3,1.1,0.5,124.6,1.5,997,0,0.0,26.4,26.2,
3,2021-08-09 00:04,-1.1,-0.1,-1.0,0.0,0.0,26.2,93.3,0.2,0.7,0.4,120.3,1.3,997,0,0.0,26.4,26.3,
4,2021-08-09 00:05,-1.0,-0.1,-1.0,0.0,0.0,26.2,93.3,0.1,0.7,0.3,113.2,1.0,997,0,0.0,26.4,26.3,


In [10]:
# Parse the raw timestamp strings into datetimes; every calendar feature below
# is derived from this column.
df["Timestamp"] = pd.to_datetime(df["Timestamp"])

# Grab the datetime accessor once instead of repeating the column lookup.
timestamps = df["Timestamp"].dt

df["Day"] = timestamps.day
df["Year"] = timestamps.year
df["Month"] = timestamps.month
# 12-hour clock rendering, e.g. "12:01 AM".
df["Time"] = timestamps.strftime("%I:%M %p")
# day_name() / month_name() return English names by default, whereas
# strftime("%A") / strftime("%B") follow the locale of the running process.
df["Day_Name"] = timestamps.day_name()
df["Month_Name"] = timestamps.month_name()

df.head()

Unnamed: 0,Timestamp,GHI,DNI,DHI,ModA,ModB,Tamb,RH,WS,WSgust,...,Precipitation,TModA,TModB,Comments,Day,Year,Month,Time,Day_Name,Month_Name
0,2021-08-09 00:01:00,-1.2,-0.2,-1.1,0.0,0.0,26.2,93.4,0.0,0.4,...,0.0,26.3,26.2,,9,2021,8,12:01 AM,Monday,August
1,2021-08-09 00:02:00,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.6,0.0,0.0,...,0.0,26.3,26.2,,9,2021,8,12:02 AM,Monday,August
2,2021-08-09 00:03:00,-1.1,-0.2,-1.1,0.0,0.0,26.2,93.7,0.3,1.1,...,0.0,26.4,26.2,,9,2021,8,12:03 AM,Monday,August
3,2021-08-09 00:04:00,-1.1,-0.1,-1.0,0.0,0.0,26.2,93.3,0.2,0.7,...,0.0,26.4,26.3,,9,2021,8,12:04 AM,Monday,August
4,2021-08-09 00:05:00,-1.0,-0.1,-1.0,0.0,0.0,26.2,93.3,0.1,0.7,...,0.0,26.4,26.3,,9,2021,8,12:05 AM,Monday,August


#### How to handle the negative values in the GHI, DNI, and DHI?

I decided to apply a data-shifting technique: each column is shifted upward by the absolute value of its minimum, so that its most negative reading becomes zero. Because the same constant is added to every value, the shape of the distribution is preserved. My justification is that night-time values should not matter much, since sunlight readings have little meaning at night.

**NOTE**: This approach might not be optimal and could therefore produce unexpected results, because the shift also raises every daytime reading by the same constant.

In [11]:
# Shift GHI upward so that its most negative reading becomes zero.
min_ghi = df["GHI"].min()
# Shift only when the column actually contains negative values: abs() of a
# positive minimum would push the data further up instead of leaving it alone.
shift_value = max(0.0, -min_ghi)

print(
    f"The minimum value is: {min_ghi}, so the shift value for the GHI is {shift_value}\n"
)

# Add the same constant to every reading, which preserves the distribution's shape.
df["GHI"] = df["GHI"] + shift_value

print("Extremes range after data shift is applied.\n")
# Single quotes inside the f-string: reusing the enclosing quote type is a
# SyntaxError on Python versions before 3.12.
print(f"The max and min value are: {df['GHI'].max()} and {df['GHI'].min()} respectively.")

The minimum value is: -12.9, so the shift value for the GHI is 12.9

Extremes range after data shift is applied.

The max and min value are: 1425.9 and 0.0 respectively.


In [12]:
# Shift DHI upward so that its most negative reading becomes zero.
min_dhi = df["DHI"].min()
# Shift only when the column actually contains negative values: abs() of a
# positive minimum would push the data further up instead of leaving it alone.
shift_value = max(0.0, -min_dhi)

print(
    f"The minimum value is: {min_dhi}, so the shift value for the DHI is {shift_value}\n"
)

# Add the same constant to every reading, which preserves the distribution's shape.
df["DHI"] = df["DHI"] + shift_value

print("Extremes range after data shift is applied.\n")
# Single quotes inside the f-string: reusing the enclosing quote type is a
# SyntaxError on Python versions before 3.12.
print(f"The max and min value are: {df['DHI'].max()} and {df['DHI'].min()} respectively.")

The minimum value is: -12.6, so the shift value for the DHI is 12.6

Extremes range after data shift is applied.

The max and min value are: 771.8000000000001 and 0.0 respectively.


In [13]:
# Shift DNI upward so that its most negative reading becomes zero.
min_dni = df["DNI"].min()
# Shift only when the column actually contains negative values: abs() of a
# positive minimum would push the data further up instead of leaving it alone.
shift_value = max(0.0, -min_dni)

print(
    f"The minimum value is: {min_dni}, so the shift value for the DNI is {shift_value}:\n"
)

# Add the same constant to every reading, which preserves the distribution's shape.
df["DNI"] = df["DNI"] + shift_value

print("Extremes range after data shift is applied:")
# Single quotes inside the f-string: reusing the enclosing quote type is a
# SyntaxError on Python versions before 3.12.
print(f"The max and the min value are: {df['DNI'].max()} and {df['DNI'].min()} respectively.")

The minimum value is: -7.8, so the shift value for the DNI is 7.8:

Extremes range after data shift is applied:
The max and the min value are: 960.0999999999999 and 0.0 respectively.


In [None]:
# Remove the "Comments" column (it shows no values in the rows previewed earlier;
# presumably it carries no information — confirm against the full file).
# errors="ignore" keeps this cell re-runnable once the column is already gone,
# and `columns=` alone selects the axis, so no separate `axis` argument is needed.
df = df.drop(columns=["Comments"], errors="ignore")

In [50]:
# One group per calendar day, keyed by (Year, Month, Day).
daily_groups = df.groupby(["Year", "Month", "Day"])

# Average the three irradiance columns within each day.
daily_means = daily_groups[["GHI", "DNI", "DHI"]].mean()

# Turn the group keys back into ordinary columns.
grouped_by_year = daily_means.reset_index()

(366, 6)

### Summary statistics for the year

The statistics below are computed over daily means: GHI, DNI, and DHI are first averaged for each day (grouped by year, month, and day), and the resulting daily values are then summarized.

In [52]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# per-day irradiance means.
daily_irradiance = grouped_by_year[["GHI", "DNI", "DHI"]]
summary_stats = daily_irradiance.describe()

summary_stats

Unnamed: 0,GHI,DNI,DHI
count,366.0,366.0,366.0
mean,252.787618,174.530607,127.628522
std,48.297326,88.040723,35.613098
min,7.4,7.7,6.9
25%,238.354896,103.882865,100.803958
50%,266.036007,179.443194,128.295625
75%,284.650278,244.481198,154.838819
max,315.840833,348.107639,215.818889
