In [1]:
import pandas as pd
import os

In [9]:
# Root folder of the dataset: it holds one subdirectory per housing,
# and each of those subdirectories contains a csv file.
data_path = "../../data/housing_consomation/"
list_housings = os.listdir(path=data_path)

In [35]:
def extract_housing_info_from_name(housing):
    """
    Decode the housing attributes embedded in a housing name.

    The name is split on single spaces and its second part is read as a code
    such as ``A110-5``:
        - the first character is the type of housing (A: apartment, M: house)
        - the characters between the type and the dash are the surface
        - the value after the dash is the number of people living there

    Parameters
    ----------
    housing : str
        Name whose second space-separated part is the code,
        e.g. ``"<prefix> A110-5"``.

    Returns
    -------
    tuple of (str, str, str)
        ``(h_type, surface, nb_people)``. All three values are returned as
        strings; no numeric conversion is performed.

    Raises
    ------
    IndexError
        If ``housing`` has no second space-separated part, or that part is empty.
    ValueError
        If the code does not contain exactly one dash.
    """
    info = housing.split(" ")[1]
    print(f"Housing {info}")
    h_type = info[0]
    # Slice off only the leading type character. `str.replace` would also
    # remove any later occurrence of the same character from the code.
    surface, nb_people = info[1:].split("-")
    return h_type, surface, nb_people

In [37]:
# Decode the first listed housing name and echo the parsed fields.
(h_type, h_surface, nb_people) = extract_housing_info_from_name(
    list_housings[0]
)
print(f"Type: {h_type}, Surface: {h_surface}, Nb people: {nb_people}")

Housing A110-5
Type: A, Surface: 110, Nb people: 5


In [62]:
# Location of the csv file inside the first listed housing directory.
# NOTE(review): the file name is hard-coded for housing A110-5, so this only
# works when list_housings[0] is that housing's directory.
csv_path = os.path.join(data_path, list_housings[0], "data_maison_A110-5-1.csv")

# Column renaming:
#   - "Unnamed: 1" becomes "cons_day"
#   - columns "1".."48" become the half-hour labels "00:00".."23:30"
# The labels are generated rather than typed out to avoid transcription errors.
column_names = {"Unnamed: 1": "cons_day"}
column_names.update(
    {
        str(slot): f"{(slot - 1) // 2:02d}:{((slot - 1) % 2) * 30:02d}"
        for slot in range(1, 49)
    }
)

# After renaming the columns, the first row is dropped and the index is
# renumbered from 0.
# NOTE(review): presumably row 0 holds header-like labels rather than
# measurements; confirm against the raw file.
df = (
    pd.read_csv(csv_path)
    .rename(columns=column_names)
    .drop(index=0)
    .reset_index(drop=True)
)

# Split the "date" strings on "/" once; parts 0, 1 and 2 are stored as
# day, month and year respectively, then the original column is removed.
date_parts = df["date"].str.split("/", expand=True)
df = df.assign(
    day=date_parts[0],
    month=date_parts[1],
    year=date_parts[2],
    # Housing attributes parsed from the housing name; the same value is
    # broadcast to every row.
    h_type=h_type,
    h_surface=h_surface,
    nb_people=nb_people,
).drop(columns=["date"])

# transpose the dataframe: the column names become the row labels (index)
df = df.transpose()
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1069,1070,1071,1072,1073,1074,1075,1076,1077,1078
cons_day,44.0604,63.9369,85.6734,10.4767,52.8727,52.6766,70.3011,52.579,52.1874,56.6909,...,47.2209,46.4536,45.2606,50.2896,53.1025,50.3755,48.1592,51.397,51.1419,45.8575
00:00,1.1583,1.5763,1.4716,0.2008,0.8891,1.2121,1.2507,1.0761,1.0637,0.9847,...,0.8648,1.1934,0.7554,0.7249,1.4115,0.8786,0.9739,1.308,0.8181,0.9579
00:30,1.0177,1.2686,1.759,0.2231,1.1314,0.9217,1.5186,0.8175,0.9223,1.1551,...,1.0491,0.8012,0.7219,0.7376,0.7801,0.9633,1.022,1.039,1.228,0.8065
01:00,0.9708,0.961,2.0153,0.2259,0.8801,1.1853,1.8924,1.2722,1.1653,1.5105,...,0.9119,0.9434,1.0907,0.6953,1.398,1.2049,1.0783,1.1778,0.9907,0.9011
01:30,1.2196,2.1133,2.0619,0.261,1.4455,1.6097,2.1354,1.1831,1.4834,1.939,...,1.1119,1.2395,1.4522,1.2112,1.3123,1.1032,1.3273,1.4946,1.6551,1.5598
02:00,1.7605,1.9399,3.4911,0.2603,2.4238,1.8599,2.8207,1.192,1.5895,1.4229,...,2.0728,1.3972,1.4596,1.6087,1.3574,2.26,1.6646,1.4339,1.4178,1.363
02:30,1.3206,1.442,2.1551,0.239,1.0686,1.0959,1.637,1.8963,1.9253,1.7004,...,1.2256,0.8781,1.0311,1.6806,1.3844,1.6922,1.7329,1.0216,0.9691,0.9314
03:00,1.2016,1.7217,2.3415,0.2298,1.9706,1.6365,1.855,1.9186,0.8781,1.0626,...,0.9825,1.2741,0.8746,0.8898,1.177,1.1625,0.9297,1.768,1.9959,0.9125
03:30,1.0538,2.2811,2.5901,0.2244,1.2391,1.8063,1.9796,1.027,1.8502,2.1094,...,1.8061,1.1319,0.9901,1.2789,2.061,1.1794,1.3032,1.0563,1.9829,1.0639
04:00,1.0141,1.3693,1.5104,0.213,1.1269,1.3908,1.0825,1.2187,1.4083,0.9993,...,1.359,0.8012,0.7815,0.9194,1.2176,1.5905,0.7048,1.4512,1.0338,0.9957


In [60]:
# `df` has been transposed, so its index holds the row labels (cons_day, the
# half-hour slots, day/month/year, housing attributes). Write the index so the
# saved rows stay identifiable; `index=False` would discard those labels.
df.to_csv("df_preview.csv", index=True)

str