In [18]:
import pandas as pd
import os

In [19]:
# Root directory of the raw data: it holds one subdirectory per housing.
# Each subdirectory holds the CSV file(s) for that housing; the loading cell below reads only the first listed one.
data_path = "../../data/housing_data/"


In [20]:
def _split_type_prefix(info):
    """
    Split a ``<type><surface>-...`` token into ``(h_type, remainder)``.

    - If the token starts with a non-digit character, that character is the
      housing type and is removed from the remainder.
    - If the token starts with a digit, the type letter is missing from the
      name. The digit is part of the surface, so the token is returned whole.
      The type is then looked up with the notebook's dataset-specific
      fallback ("1" -> "M", "8" -> "A"); any other digit is returned as-is,
      so callers should validate the resulting type.

    Raises IndexError if ``info`` is empty.
    """
    legacy_type_by_first_digit = {"1": "M", "8": "A"}

    marker = info[0]
    if marker.isdigit():
        # Never strip digits: they belong to the surface value.
        return legacy_type_by_first_digit.get(marker, marker), info
    # Strip only the leading type character, not every occurrence of it.
    return marker, info[1:]


def extract_housing_info_from_name(housing):
    """
    Extract the housing attributes encoded in a CSV file name.

    Only the third underscore-separated token of the name is parsed. Two
    layouts are recognised:

        - ``<a>_<b>_<T><surface>-<people>-<ref>.csv``
          e.g. ``data_maison_M110-2-4.csv``
        - ``<a>_<b>_<T><surface>-<people> (<ref>).csv``
          e.g. ``data_maison_M110-2 (4).csv``

    where:
        - ``T`` is the housing type letter (A: apartment / "appartement",
          M: house / "maison"). When the letter is missing, the type is
          guessed from the first digit (see ``_split_type_prefix``).
        - ``surface`` is the surface of the housing.
        - ``people`` is the number of people living in the housing.
        - ``ref`` is the housing reference.

    Parameters
    ----------
    housing : str
        File name (not a full path) of the housing CSV.

    Returns
    -------
    tuple of str
        ``(h_type, surface, nb_people, h_ref)``; all values are strings.

    Raises
    ------
    IndexError, ValueError
        If the name matches neither layout.
    """
    try:
        # Layout 1: everything is dash-separated inside the third token.
        info = housing.split(".")[0].split("_")[2]
        h_type, info = _split_type_prefix(info)
        surface, nb_people, h_ref = info.split("-")
    except (ValueError, IndexError):
        # Layout 2: the reference is given in parentheses after a space.
        pieces = housing.split(" ")
        h_ref = pieces[1].replace(".csv", "").strip("()")
        h_type, info = _split_type_prefix(pieces[0].split("_")[2])
        surface, nb_people = info.split("-")

    return h_type, surface, nb_people, h_ref

In [21]:
# Sanity check of the parser on a name in the "<type><surface>-<people>-<ref>.csv" layout.
extract_housing_info_from_name("data_maison_M110-2-4.csv")

('M', '110', '2', '4')

In [22]:
# Names of the entries found directly under data_path (one per housing), then their count.
# NOTE(review): os.listdir makes no ordering guarantee, so the order may differ between machines.
list_housings = os.listdir(data_path) # list of all the housings
len(list_housings)

26

In [23]:
def _read_housing_long(csv_path):
    """
    Read one housing CSV and reshape it from wide to long format.

    The second line of the file is used as the header row. Column
    'Unnamed: 0' is renamed to 'date' and column 'Unnamed: 1' (total
    consumption, per the original notes) is dropped. Every remaining column
    label ends up as a value of the 'timestamp' column.

    Returns a DataFrame with the columns 'timestamp', 'date' and
    'consommation', one row per (date, timestamp) pair. Values are left as
    they come out of the reshape; numeric casts happen in a later cell.
    """
    wide = pd.read_csv(csv_path, header=[1], sep=',')
    wide = wide.rename(columns={'Unnamed: 0': 'date'})
    wide = wide.drop('Unnamed: 1', axis=1)

    # Transpose so that each original column label becomes a row.
    flipped = wide.T

    # Keep the original column labels as data before the index is discarded.
    flipped['tmp_columns'] = flipped.index
    flipped = flipped.reset_index(drop=True)

    # The first row holds the dates (plus the literal 'date' in the helper
    # column): promote it to the header, then remove it from the data.
    flipped.columns = flipped.iloc[0]
    flipped = flipped.drop(flipped.index[0])

    # Move the helper column (now labelled 'date') to the front and give it
    # its real meaning: the time-of-day labels.
    labels = flipped.columns.tolist()
    flipped = flipped[labels[-1:] + labels[:-1]]
    flipped = flipped.rename(columns={'date': 'timestamp'})

    return pd.melt(flipped, id_vars=['timestamp'], var_name='date', value_name='consommation')


# Collect one long-format frame per housing and concatenate once at the end.
# Growing a DataFrame with pd.concat inside the loop re-copies all previous rows each time.
frames = []

for housing in list_housings:
    housing_dir = os.path.join(data_path, housing)
    if not os.path.isdir(housing_dir):
        # Skip stray files (e.g. OS metadata) sitting next to the housing folders.
        continue

    houses = os.listdir(housing_dir)
    # NOTE(review): only the first listed file of each subdirectory is loaded, and
    # os.listdir makes no ordering guarantee. Confirm this is intended.
    for house in houses[:1]:
        csv_path = os.path.join(housing_dir, house)

        h_type, h_surface, nb_people, h_ref = extract_housing_info_from_name(house)
        print(f"{csv_path} :   {h_type} {h_surface} {nb_people} {h_ref}")

        temp = _read_housing_long(csv_path)

        # Attach the attributes parsed from the file name to every row.
        temp["h_type"] = h_type
        temp["h_surface"] = h_surface
        temp["nb_people"] = nb_people
        temp["h_ref"] = h_ref

        frames.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame like the original did.
df = pd.concat(frames, axis=0, ignore_index=True) if frames else pd.DataFrame()
df

../../data/housing_data/data A110-5/data_maison_A110-5-370.csv :   A 110 5 370
../../data/housing_data/data M170-6/data_maison_M170-6-249.csv :   M 170 6 249
../../data/housing_data/data M120-5/data_maison_M120-5-1035.csv :   M 120 5 1035
../../data/housing_data/data A150-6/data_maison_A150-6-154.csv :   A 150 6 154
../../data/housing_data/data M135-3/data_maison_M135-3-492.csv :   M 135 3 492
../../data/housing_data/data M150-4/data_maison_150-4-789.csv :   M 150 4 789
../../data/housing_data/data M100-3/data_maison_M100-3-91.csv :   M 100 3 91
../../data/housing_data/data A130-4/data_maison_A130-4-763.csv :   A 130 4 763
../../data/housing_data/data M140-5/data_maison_140-5-330.csv :   M 140 5 330
../../data/housing_data/data M250-5/data_maison_M250-5-11.csv :   M 250 5 11
../../data/housing_data/data M80-2/data_maison_M80-2-292.csv :   M 80 2 292
../../data/housing_data/data A50-3/data_maison_A50-3-630.csv :   A 50 3 630
../../data/housing_data/data M90-4/data_maison_M90-4-791.csv :

Unnamed: 0,timestamp,date,consommation,h_type,h_surface,nb_people,h_ref
0,0:00,12/14/2022,0.7197,A,110,5,370
1,0:30,12/14/2022,0.7353,A,110,5,370
2,1:00,12/14/2022,0.5541,A,110,5,370
3,1:30,12/14/2022,1.0196,A,110,5,370
4,2:00,12/14/2022,0.9321,A,110,5,370
...,...,...,...,...,...,...,...
1346587,21:30,1/1/2020,0.3647,A,50,2,478
1346588,22:00,1/1/2020,0.3388,A,50,2,478
1346589,22:30,1/1/2020,0.4558,A,50,2,478
1346590,23:00,1/1/2020,0.3219,A,50,2,478


In [24]:
# Housing-type codes present after parsing the file names; the parser documents 'A' and 'M' as the valid codes.
df["h_type"].unique()

array(['A', 'M'], dtype=object)

In [25]:
# DataFrame memory footprint in GiB (bytes / 1024**3).
# deep=False does not inspect object columns, so the Python strings they reference are not counted.
df.memory_usage(deep=False).sum() / 1024**3

0.07023036479949951

In [26]:
# Intermediate export of the long-format frame, written before the numeric casts below.
# The last cell of the notebook writes to this same path and replaces this file.
df.to_csv("../../data/dataframes/df2.csv", index=False)

In [27]:
# Inspect column dtypes and non-null counts before any dtype conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1346592 entries, 0 to 1346591
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype 
---  ------        --------------    ----- 
 0   timestamp     1346592 non-null  object
 1   date          1346592 non-null  object
 2   consommation  1346592 non-null  object
 3   h_type        1346592 non-null  object
 4   h_surface     1346592 non-null  object
 5   nb_people     1346592 non-null  object
 6   h_ref         1346592 non-null  object
dtypes: object(7)
memory usage: 71.9+ MB


In [28]:
# Convert the measurement and the numeric attributes parsed from the file names
# to real numeric dtypes in one pass; every other column keeps its dtype.
numeric_dtypes = {
    "consommation": float,
    "h_surface": int,
    "nb_people": int,
}
df = df.astype(numeric_dtypes)

In [29]:
# Confirm that the casts above took effect (consommation, h_surface and nb_people are now numeric).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1346592 entries, 0 to 1346591
Data columns (total 7 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   timestamp     1346592 non-null  object 
 1   date          1346592 non-null  object 
 2   consommation  1346592 non-null  float64
 3   h_type        1346592 non-null  object 
 4   h_surface     1346592 non-null  int64  
 5   nb_people     1346592 non-null  int64  
 6   h_ref         1346592 non-null  object 
dtypes: float64(1), int64(2), object(4)
memory usage: 71.9+ MB


In [30]:
# Memory footprint in GiB with deep=True, which also measures the contents of object columns.
df.memory_usage(deep=True).sum() / 1024**3

0.33767449110746384

In [31]:
# Python type of the first h_ref value: h_ref was not cast above, so it is still a string.
type(df.h_ref[0])

str

In [32]:
# Final export, after the numeric casts; it overwrites the file written by the earlier export cell.
df.to_csv("../../data/dataframes/df2.csv", index = False)