In [1]:
import pandas as pd
import random
import os

In [2]:
# Directory containing subdirectories for each housing
# Each subdirectory contains a csv file 
data_path = "../../data/housing_consomation/"


In [3]:
def extract_housing_info_from_name(housing):
    """
    Extracts the housing information from the csv file
    The info is as follows:
        - The first upper case letter is the type of housing (A : appartement, M : maison)
        - The numbers after the letter represent the surface of the housing
        - The number after the dash represents the number of people living in the housing
    """
    info = housing.split(" ")[1]
    # print(f"Housing {info}")
    h_type = info[0]
    info = info.replace(h_type, "")
    surface, nb_people = info.split("-")
    return h_type, surface, nb_people

In [4]:
list_housings = os.listdir(data_path) # list of all the housings

In [5]:
print(list_housings)

['data A110-5', 'data M170-6', 'data M120-5', 'data A150-6', 'data M135-3', 'data M150-4', 'data M100-3', 'data A130-4', 'data M140-5', 'data M250-5', 'data M80-2', 'data A50-3', 'data M90-4', 'data M65-3', 'data A100-3', 'data A120-4', 'data A30-2', 'data M200-6', 'data M85-3', 'data M160-5', 'data M50-2', 'data A25-1', 'data M110-4', 'data M180-5', 'data A15-1', 'data A50-2']


In [6]:
df = pd.DataFrame()



for housing in list_housings:
    houses = os.listdir(data_path + housing)
    random_indexes = [random.randint(0, len(houses)) for i in range(10)]
    # print(random_indexes)
    random_houses = []
    for i in random_indexes:
        random_houses.append(houses[i])
    # print(random_houses)
    
    for house in random_houses:
        h_type, h_surface, nb_people = extract_housing_info_from_name(housing)
        print(data_path + housing + "/" + house)
        temp = pd.read_csv(data_path + housing + "/" + house, header=[1],sep=',')
        #rename the columns Unnamed: 0 to timestamp
        temp.rename(columns={'Unnamed: 0':'date'}, inplace=True)
        #drop the column Unnamed: 1 (consommation totale)
        temp.drop('Unnamed: 1', axis=1, inplace=True)

        #Transpose
        temp = temp.T

        #make the first row as header
        temp['tmp_columns'] = temp.index
        temp = temp.reset_index(drop=True)
        temp.columns = temp.iloc[0]
        temp = temp.drop(temp.index[0])
        #make the column original_columns in first column
        cols = temp.columns.tolist()
        cols = cols[-1:] + cols[:-1]
        temp = temp[cols]
        #rename the column date to timestamp
        temp.rename(columns={'date':'timestamp'}, inplace=True)

        temp = pd.melt(temp, id_vars=['timestamp'], var_name='date', value_name='consommation')

        temp["day"] = temp["date"].apply(lambda x: x.split("/")[1])
        temp["month"] = temp["date"].apply(lambda x: x.split("/")[0])
        temp["h_type"] = h_type
        temp["h_surface"] = h_surface
        temp["nb_people"] = nb_people
        temp.drop("date", axis=1, inplace=True)
        
        df = pd.concat([df, temp], axis=0)
    
df.reset_index(drop=True, inplace=True) 
df

../../data/housing_consomation/data A110-5/data_maison_A110-5-104.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-292.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-195.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-341.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-134.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-63.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-400.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-114.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-34.csv
../../data/housing_consomation/data A110-5/data_maison_A110-5-378.csv
../../data/housing_consomation/data M170-6/data_maison_M170-6-263.csv
../../data/housing_consomation/data M170-6/data_maison_M170-6-216.csv
../../data/housing_consomation/data M170-6/data_maison_M170-6-257.csv
../../data/housing_consomation/data M170-6/data_maison_M170-6-218.csv
../../data/housing_con

Unnamed: 0,timestamp,consommation,day,month,h_type,h_surface,nb_people
0,0:00,0.9074,14,12,A,110,5
1,0:30,1.0415,14,12,A,110,5
2,1:00,0.5913,14,12,A,110,5
3,1:30,1.2522,14,12,A,110,5
4,2:00,1.5268,14,12,A,110,5
...,...,...,...,...,...,...,...
13465915,21:30,0.4858,1,1,A,50,2
13465916,22:00,0.5008,1,1,A,50,2
13465917,22:30,0.5144,1,1,A,50,2
13465918,23:00,0.4206,1,1,A,50,2


In [7]:
# calculate df memory usage in GB
df.memory_usage(deep=True).sum() / 1024**3

4.842607274651527

In [9]:
df.to_csv(f"../../data/dataframes/df.csv", index=False)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13465920 entries, 0 to 13465919
Data columns (total 7 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   timestamp     object
 1   consommation  object
 2   day           object
 3   month         object
 4   h_type        object
 5   h_surface     object
 6   nb_people     object
dtypes: object(7)
memory usage: 719.2+ MB


In [11]:
df["consommation"] = df["consommation"].astype(float)
df["day"]       = df["day"].astype(int)
df["month"]     = df["month"].astype(int)
df["h_surface"] = df["h_surface"].astype(int)
df["nb_people"] = df["nb_people"].astype(int)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13465920 entries, 0 to 13465919
Data columns (total 7 columns):
 #   Column        Dtype  
---  ------        -----  
 0   timestamp     object 
 1   consommation  float64
 2   day           int64  
 3   month         int64  
 4   h_type        object 
 5   h_surface     int64  
 6   nb_people     int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 719.2+ MB


In [14]:
df.memory_usage(deep=True).sum() / 1024**3

2.0013530999422073

In [16]:
type(df.h_type[0])

str

In [None]:
df.to_csv("../../data/dataframes/df.csv", index = False)