In [1]:
import numpy as np
import pandas as pd

# Load the Braga weather CSV, parsing the three timestamp columns as datetimes.
filepath = 'data/Weather_Braga_WithRain_Until_20190228.csv'
datetime_columns = ['sunrise', 'sunset', 'creation_date']

df = pd.read_csv(filepath, parse_dates=datetime_columns)

In [2]:
# Baseline: column dtypes, non-null counts and memory usage as loaded from the CSV.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4388 entries, 0 to 4387
Data columns (total 9 columns):
city_name        4388 non-null object
cloudiness       3605 non-null object
atmosphere       454 non-null object
snow             0 non-null float64
thunderstorm     15 non-null object
rain             322 non-null object
sunrise          4388 non-null datetime64[ns]
sunset           4388 non-null datetime64[ns]
creation_date    4388 non-null datetime64[ns]
dtypes: datetime64[ns](3), float64(1), object(5)
memory usage: 308.6+ KB


In [3]:
def typecast_objects(gl_obj, max_unique_ratio=0.5):
    """Normalise text columns and store low-cardinality ones as ``category``.

    Each column is stripped of surrounding whitespace and lower-cased through
    the ``.str`` accessor. A column is then converted to ``category`` when the
    ratio of distinct values to rows is strictly below ``max_unique_ratio``
    (missing values count as one distinct value); otherwise the normalised
    text column is kept as is.

    Parameters
    ----------
    gl_obj : pandas.DataFrame
        Frame whose columns all support the ``.str`` accessor, e.g. the result
        of ``df.select_dtypes(include=['object'])`` on text data.
    max_unique_ratio : float, default 0.5
        Upper bound (exclusive) on ``n_unique / n_rows`` for a column to be
        converted to ``category``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``gl_obj``. The input
        frame is not modified.
    """
    num_total_values = len(gl_obj)
    converted = {}

    for col in gl_obj.columns:
        values = gl_obj[col].str.strip().str.lower()

        # dropna=False counts NaN as a distinct value, matching len(unique()).
        num_unique_values = values.nunique(dropna=False)

        # A frame with zero rows has no meaningful ratio; skip the division
        # instead of raising ZeroDivisionError and leave the column as text.
        is_low_cardinality = (
            num_total_values > 0
            and num_unique_values / num_total_values < max_unique_ratio
        )
        converted[col] = values.astype('category') if is_low_cardinality else values

    # Build the result in one go, anchored on the input index, rather than
    # enlarging an index-less empty DataFrame column by column.
    return pd.DataFrame(converted, index=gl_obj.index)


def downcast(df):
    """Reduce a DataFrame's memory footprint by narrowing column dtypes.

    * Integer columns are downcast with ``pd.to_numeric``: to the smallest
      unsigned type when they hold no negative values, otherwise to the
      smallest signed type.
    * ``float`` columns are downcast to the smallest float type.
    * ``object`` columns are passed through ``typecast_objects`` (text
      normalisation plus optional conversion to ``category``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience so that
        ``df = downcast(df)`` works.
    """
    # 'integer' matches every NumPy integer width; plain 'int' only matches
    # NumPy's default integer type, so narrower/wider columns could be skipped.
    int_columns = df.select_dtypes(include=['integer']).columns
    for col in int_columns:
        # downcast='unsigned' leaves a column with negatives untouched, so
        # fall back to the signed target to still shrink those columns.
        target = 'integer' if (df[col] < 0).any() else 'unsigned'
        df[col] = pd.to_numeric(df[col], downcast=target)

    float_columns = df.select_dtypes(include=['float']).columns
    for col in float_columns:
        df[col] = pd.to_numeric(df[col], downcast='float')

    df_obj = df.select_dtypes(include=['object'])
    # Nothing to normalise when there are no object columns (e.g. when the
    # function is run a second time on an already converted frame).
    if len(df_obj.columns) > 0:
        converted_obj = typecast_objects(df_obj)
        for col in converted_obj.columns:
            df[col] = converted_obj[col]

    return df


def get_dtypes(df):
    """Return a ``{column label: dtype name}`` dict for ``df``.

    The dtype name is the ``.name`` attribute of each column's dtype
    (for example ``'float32'`` or ``'category'``).
    """
    return {column: dtype.name for column, dtype in df.dtypes.items()}

In [4]:
# Narrow dtypes to save memory. Note: downcast() assigns the converted columns
# onto the frame it receives and returns that same frame, so `df` is mutated here.
df = downcast(df)

In [5]:
# Re-inspect dtypes and memory usage after downcasting, to compare with the baseline.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4388 entries, 0 to 4387
Data columns (total 9 columns):
city_name        4388 non-null category
cloudiness       3605 non-null category
atmosphere       454 non-null category
snow             0 non-null float32
thunderstorm     15 non-null category
rain             322 non-null category
sunrise          4388 non-null datetime64[ns]
sunset           4388 non-null datetime64[ns]
creation_date    4388 non-null datetime64[ns]
dtypes: category(5), datetime64[ns](3), float32(1)
memory usage: 142.5 KB


In [6]:
# List the column labels of the converted frame.
df.columns

Index(['city_name', 'cloudiness', 'atmosphere', 'snow', 'thunderstorm', 'rain',
       'sunrise', 'sunset', 'creation_date'],
      dtype='object')

In [7]:
# Column label -> dtype name mapping for the converted frame.
get_dtypes(df)

{'city_name': 'category',
 'cloudiness': 'category',
 'atmosphere': 'category',
 'snow': 'float32',
 'thunderstorm': 'category',
 'rain': 'category',
 'sunrise': 'datetime64[ns]',
 'sunset': 'datetime64[ns]',
 'creation_date': 'datetime64[ns]'}

In [11]:
# Summary (count, unique, top, freq) restricted to the categorical columns.
df.describe(include=['category'])

Unnamed: 0,city_name,cloudiness,atmosphere,thunderstorm,rain
count,4388,3605,454,15,322
unique,1,7,3,3,8
top,braga,céu claro,névoa,trovoada com chuva leve,chuva fraca
freq,4388,2003,266,7,127
