In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Relative path of the input CSV that is read below.
filepath = 'data/Weather_Braga_Until_20190228.csv'

# Columns to load; passed to read_csv via `usecols`, so any other column in
# the file is skipped.
cols = ['weather_description', 'temperature', 'atmospheric_pressure',
        'humidity', 'wind_speed', 'cloudiness', 'rain', 'current_luminosity',
        'sunrise', 'sunset', 'creation_date']

# Compact dtype intended for each loaded column.
# NOTE(review): this mapping is NOT passed to read_csv in this cell, so it has
# no effect on `df` here -- the recorded df.info() output still reports
# int64/object columns. Presumably the datetime64[ns] entries could not go
# through `dtype=` anyway (they are handled by `parse_dates` below), and the
# unsigned types assume no negative readings -- TODO confirm, then either
# apply the non-datetime entries or drop the mapping.
dtypes = {'weather_description': 'category',
 'temperature': 'uint8',
 'atmospheric_pressure': 'uint16',
 'humidity': 'uint8',
 'wind_speed': 'uint8',
 'cloudiness': 'uint8',
 'rain': 'uint8',
 'current_luminosity': 'category',
 'sunrise': 'datetime64[ns]',
 'sunset': 'datetime64[ns]',
 'creation_date': 'datetime64[ns]'}

# Load only the selected columns and parse the three timestamp columns as
# datetimes while reading.
df = pd.read_csv(filepath, usecols=cols, parse_dates=['sunrise', 'sunset', 'creation_date'])

# Print per-column dtypes, non-null counts and the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4388 entries, 0 to 4387
Data columns (total 11 columns):
weather_description     4388 non-null object
temperature             4388 non-null int64
atmospheric_pressure    4388 non-null int64
humidity                4388 non-null int64
wind_speed              4388 non-null int64
cloudiness              4388 non-null int64
rain                    4388 non-null int64
current_luminosity      4388 non-null object
sunrise                 4388 non-null datetime64[ns]
sunset                  4388 non-null datetime64[ns]
creation_date           4388 non-null datetime64[ns]
dtypes: datetime64[ns](3), int64(6), object(2)
memory usage: 377.2+ KB


In [2]:
def typecast_objects(gl_obj, max_unique_ratio=0.5):
    """Normalise string columns and make low-cardinality ones categorical.

    Each column is whitespace-stripped and lower-cased through the pandas
    ``.str`` accessor, so every column of ``gl_obj`` is expected to hold
    strings (pandas rejects ``.str`` on non-string columns).  A column whose
    ratio of distinct values to total rows is strictly below
    ``max_unique_ratio`` is converted to the ``category`` dtype; the other
    columns are returned cleaned but otherwise untouched.

    Parameters
    ----------
    gl_obj : pandas.DataFrame
        Frame containing only string columns.
    max_unique_ratio : float, default 0.5
        Exclusive upper bound on ``unique / total`` for a column to be turned
        into a categorical.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column names and order as ``gl_obj``.
        ``gl_obj`` itself is not modified.
    """
    converted = {}

    for col in gl_obj.columns:
        cleaned = gl_obj[col].str.strip().str.lower()

        num_total_values = len(cleaned)
        # dropna=False counts NaN as a distinct value, exactly like
        # len(Series.unique()) does.
        num_unique_values = cleaned.nunique(dropna=False)

        # A zero-row column has no meaningful ratio; skipping it avoids a
        # ZeroDivisionError and leaves the (empty) column as it is.
        if num_total_values and num_unique_values / num_total_values < max_unique_ratio:
            cleaned = cleaned.astype('category')

        converted[col] = cleaned

    # Build the result once instead of enlarging an empty frame column by
    # column.
    return pd.DataFrame(converted)


def downcast(df):
    """Shrink the memory footprint of ``df`` by converting its columns.

    Columns selected by ``select_dtypes(include=['int'])`` are passed through
    ``pd.to_numeric(..., downcast='unsigned')``; columns selected by
    ``select_dtypes(include=['object'])`` are handed to ``typecast_objects``.
    The converted columns are written back into ``df`` itself, so the
    caller's frame is modified in place and the same object is returned.
    """
    int_part = df.select_dtypes(include=['int'])
    obj_part = df.select_dtypes(include=['object'])

    shrunk_ints = int_part.apply(pd.to_numeric, downcast='unsigned')
    shrunk_objs = typecast_objects(obj_part)

    # Overwrite the original columns, integers first and objects second.
    for converted in (shrunk_ints, shrunk_objs):
        df[converted.columns] = converted

    return df


def get_dtypes(df):
    """Return a ``{column name: dtype name}`` dict describing ``df``.

    The dtype name is the ``.name`` attribute of each column's dtype
    (for example ``'uint8'`` or ``'category'``).
    """
    return {column: dtype.name for column, dtype in df.dtypes.items()}

In [3]:
# Order the rows by creation_date and renumber them from 0.
df = df.sort_values(by=['creation_date']).reset_index(drop=True)

In [4]:
# Derive a string column with creation_date truncated to the hour
# (format 'YYYY-MM-DD HH'); despite its name, 'date' is text, not a datetime.
df['date'] = df['creation_date'].dt.strftime('%Y-%m-%d %H')

In [5]:
# Display the first rows of the sorted frame, including the derived 'date' column.
df.head()

Unnamed: 0,weather_description,temperature,atmospheric_pressure,humidity,wind_speed,cloudiness,rain,current_luminosity,sunrise,sunset,creation_date,date
0,algumas nuvens,21,1016,77,5,20,0,LIGHT,2018-07-24 05:21:06,2018-07-24 19:58:50,2018-07-24 14:58:52,2018-07-24 14
1,algumas nuvens,22,1016,73,5,20,0,LIGHT,2018-07-24 05:21:08,2018-07-24 19:58:48,2018-07-24 15:46:28,2018-07-24 15
2,algumas nuvens,20,1016,82,4,20,0,LIGHT,2018-07-24 05:21:10,2018-07-24 19:58:46,2018-07-24 16:47:04,2018-07-24 16
3,algumas nuvens,20,1016,82,4,20,0,LIGHT,2018-07-24 05:21:13,2018-07-24 19:58:44,2018-07-24 17:47:04,2018-07-24 17
4,névoa,18,1016,93,3,75,0,LIGHT,2018-07-24 05:21:15,2018-07-24 19:58:41,2018-07-24 18:47:04,2018-07-24 18
