In [7]:
import pandas as pd

# Read the raw case counts and the weather observations.
covid_data = pd.read_csv('dataset/modelling_data/cases/covid-case-counts.csv', low_memory=False)
weather_data = pd.read_csv('dataset/modelling_data/niwa/weather_data.csv', low_memory=False)

# Parse the date columns so that they can be compared and joined on.
covid_data['Report Date'] = pd.to_datetime(covid_data['Report Date'])
weather_data['Date'] = pd.to_datetime(weather_data['Date'])

# The common window is bounded by the later of the two first dates and the
# earlier of the two last dates.
start_date = max(covid_data['Report Date'].min(), weather_data['Date'].min())
end_date = min(covid_data['Report Date'].max(), weather_data['Date'].max())

# Keep only rows inside the common window (both bounds inclusive).
covid_data = covid_data.loc[covid_data['Report Date'].between(start_date, end_date)]
weather_data = weather_data.loc[weather_data['Date'].between(start_date, end_date)]

# Join on the date, then discard the weather-side date column, which
# duplicates 'Report Date' after the join.
merged_data = covid_data.merge(weather_data, left_on='Report Date', right_on='Date')
merged_data = merged_data.drop(columns='Date')

merged_data.to_csv('dataset/preprocessed/preprocessed_data.csv', index=False)


In [7]:
import pandas as pd
from sklearn import preprocessing

# Define the data types for each column to save memory
dtypes = {
    'Report Date': 'str',
    'Number of cases reported': 'float32',
    'WSpd.m.s.': 'float32',
    'WindRun.Km.': 'float32',
    'Tmax.C.': 'float32',
    'Tmin.C.': 'float32'
}

# Grouping key, the columns averaged per date, and (derived) the columns
# summed per date. value_cols fixes the column order of the output file.
date_col = 'Report Date'
mean_cols = ['WSpd.m.s.', 'WindRun.Km.', 'Tmax.C.', 'Tmin.C.']
value_cols = [col for col in dtypes if col != date_col]
sum_cols = [col for col in value_cols if col not in mean_cols]

# Load the data in chunks, keeping only per-date partial aggregates.
# Sums and non-null counts can be combined exactly across chunks, whereas
# per-chunk means cannot; the means are therefore derived after the loop.
chunksize = 10 ** 6
chunks = []

for chunk in pd.read_csv('dataset/preprocessed/preprocessed_data.csv', dtype=dtypes, usecols=list(dtypes.keys()), chunksize=chunksize):
    grouped = chunk.groupby(date_col)
    partial = grouped[value_cols].sum()
    partial = partial.join(grouped[mean_cols].count().add_suffix('__n'))
    chunks.append(partial)

# Combine the partial aggregates so that a date spread over several chunks
# ends up as a single row.
totals = pd.concat(chunks).groupby(level=0).sum()

df = totals[value_cols].copy()
for col in mean_cols:
    # sum / non-null count == mean; 0 / 0 yields NaN when a date has no
    # observation at all, which is filled below.
    df[col] = totals[col] / totals[col + '__n']
df = df.astype('float32')

# Handle missing values (if any). Only numeric columns are present here, so
# the column means are well defined.
df = df.fillna(df.mean())

# Normalize the data once over the whole series; fitting a scaler per chunk
# would scale each chunk against its own min/max.
scaler = preprocessing.MinMaxScaler()
df[value_cols] = scaler.fit_transform(df[value_cols])

# Save the processed data. The date lives in the index, which is not written.
df.to_csv('dataset/processed_data.csv', index=False, header=False)


In [25]:
import pandas as pd

# Load the data
covid_data = pd.read_csv('dataset/modelling_data/cases/covid-case-counts.csv', low_memory=False)
weather_data = pd.read_csv('dataset/modelling_data/niwa/weather_data.csv', low_memory=False)
crime_data = pd.read_csv('dataset/modelling_data/nz_crime/nz_crime.csv', low_memory=False)

# Convert the 'Report Date', 'Date', and 'Year Month' columns to datetime
covid_data['Report Date'] = pd.to_datetime(covid_data['Report Date'])
weather_data['Date'] = pd.to_datetime(weather_data['Date'])
crime_data['Year Month'] = pd.to_datetime(crime_data['Year Month'])

# Resample the covid data (sum) and the weather data (mean) to a monthly
# frequency.
# - Only numeric columns are aggregated: summing or averaging text columns is
#   meaningless and raises on recent pandas versions.
# - 'MS' labels each bin with the first day of its month, so no manual shift
#   of month-end labels is needed to line up with 'Year Month'.
covid_data = (
    covid_data.set_index('Report Date')
    .select_dtypes(include='number')
    .resample('MS')
    .sum()
)
weather_data = (
    weather_data.set_index('Date')
    .select_dtypes(include='number')
    .resample('MS')
    .mean()
)

# Restrict all three datasets to the months they have in common.
start_date = max(covid_data.index.min(), weather_data.index.min(), crime_data['Year Month'].min())
end_date = min(covid_data.index.max(), weather_data.index.max(), crime_data['Year Month'].max())

covid_data = covid_data[(covid_data.index >= start_date) & (covid_data.index <= end_date)]
weather_data = weather_data[(weather_data.index >= start_date) & (weather_data.index <= end_date)]
crime_data = crime_data[(crime_data['Year Month'] >= start_date) & (crime_data['Year Month'] <= end_date)]

# Merge the datasets on the date: covid and weather share a month-start
# index, and the crime rows are matched through their 'Year Month' column.
merged_data = pd.merge(covid_data, weather_data, left_index=True, right_index=True)
merged_data = pd.merge(merged_data, crime_data, left_index=True, right_on='Year Month')

# 'Year Month' is kept on purpose: it is the only date column that survives
# writing with index=False.
# Preprocess the data for StemGNN
# This step will depend on the specific requirements of the StemGNN model

merged_data.to_csv('dataset/preprocessed/preprocessed_data1.csv', index=False)




In [28]:
import pandas as pd
from sklearn import preprocessing

# Define the data types for each column to save memory
dtypes = {
    'Year Month': 'str',
    'Number of cases reported': 'float32',
    'WSpd.m.s.': 'float32',
    'WindRun.Km.': 'float32',
    'Tmax.C.': 'float32',
    'Tmin.C.': 'float32',
    'vict_sum': 'float32',
    'vict_cnt': 'float32'
}

# Grouping key, the columns averaged per month, and (derived) the columns
# summed per month. value_cols fixes the column order of the output file.
date_col = 'Year Month'
mean_cols = ['WSpd.m.s.', 'WindRun.Km.', 'Tmax.C.', 'Tmin.C.']
value_cols = [col for col in dtypes if col != date_col]
sum_cols = [col for col in value_cols if col not in mean_cols]

# Load the data in chunks, keeping only per-month partial aggregates.
# Sums and non-null counts can be combined exactly across chunks, whereas
# per-chunk means cannot; the means are therefore derived after the loop.
chunksize = 10 ** 6
chunks = []

for chunk in pd.read_csv('dataset/preprocessed/preprocessed_data1.csv', dtype=dtypes, usecols=list(dtypes.keys()), chunksize=chunksize):
    # Group by 'Year Month'
    grouped = chunk.groupby(date_col)
    partial = grouped[value_cols].sum()
    partial = partial.join(grouped[mean_cols].count().add_suffix('__n'))
    chunks.append(partial)

# Combine the partial aggregates so that a month spread over several chunks
# ends up as a single row.
totals = pd.concat(chunks).groupby(level=0).sum()

df = totals[value_cols].copy()
for col in mean_cols:
    # sum / non-null count == mean; 0 / 0 yields NaN when a month has no
    # observation at all, which is filled below.
    df[col] = totals[col] / totals[col + '__n']
df = df.astype('float32')

# Handle missing values (if any). Only numeric columns are present here, so
# the column means are well defined.
df = df.fillna(df.mean())

# Normalize the data once over the whole series; fitting a scaler per chunk
# would scale each chunk against its own min/max.
scaler = preprocessing.MinMaxScaler()
df[value_cols] = scaler.fit_transform(df[value_cols])

# Save the processed data. The date lives in the index, which is not written.
df.to_csv('dataset/processed_data1.csv', index=False, header=False)

In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib

def load_and_resample(path, date_column, columns, start_date=None, end_date=None, frequency='D'):
    """Load a CSV and return the chosen columns summed onto a regular date grid.

    Args:
        path: CSV file to read.
        date_column: Name of the column holding the dates; it becomes the index.
        columns: List of value columns to keep.
        start_date, end_date: Optional bounds. Only when both are truthy is the
            result re-indexed onto ``pd.date_range(start_date, end_date)``;
            grid points without data are filled with 0.
        frequency: pandas frequency string used for the resampling and for the
            optional re-indexing grid.

    Returns:
        DataFrame indexed by date with one column per entry of ``columns``.
    """
    raw = pd.read_csv(path, low_memory=False)
    raw[date_column] = pd.to_datetime(raw[date_column])

    # Collapse rows sharing a date; the group key becomes the (sorted) index.
    per_date = raw[columns + [date_column]].groupby(date_column).sum()

    # Sum onto the requested frequency; empty bins sum to 0.
    resampled = per_date.resample(frequency).sum()

    if not (start_date and end_date):
        return resampled

    # Trim or extend to exactly the requested range.
    target_index = pd.date_range(start=start_date, end=end_date, freq=frequency)
    return resampled.reindex(target_index, fill_value=0)

def normalize_data(df, path_to_save_scaler):
    """Min-max scale every column of ``df`` independently and persist the scalers.

    One ``MinMaxScaler`` is fitted per column (in column order) and the list of
    fitted scalers is written to ``path_to_save_scaler`` with ``joblib.dump``
    so the scaling can be inverted later.

    The scaled values are returned in a new DataFrame; ``df`` itself is left
    untouched. Writing the float results back into ``df`` through ``iloc``
    would try to store floats in columns that may hold integers, which recent
    pandas versions warn about.

    Args:
        df: DataFrame whose columns are all numeric.
        path_to_save_scaler: Destination file for the list of fitted scalers.

    Returns:
        DataFrame with the same index and column labels as ``df`` holding the
        scaled values.
    """
    scalers = []
    scaled_columns = []
    for i in range(df.shape[1]):
        scaler = MinMaxScaler()
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D for use as a column.
        column_values = df.iloc[:, i].values.reshape(-1, 1)
        scaled_columns.append(scaler.fit_transform(column_values).ravel())
        scalers.append(scaler)  # store the fitted scaler for later use

    # Save the scalers
    joblib.dump(scalers, path_to_save_scaler)

    # Build positionally first so duplicated column labels cannot collide,
    # then restore the original labels.
    normalized = pd.DataFrame(dict(enumerate(scaled_columns)), index=df.index)
    normalized.columns = df.columns
    return normalized

def preprocess_data(paths, date_columns, columns_list, output_path):
    """Load, align, normalize and merge several datasets into one CSV.

    The first dataset defines the date range. Every dataset (the first one
    included) is then loaded onto that range, normalized with its scalers
    saved to ``scaler_<position>.pkl``, and all results are concatenated
    column-wise.

    Args:
        paths: CSV file paths; the first entry defines the date range.
        date_columns: Date column name for each path.
        columns_list: List of value columns to keep for each path.
        output_path: Destination CSV, written without index or header.

    Returns:
        The merged DataFrame with a fresh integer index.
    """
    # The first dataset is loaded once without bounds just to learn its range.
    reference = load_and_resample(paths[0], date_columns[0], columns_list[0])
    first_day, last_day = reference.index.min(), reference.index.max()

    # Load each dataset onto the reference range and normalize it.
    frames = [
        normalize_data(
            load_and_resample(source, date_col, value_cols, first_day, last_day),
            f'scaler_{position}.pkl',
        )
        for position, (source, date_col, value_cols)
        in enumerate(zip(paths, date_columns, columns_list))
    ]

    # Place the datasets side by side.
    combined = pd.concat(frames, axis=1)

    # Drop the date index and write the bare values.
    combined.reset_index(drop=True, inplace=True)
    combined.to_csv(output_path, index=False, header=False)

    return combined

# Input files, the date column of each file, and the value columns to keep
# from each file. The three lists are matched by position.
paths = [
    'dataset/modelling_data/cases/covid-case-counts.csv',
    'dataset/modelling_data/niwa/weather_data.csv',
    'dataset/modelling_data/nz_crime/nz_crime.csv',
    'dataset/modelling_data/cases/weekly-hospitalisations-for-covid.csv',
    'dataset/modelling_data/statsnz/regional/CPACT15_electricity.csv',
    'dataset/modelling_data/statsnz/regional/CPEMP8_jobs.csv',
]
date_columns = [
    'Report Date',
    'Date',
    'Year Month',
    'Admissions for COVID-19 in the week ending',
    'Period',
    'Period',
]
columns_list = [
    ['Number of cases reported'],
    ['WSpd.m.s.', 'WindRun.Km.', 'Tmax.C.', 'Tmin.C.'],
    ['vict_sum', 'vict_cnt'],
    ['Hospitalisations'],
    ['Value'],
    ['Value'],
]
output_path = 'dataset/all_preprocessed_data.csv'

# Run the pipeline: the merged frame is returned and is also written to
# output_path by preprocess_data.
merged_data = preprocess_data(paths, date_columns, columns_list, output_path)


In [None]:
# Input CSV files, one per dataset.
paths = [
    'dataset/modelling_data/cases/covid-case-counts.csv',
    'dataset/modelling_data/niwa/weather_data.csv',
    'dataset/modelling_data/nz_crime/nz_crime.csv',
    'dataset/modelling_data/cases/weekly-hospitalisations-for-covid.csv',
    'dataset/modelling_data/statsnz/regional/CPACT15_electricity.csv',
    'dataset/modelling_data/statsnz/regional/CPEMP8_jobs.csv',
]

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Load the plot-ready case counts and list the column names they contain.
data = pd.read_csv('dataset/covid-case-counts-processed-plot.csv', low_memory=False)
head_names = data.columns
print(head_names)

Index(['Report Date', 'At the border', 'Auckland', 'Bay of Plenty',
       'Canterbury/West Coast', 'Capital & Coast/Hutt', 'Counties Manukau',
       'Hawke's Bay', 'Lakes', 'MidCentral', 'Nelson Marlborough', 'Northland',
       'South Canterbury', 'Southern', 'Tairawhiti', 'Taranaki', 'Unknown',
       'Waikato', 'Wairarapa', 'Waitemata', 'Whanganui'],
      dtype='object')


In [7]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib

def load_and_resample(path, date_column, columns, start_date=None, end_date=None, frequency='D'):
    """Load a CSV and return the chosen columns summed per distinct date.

    Unlike a true resample, no regular grid is built from the data itself:
    rows are only grouped by their date. A regular grid is applied solely when
    both bounds are given.

    Args:
        path: CSV file to read.
        date_column: Name of the column holding the dates; it becomes the index.
        columns: List of value columns to keep.
        start_date, end_date: Optional bounds. Only when both are truthy is the
            result re-indexed onto ``pd.date_range(start_date, end_date)``;
            grid points without data are filled with 0.
        frequency: pandas frequency string for the optional re-indexing grid.

    Returns:
        DataFrame indexed by date with one column per entry of ``columns``.
    """
    table = pd.read_csv(path, low_memory=False)
    table[date_column] = pd.to_datetime(table[date_column])

    # Sum rows sharing a date (the group key becomes the index), then replace
    # any remaining missing values with 0.
    totals = table[columns + [date_column]].groupby(date_column).sum().fillna(0)

    if start_date and end_date:
        # Trim or extend to exactly the requested range.
        span = pd.date_range(start=start_date, end=end_date, freq=frequency)
        totals = totals.reindex(span, fill_value=0)
    return totals

def normalize_data(df, path_to_save_scaler):
    """Min-max scale every column of ``df`` independently and persist the scalers.

    One ``MinMaxScaler`` is fitted per column (in column order) and the list of
    fitted scalers is written to ``path_to_save_scaler`` with ``joblib.dump``
    so the scaling can be inverted later.

    The scaled values are returned in a new DataFrame; ``df`` itself is left
    untouched. Writing the float results back into ``df`` through ``iloc``
    would try to store floats in columns that may hold integers, which recent
    pandas versions warn about.

    Args:
        df: DataFrame whose columns are all numeric.
        path_to_save_scaler: Destination file for the list of fitted scalers.

    Returns:
        DataFrame with the same index and column labels as ``df`` holding the
        scaled values.
    """
    scalers = []
    scaled_columns = []
    for i in range(df.shape[1]):
        scaler = MinMaxScaler()
        # The scaler expects a 2-D (n_samples, 1) array; flatten the result
        # back to 1-D for use as a column.
        column_values = df.iloc[:, i].values.reshape(-1, 1)
        scaled_columns.append(scaler.fit_transform(column_values).ravel())
        scalers.append(scaler)  # store the fitted scaler for later use

    # Save the scalers
    joblib.dump(scalers, path_to_save_scaler)

    # Build positionally first so duplicated column labels cannot collide,
    # then restore the original labels.
    normalized = pd.DataFrame(dict(enumerate(scaled_columns)), index=df.index)
    normalized.columns = df.columns
    return normalized

def preprocess_data(paths, date_columns, columns_list, output_path):
    """Load, align, normalize and merge several datasets into one CSV.

    The first dataset is loaded as-is and defines the date range. Every
    further dataset is loaded onto that range. Each dataset is normalized with
    its scalers saved to ``1scaler_<position>.pkl``, and the results are
    concatenated column-wise. Values missing after the concatenation are
    replaced with 0.

    All datasets listed in ``paths`` are processed, rather than only the first
    two; for exactly two inputs the result is unchanged.

    Args:
        paths: CSV file paths; the first entry defines the date range.
        date_columns: Date column name for each path.
        columns_list: List of value columns to keep for each path.
        output_path: Destination CSV, written without index or header.

    Returns:
        The merged DataFrame with a fresh integer index.
    """
    # Load the first dataset and find its date range
    covid_data = load_and_resample(paths[0], date_columns[0], columns_list[0])
    covid_data = normalize_data(covid_data, '1scaler_0.pkl')
    start_date = covid_data.index.min()
    end_date = covid_data.index.max()

    # Preprocess all other datasets onto the range of the first one
    preprocessed_data = [covid_data]
    remaining = zip(paths[1:], date_columns[1:], columns_list[1:])
    for i, (path, date_column, columns) in enumerate(remaining, start=1):
        df = load_and_resample(path, date_column, columns, start_date, end_date)
        df = normalize_data(df, f'1scaler_{i}.pkl')  # normalize data
        preprocessed_data.append(df)

    # Concatenate all dataframes along the columns axis
    merged_data = pd.concat(preprocessed_data, axis=1)

    # Drop the date index and fill dates missing from any dataset with 0
    merged_data = merged_data.reset_index(drop=True).fillna(0)

    # Save merged data to CSV
    merged_data.to_csv(output_path, index=False, header=False)

    return merged_data

# Input files, the date column of each file, and the value columns to keep
# from each file. The three lists are matched by position.
paths = [
    'dataset/modelling_data/cases/covid-case-counts.csv',
    'dataset/modelling_data/niwa/weather_data.csv',
]
date_columns = [
    'Report Date',
    'Date',
]
columns_list = [
    ['Number of cases reported'],
    ['WSpd.m.s.', 'WindRun.Km.', 'Tmax.C.', 'Tmin.C.'],
]
output_path = 'dataset/preprocessed_data.csv'

# Run the pipeline: the merged frame is returned and is also written to
# output_path by preprocess_data.
merged_data = preprocess_data(paths, date_columns, columns_list, output_path)