In [18]:
import numpy as np
import pandas as pd
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore

In [19]:
# Replace missing numerical values with the column median and add new columns for mean temperature, wind, rainfall, humidity, and pressure
def replace_numerical(df):
    """Impute the numeric weather columns and add averaged summary columns.

    Missing values in each column listed in ``numerical_cols`` are replaced
    by that column's median. The columns ``AvgTemp``, ``AvgWind``,
    ``AvgRainfall``, ``AvgHumidity`` and ``AvgPressure`` are then added.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in ``numerical_cols`` plus ``RainToday``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after imputation and with the new columns.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    numerical_cols = ['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
    for col in numerical_cols:
        df[col] = df[col].fillna(df[col].median())
    df['AvgTemp'] = (df['MinTemp'] + df['MaxTemp']) / 2
    df['AvgWind'] = (df['WindGustSpeed'] + df['WindSpeed9am'] + df['WindSpeed3pm']) / 3
    # RainToday is presumably a Yes/No text column (TODO confirm against the
    # CSV). Averaging it together with Rainfall only warned on older pandas
    # (the text column was silently skipped) and raises TypeError on
    # pandas >= 2.0, so the mean is taken over the numeric columns only.
    rain_cols = df[['Rainfall', 'RainToday']].select_dtypes(include='number')
    df['AvgRainfall'] = rain_cols.mean(axis=1)
    df['AvgHumidity'] = (df['Humidity9am'] + df['Humidity3pm']) / 2
    df['AvgPressure'] = (df['Pressure9am'] + df['Pressure3pm']) / 2
    return df


# Replace missing object values by forward-filling the previous valid value
def replace_object(df):
    """Forward-fill missing values in every object-dtype column.

    Each missing cell takes the value of the closest non-missing cell above
    it in the same column. Missing cells at the very top of a column have
    nothing to copy from and stay missing. Columns of other dtypes are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with its object columns forward-filled.
    """
    for col in df.select_dtypes('object'):
        # Series.ffill() is the supported spelling; fillna(method='ffill')
        # is deprecated in recent pandas releases.
        df[col] = df[col].ffill()
    return df

# Add surrogate key
def add_id(df):
    """Insert a 1-based ``id`` column as the first column of ``df``.

    The frame is modified in place and returned. ``DataFrame.insert`` raises
    ``ValueError`` if an ``id`` column is already present.
    """
    row_count = len(df)
    surrogate_keys = range(1, row_count + 1)
    df.insert(loc=0, column='id', value=surrogate_keys)
    return df

Extraction: extracting data from a CSV file named "weatherAUS.csv"

In [20]:
# Read the CSV file and work on a copy, so `data` keeps the frame exactly as it was read
data = pd.read_csv(filepath_or_buffer='weatherAUS.csv')
df = data.copy(deep=True)
print(df)

              Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  \
0       2008-12-01   Albury     13.4     22.9       0.6          NaN   
1       2008-12-02   Albury      7.4     25.1       0.0          NaN   
2       2008-12-03   Albury     12.9     25.7       0.0          NaN   
3       2008-12-04   Albury      9.2     28.0       0.0          NaN   
4       2008-12-05   Albury     17.5     32.3       1.0          NaN   
...            ...      ...      ...      ...       ...          ...   
145455  2017-06-21    Uluru      2.8     23.4       0.0          NaN   
145456  2017-06-22    Uluru      3.6     25.3       0.0          NaN   
145457  2017-06-23    Uluru      5.4     26.9       0.0          NaN   
145458  2017-06-24    Uluru      7.8     27.0       0.0          NaN   
145459  2017-06-25    Uluru     14.9      NaN       0.0          NaN   

        Sunshine WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  \
0            NaN           W           44.0          W  ... 

Transformation:
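    1. Remove the columns that are mostly empty (Sunshine, Evaporation, Cloud9am, Cloud3pm)
    2. Remove rows with missing values in columns that have more than 50 empty cells
    3. Replace missing numerical values with the column median and add new columns for mean temperature, wind, rainfall, humidity, and pressure
    4. Replace missing object values with the previous non-missing value (forward fill)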

In [21]:
# Sunshine, Evaporation, Cloud9am and Cloud3pm are mostly null, so drop them
col = ['Sunshine', 'Evaporation', 'Cloud9am', 'Cloud3pm']
df = df.drop(columns=col)
print(df)

              Date Location  MinTemp  MaxTemp  Rainfall WindGustDir  \
0       2008-12-01   Albury     13.4     22.9       0.6           W   
1       2008-12-02   Albury      7.4     25.1       0.0         WNW   
2       2008-12-03   Albury     12.9     25.7       0.0         WSW   
3       2008-12-04   Albury      9.2     28.0       0.0          NE   
4       2008-12-05   Albury     17.5     32.3       1.0           W   
...            ...      ...      ...      ...       ...         ...   
145455  2017-06-21    Uluru      2.8     23.4       0.0           E   
145456  2017-06-22    Uluru      3.6     25.3       0.0         NNW   
145457  2017-06-23    Uluru      5.4     26.9       0.0           N   
145458  2017-06-24    Uluru      7.8     27.0       0.0          SE   
145459  2017-06-25    Uluru     14.9      NaN       0.0         NaN   

        WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  \
0                44.0          W        WNW          20.0          24.0 

In [22]:
# Remove the rows that have a missing value in any column holding more than
# `threshold` missing cells. The count is taken on the already-filtered frame,
# so the result depends on the order in which the columns are visited.
threshold = 50
for col in df.columns:
    missing_cells = df[col].isna().sum()
    if missing_cells <= threshold:
        continue
    df = df.dropna(subset=[col])
print(df)

              Date Location  MinTemp  MaxTemp  Rainfall WindGustDir  \
0       2008-12-01   Albury     13.4     22.9       0.6           W   
1       2008-12-02   Albury      7.4     25.1       0.0         WNW   
2       2008-12-03   Albury     12.9     25.7       0.0         WSW   
3       2008-12-04   Albury      9.2     28.0       0.0          NE   
4       2008-12-05   Albury     17.5     32.3       1.0           W   
...            ...      ...      ...      ...       ...         ...   
145454  2017-06-20    Uluru      3.5     21.8       0.0           E   
145455  2017-06-21    Uluru      2.8     23.4       0.0           E   
145456  2017-06-22    Uluru      3.6     25.3       0.0         NNW   
145457  2017-06-23    Uluru      5.4     26.9       0.0           N   
145458  2017-06-24    Uluru      7.8     27.0       0.0          SE   

        WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  WindSpeed3pm  \
0                44.0          W        WNW          20.0          24.0 

In [23]:
# Cleaning pipeline, applied in this order:
#   1. replace_numerical - fill numerical gaps with the median and add the
#      averaged temperature, wind, rainfall, humidity and pressure columns
#   2. replace_object    - forward-fill the gaps in object columns
#   3. drop_duplicates   - remove duplicate rows
df = (
    df
    .pipe(replace_numerical)
    .pipe(replace_object)
    .drop_duplicates()
)

print(df)


              Date Location  MinTemp  MaxTemp  Rainfall WindGustDir  \
0       2008-12-01   Albury     13.4     22.9       0.6           W   
1       2008-12-02   Albury      7.4     25.1       0.0         WNW   
2       2008-12-03   Albury     12.9     25.7       0.0         WSW   
3       2008-12-04   Albury      9.2     28.0       0.0          NE   
4       2008-12-05   Albury     17.5     32.3       1.0           W   
...            ...      ...      ...      ...       ...         ...   
145454  2017-06-20    Uluru      3.5     21.8       0.0           E   
145455  2017-06-21    Uluru      2.8     23.4       0.0           E   
145456  2017-06-22    Uluru      3.6     25.3       0.0         NNW   
145457  2017-06-23    Uluru      5.4     26.9       0.0           N   
145458  2017-06-24    Uluru      7.8     27.0       0.0          SE   

        WindGustSpeed WindDir9am WindDir3pm  WindSpeed9am  ...  Pressure3pm  \
0                44.0          W        WNW          20.0  ...      

  df['AvgRainfall'] = df[['Rainfall', 'RainToday']].mean(axis=1)


Loading: generating surrogate keys

In [24]:
# Prepend the surrogate key column `id` to the cleaned frame
df = df.pipe(add_id)
print(df)

            id        Date Location  MinTemp  MaxTemp  Rainfall WindGustDir  \
0            1  2008-12-01   Albury     13.4     22.9       0.6           W   
1            2  2008-12-02   Albury      7.4     25.1       0.0         WNW   
2            3  2008-12-03   Albury     12.9     25.7       0.0         WSW   
3            4  2008-12-04   Albury      9.2     28.0       0.0          NE   
4            5  2008-12-05   Albury     17.5     32.3       1.0           W   
...        ...         ...      ...      ...      ...       ...         ...   
145454  112956  2017-06-20    Uluru      3.5     21.8       0.0           E   
145455  112957  2017-06-21    Uluru      2.8     23.4       0.0           E   
145456  112958  2017-06-22    Uluru      3.6     25.3       0.0         NNW   
145457  112959  2017-06-23    Uluru      5.4     26.9       0.0           N   
145458  112960  2017-06-24    Uluru      7.8     27.0       0.0          SE   

        WindGustSpeed WindDir9am WindDir3pm  ...  P

In [25]:
# Save the cleaned DataFrame to a new CSV file; index=False leaves the row labels out
df.to_csv(path_or_buf='modified_weatherAUS.csv', index=False)

Uploading the data to Firebase Firestore, which serves as our DBMS.
***Firebase allows 20k writes a day. Once they are exhausted, nothing more can be written to the DB until the quota resets, hence the errors if you run this part.***

In [26]:
# Upload the cleaned data to Firebase Firestore
# Firebase project's credentials file
cred = credentials.Certificate("service.json")
# initialize_app raises ValueError when the default app already exists, which
# made this cell fail on every re-run; only initialise it when it is missing.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

# Name of Firestore collection
collection_name = "weather_data"

# A Firestore batched write accepts at most 500 operations
batch_size = 500

# Read the CSV file into a pandas DataFrame
df = pd.read_csv("modified_weatherAUS.csv")

# Create a Firestore client
db = firestore.client()
collection = db.collection(collection_name)

# Upload each row as a document in the Firestore collection. Writes are grouped
# into batches so each group costs one round trip instead of one request per
# row; every document still counts as one write against the daily quota.
batch = db.batch()
pending = 0
for index, row in df.iterrows():
    # `record` (not `data`) so the raw DataFrame loaded earlier is not overwritten
    record = row.to_dict()           # Convert the row to a dictionary
    doc_id = str(record.pop("id"))   # Use the "id" column as the document ID and drop it from the payload
    batch.set(collection.document(doc_id), record)
    pending += 1
    if pending == batch_size:
        batch.commit()
        batch = db.batch()
        pending = 0
# Flush the final, partially filled batch
if pending:
    batch.commit()
print("done!")