In [1]:
!pip install google-cloud-bigquery
!pip install google-cloud-bigquery[pandas]

Collecting google-cloud-bigquery
  Using cached google_cloud_bigquery-3.0.1-py2.py3-none-any.whl (210 kB)
Collecting google-cloud-bigquery-storage<3.0.0dev,>=2.0.0
  Using cached google_cloud_bigquery_storage-2.13.0-py2.py3-none-any.whl (179 kB)
Collecting grpcio<2.0dev,>=1.38.1
  Using cached grpcio-1.44.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
Collecting google-cloud-core<3.0.0dev,>=1.4.1
  Using cached google_cloud_core-2.2.3-py2.py3-none-any.whl (29 kB)
Collecting google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5
  Using cached google_api_core-2.7.1-py3-none-any.whl (114 kB)
Collecting google-resumable-media<3.0dev,>=0.6.0
  Using cached google_resumable_media-2.3.2-py2.py3-none-any.whl (76 kB)
Collecting proto-plus>=1.15.0
  Using cached proto_plus-1.20.3-py3-none-any.whl (46 kB)
Collecting pyarrow<8.0dev,>=3.0.0
  Using cached pyarrow-7.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Collecting googleapis-com

# Importing all the libraries

In [2]:
import csv
import itertools
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from google.cloud import bigquery
from google.cloud.bigquery import magics
from google.oauth2 import service_account
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# CONFIGURE THE BIGQUERY SETTINGS

In [6]:
# Project that hosts the dataset, and the service-account key file used to authenticate.
BIGQUERY_PROJECT = 'ironhacks-covid19-data'
BIGQUERY_KEYPATH = 'service-account.json'

# Publish the key path through the environment before the client is created
# (presumably the client resolves its credentials from this variable).
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS=BIGQUERY_KEYPATH)
bigquery_client = bigquery.Client(project=BIGQUERY_PROJECT)

In [7]:
# Select every column of the weather training table.
query = """
SELECT *
FROM `ironhacks-covid19-data.ironhacks_covid19_training.weather_data`
"""

# QUERY THE DATA ONCE
query_job = bigquery_client.query(query)
# Materialise the result as a DataFrame and convert the `date` column
# with pandas' datetime parser in the same step.
data = query_job.to_dataframe().assign(
    date=lambda frame: pd.to_datetime(frame['date'])
)
data.head(n=5)

Unnamed: 0,date,max_rel_humidity,max_temperature,mean_temperature,min_rel_humidity,min_temperature,potential_water_deficit,precipitation_data,wind_speed
0,2019-10-12,69.2646,13.5804,7.6987,28.2524,1.817,-3.0055,0.0,5.124
1,2019-12-22,93.9565,11.8324,4.6358,34.5712,-2.5607,-1.2796,0.0,2.8557
2,2019-08-24,91.3571,24.6652,18.6607,44.4712,12.6562,-4.7381,0.0,4.4706
3,2020-07-14,90.5395,29.7732,23.158,43.2159,16.5427,-5.8112,0.0,2.3504
4,2019-12-07,98.1103,6.4609,0.5011,42.769,-5.4587,-0.9686,0.0,3.2087


### We will predict `max_temperature` in this example; the labels and features are converted to NumPy arrays

In [8]:
# Target values: the column being predicted, as a NumPy array.
labels = np.array(data['max_temperature'])

# Predictors: everything except the target and the date column.
feature_frame = data.drop(columns=['max_temperature', 'date'])
# Keep the column names, since the array below no longer carries them.
feature_list = feature_frame.columns.tolist()
features = np.array(feature_frame)

### Train/test split, holding out 25% of the data for testing.

In [9]:
# Hold out a quarter of the rows for evaluation; the fixed random_state
# makes the split repeatable across runs.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Report the dimensions of each split, one per line.
print(
    f'Training Features Shape: {train_features.shape}',
    f'Testing Features Shape: {test_features.shape}',
    f'Training Labels Shape: {train_labels.shape}',
    f'Testing Labels Shape: {test_labels.shape}',
    sep='\n',
)

Training Features Shape: (273, 7)
Testing Features Shape: (92, 7)
Training Labels Shape: (273,)
Testing Labels Shape: (92,)


### Fitting a random forest model from sklearn

In [11]:
# Fit a 1000-tree random forest regressor on the training split.
# `random_state` is fixed so the fitted model is repeatable across runs.
# RandomForestRegressor is imported in the notebook's import cell at the top,
# so a fresh kernel only needs that one cell to resolve every dependency.
rf = RandomForestRegressor(n_estimators=1000, random_state=42)
# The fitted estimator is the last expression, so its repr is shown as the cell output.
rf.fit(train_features, train_labels)

RandomForestRegressor(n_estimators=1000, random_state=42)

In [13]:
# Predict the target for the held-out test rows with the fitted forest.
predictions = rf.predict(X=test_features)

In [14]:
# Per-sample absolute deviation between the true labels and the predictions.
errors = np.abs(test_labels - predictions)

In [15]:
# Average the absolute errors and report the result to two decimals.
mean_abs_error = np.mean(errors)
print(f'Mean Absolute Error: {round(mean_abs_error, 2)}')

Mean Absolute Error: 0.87


In [16]:
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors/test_labels)
# Calcualte and display accuracy
accuracy = 100 - np.mean(mape)
print(f'Accuracy: {round(accuracy, 2)}%.')

Accuracy: 93.33%.
