In [None]:
import urllib.request
import io
import gzip
import pandas as pd
import numpy as np
import time
import requests
from pandas.io.json import json_normalize
import json
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import ensemble, preprocessing
from sklearn.model_selection import train_test_split
import sklearn.metrics

In [101]:
import warnings
warnings.filterwarnings("ignore")

Downloading the file with City ID and population data from the OpenWeatherMapData

In [102]:
url = "http://bulk.openweathermap.org/sample/current.city.list.json.gz"
outfile_path = "current_city.json"

In [103]:
response = urllib.request.urlopen(url)
compressed_file = io.BytesIO(response.read())
decompressed_file = gzip.GzipFile(fileobj=compressed_file)

with open(outfile_path, 'wb') as outfile:
    outfile.write(decompressed_file.read())

In [104]:
data = json.load(open(outfile_path))

In [105]:
# Sample format of the output
data[0]

{'coord': {'lat': 34.790878, 'lon': 48.570728},
 'country': 'IR',
 'geoname': {'cl': 'P', 'code': 'PPL', 'parent': 132142},
 'id': 14256,
 'langs': [{'de': 'Azad Shahr'}, {'fa': 'آزادشهر'}],
 'name': 'Azadshahr',
 'stat': {'level': 1.0, 'population': 514102},
 'stations': [{'dist': 9, 'id': 7030, 'kf': 1}],
 'zoom': 10}

In [106]:
# Transforming json to dataframe
df = pd.DataFrame(index = np.arange(len(data)), columns = ['id', 'city', 'country', 'population'])
for i in range(len(data)):
    df['id'][i] = data[i]['id']
    df['city'][i] = data[i]['name']
    df['country'][i] = data[i]['country']
    df['population'][i] = data[i]['stat']['population']

In [107]:
df = df.sort_values(['population'], ascending = False)
df = df.reset_index()

In [108]:
# Selecting only the top 100 cities based on population
top100 = df[0:100]
# saving the results
# top100.to_csv('~/top100.csv')

In [4]:
top100 = pd.read_csv('~/top100.csv')

In [19]:
# View the sample data of top cities
top100.head(3)

Unnamed: 0.1,Unnamed: 0,index,id,city,country,population
0,0,8526,1796236,Shanghai,CN,14608512
1,1,9085,1850144,Tōkyō-to,JP,12445327
2,2,4022,1174872,Karachi,PK,11624219


### Classification model of Weather bit data for the top 100 cities

In [16]:
base_url = "https://api.weatherbit.io/v2.0/forecast/3hourly?city="
key = "&key=fb160a43539c4f65886ca58f5aba9b94"

In [None]:
cities = top100.city.values

In [17]:
# Fetching the weather data for top 50 cities
res_bit = []
for i in range(len(cities[0:50])):
    res_bit.append(requests.get(base_url + str(cities[i]) + key).text)
    time.sleep(0.5)

In [18]:
# Checking the sample response
json.loads(res_bit[1])

{'city_name': 'Tōkyō',
 'country_code': 'JP',
 'data': [{'app_temp': 9.8,
   'clouds': 3,
   'datetime': '2018-03-11:15',
   'dewpt': 3,
   'dhi': 0,
   'pod': 'n',
   'pop': 0,
   'precip': 0,
   'pres': 1012,
   'rh': 62.5,
   'slp': 1019.5,
   'snow': 0,
   'snow_depth': 0,
   'temp': 9.8,
   'ts': 1520780400,
   'uv': 0,
   'vis': 10,
   'weather': {'code': '800', 'description': 'Clear sky', 'icon': 'c01n'},
   'wind_cdir': 'NE',
   'wind_cdir_full': 'northeast',
   'wind_dir': 45,
   'wind_spd': 1.4},
  {'app_temp': 8.5,
   'clouds': 1,
   'datetime': '2018-03-11:18',
   'dewpt': -4.1,
   'dhi': 0,
   'pod': 'n',
   'pop': 0,
   'precip': 0,
   'precip6h': 0,
   'pres': 1013,
   'rh': 40.6,
   'slp': 1020,
   'snow': 0,
   'snow6h': 0,
   'snow_depth': 0,
   'temp': 8.5,
   'ts': 1520791200,
   'uv': 0,
   'vis': 10,
   'weather': {'code': '800', 'description': 'Clear sky', 'icon': 'c01n'},
   'wind_cdir': 'SW',
   'wind_cdir_full': 'southwest',
   'wind_dir': 225,
   'wind_spd': 

In [19]:
# Transforming the data in json format to dataframe
bit_df = pd.DataFrame()
for i in range(len(cities[0:50])):
    temp_json = json.loads(res_bit[i])
    temp_df = pd.DataFrame.from_dict(json_normalize(temp_json['data']), orient='columns')
    temp_df['city'] = temp_json['city_name']
    temp_df['lat'] = temp_json['lat']
    temp_df['lon'] = temp_json['lon']
    temp_df['timezone'] = temp_json['timezone']
    # Dropping redundant columns
    temp_df = temp_df.drop(['weather.code', 'weather.icon', 'wind_cdir_full', 'wind_dir'], axis = 1)
    bit_df = bit_df.append(temp_df, ignore_index = True)

In [20]:
print (bit_df.shape)
print (bit_df.columns)

(2000, 26)
Index(['app_temp', 'clouds', 'datetime', 'dewpt', 'dhi', 'pod', 'pop',
       'precip', 'precip6h', 'pres', 'rh', 'slp', 'snow', 'snow6h',
       'snow_depth', 'temp', 'ts', 'uv', 'vis', 'weather.description',
       'wind_cdir', 'wind_spd', 'city', 'lat', 'lon', 'timezone'],
      dtype='object')


In [21]:
bit_df['weather.description'].value_counts()

Clear sky                       762
Scattered clouds                465
Overcast clouds                 337
Broken clouds                   227
Drizzle                          75
Light rain                       55
Flurries                         17
Thunderstorm with light rain     14
Light Rain                       13
Haze                             12
Thunderstorm with drizzle         7
Moderate rain                     6
Light snow                        5
Mix snow/rain                     4
Fog                               1
Name: weather.description, dtype: int64

In [22]:
# bit_df.to_csv('~/weatherbit_data.csv')

In [20]:
# bit_df = pd.read_csv('~/weatherbit_data.csv')

In [21]:
df = bit_df

In [22]:
# Extracting hour from datetime to time of the day as a feature in prediction weather description
df['new_datetime'] = (pd.to_datetime(df['datetime'], format = '%Y-%m-%d:%H'))
df["hour"] = df['new_datetime'].apply(lambda x: x.hour)
df.hour = df['hour'].apply(lambda x: str(x))

In [90]:
feature_cols = ['app_temp', 'clouds', 'dewpt', 'dhi', 'pod', 'pop',
       'precip', 'pres', 'rh', 'slp', 'snow',
       'snow_depth', 'temp', 'uv', 'vis',
       'wind_cdir', 'wind_spd', 'lat', 'lon', 'hour']
pred_col = ['weather.description']

In [91]:
X = df[feature_cols]
y = df[pred_col]

In [93]:
# Label encoding discrete variables
for column in ['pod', 'wind_cdir', 'hour']:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])

In [94]:
# One hot encoding label encoded discrete variables
enc = preprocessing.OneHotEncoder(sparse=False)
columns_to_enc = ['pod', 'wind_cdir', 'hour']
X_enc = X[columns_to_enc]
for var in columns_to_enc:
    enc.fit(X_enc[[var]])
    temp = enc.transform(X_enc[[var]])
    temp = pd.DataFrame(temp,columns=[(var+"_"+str(i)) for i in X_enc[var]
                                .value_counts().index])
    temp = temp.set_index(X_enc.index.values)
    # adding the new One Hot Encoded varibales to the train data frame
    X_enc = pd.concat([X_enc, temp],axis=1)
X = pd.concat([X, X_enc],axis=1)
X = X.drop(columns_to_enc, axis=1)

In [95]:
# Feature selector to remove low variance features that are either one or zero (on or off) in more than 80% of the samples
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_new = sel.fit_transform(X)

In [96]:
# Test-Train split using inbuilt function in sklearn
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2, random_state=42)

In [97]:
# Fitting a random forest classifier with 100 estimators
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [98]:
# Actual values
rf_output = y_test
# Predicted values
rf_output['preds'] = rf.predict(X_test)

In [99]:
print ("Accuracy is {0}%".format(sklearn.metrics.accuracy_score(rf_output['weather.description'], rf_output['preds'])*100))

Accuracy is 98.0%


#### Accuracy of the Random Forest Classifier model is around 98%

In [87]:
# Sample comparison between actual and predicted values
rf_output[0:10]

Unnamed: 0,weather.description,preds
1860,Overcast clouds,Overcast clouds
353,Broken clouds,Broken clouds
1333,Clear sky,Clear sky
905,Scattered clouds,Scattered clouds
1289,Clear sky,Clear sky
1273,Scattered clouds,Scattered clouds
938,Overcast clouds,Overcast clouds
1731,Clear sky,Clear sky
65,Clear sky,Clear sky
1323,Clear sky,Clear sky
