In [1]:
#standard library
import pickle
import warnings
from pathlib import Path

#import data manipulation and visualisation libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

#encoder
from sklearn import preprocessing

# Standardize the data
from sklearn.preprocessing import StandardScaler

#rebalancing target variable
from imblearn.over_sampling import RandomOverSampler

#ML libraries
from sklearn.model_selection import train_test_split
# Silence every warning category for the rest of the session.
# Accepted actions: "error", "ignore", "always", "default", "module" or "once"
warnings.simplefilter('ignore')

# Seed NumPy's global random generator so later draws from it are repeatable
np.random.seed(0)

In [2]:
# Load the weather CSV (path is relative to the notebook's working directory)
raw_csv_path = './../data/seattle-weather.csv'
dataset = pd.read_csv(raw_csv_path)

In [3]:
# Remove outlier rows with the 1.5 * IQR rule.
# Only numeric columns take part: quantiles and the </> comparisons are not
# defined for text columns (e.g. 'weather', which is label-encoded only later),
# and DataFrame.quantile raises on them in pandas >= 2.0.
numeric_cols = dataset.select_dtypes(include='number').columns
Q1 = dataset[numeric_cols].quantile(0.25)
Q3 = dataset[numeric_cols].quantile(0.75)
IQR = Q3 - Q1

# A row is flagged when ANY of its numeric values lies outside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR] for that column.
is_outlier = (
    (dataset[numeric_cols] < (Q1 - 1.5 * IQR))
    | (dataset[numeric_cols] > (Q3 + 1.5 * IQR))
).any(axis=1)

# Keep all columns (numeric or not) of the rows that were not flagged.
dataset = dataset[~is_outlier]

In [4]:
# Replace 'temp_max' / 'temp_min' with their row-wise mean ('temp_avg'),
# then move 'weather' so it is the last column of the frame.
dataset = (
    dataset
    .assign(temp_avg=dataset[['temp_max', 'temp_min']].mean(axis=1))
    .drop(columns=['temp_max', 'temp_min'])
)
temp = dataset.pop('weather')  # removes the column and hands it back
dataset['weather'] = temp      # re-adding a missing column appends it at the end

In [5]:
# Convert the labels in 'weather' to integer class codes.
# label_encoder stays bound afterwards, so label_encoder.classes_ holds the
# code -> label mapping.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(dataset['weather'])

# Overwrite the column with its encoded version.
dataset['weather'] = label_encoder.transform(dataset['weather'])

In [6]:
#balancing the target variable
columns = ['precipitation', 'wind', 'temp_avg']
y = dataset['weather'] #prior target variable
X = dataset[columns]  #prior features

# 'not majority' oversamples every class except the largest one.
# random_state is pinned (like the train/test split does) so that re-running
# this cell on its own yields the same resample instead of depending on
# whatever state the global NumPy generator happens to be in.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=0)

# Despite the 'train' in their names, these hold the whole resampled dataset:
# X and y above come from the full frame, no train/test split has happened yet.
X_train_balanced, y_train_balanced = ros.fit_resample(X, y)

In [7]:
# Split the balanced data into 70 % train / 30 % test with a fixed seed.
# NOTE(review): X_train_balanced / y_train_balanced were oversampled before
# this split, so copies of the same original row can end up in both the train
# and the test part - confirm this is acceptable for model evaluation.
splits = train_test_split(
    X_train_balanced,
    y_train_balanced,
    test_size=0.3,
    random_state=1,
)
X_train, X_test, y_train, y_test = splits

# Show the shape of each part as the cell output
tuple(part.shape for part in splits)

((2222, 3), (953, 3), (2222,), (953,))

In [8]:
# Fit the standardizer on the training features only; it learns the per-feature
# mean and standard deviation. Nothing is transformed in this cell.
sc = StandardScaler().fit(X_train)

In [9]:
# Persist the fitted scaler with pickle so it can be loaded outside this notebook.
scaler_path = Path("./../model/scale.pkl")

# Opening a file for writing does not create missing parent folders, so make
# sure the target directory exists first (no-op when it is already there).
scaler_path.parent.mkdir(parents=True, exist_ok=True)

with scaler_path.open("wb") as f:
    pickle.dump(sc, f)