In [None]:
!pip install opendatasets

In [None]:
import opendatasets as od
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import os
import matplotlib
%matplotlib inline

# pandas display: never hide columns, print at most 150 rows.
pd.set_option('display.max_rows', 150)
pd.set_option('display.max_columns', None)

# Plot defaults applied to every figure in the notebook.
sns.set_style('darkgrid')
matplotlib.rcParams.update({
    'font.size': 14,
    'figure.figsize': (10, 6),
    'figure.facecolor': '#00000000',
})

In [None]:
# Fetch the dataset from Kaggle with opendatasets.
od.download('https://www.kaggle.com/jsphyg/weather-dataset-rattle-package')

In [None]:
# List the contents of the dataset folder.
os.listdir('weather-dataset-rattle-package')

In [None]:
# Load the CSV into raw_df.
raw_df = pd.read_csv('weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Display the frame.
raw_df

In [None]:
# Summary of the frame's columns.
raw_df.info()

In [None]:
# RainTomorrow is the prediction target, so rows where it is missing are discarded.
raw_df = raw_df.dropna(subset=['RainTomorrow'])

In [None]:
# Count plot of rows per calendar year, with the year parsed from the Date column.
plt.title('No. of rows per year')
sns.countplot(x= pd.to_datetime(raw_df['Date']).dt.year)

In [None]:
year = pd.to_datetime(raw_df.Date).dt.year

train_df = raw_df[year < 2015]
val_df = raw_df[year == 2015]
test_df = raw_df[year > 2015]

In [None]:
train_df.shape, val_df.shape, test_df.shape


In [None]:
input_cols = list(train_df.columns)[1:-1]
target_col = 'RainTomorrow'

In [None]:
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()

val_inputs = val_df[input_cols].copy()
val_targets = val_df[target_col].copy()

test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

In [None]:
numeric_col = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_col = train_inputs.select_dtypes('object').columns.tolist()

In [None]:
train_inputs[numeric_col].isna().sum()

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Fit on the training inputs only: computing the fill-in means from raw_df would
# let validation and test rows influence preprocessing (data leakage).
imputer = SimpleImputer(strategy='mean').fit(train_inputs[numeric_col])

In [None]:
# Apply the fitted imputer to the numeric columns of each split (overwrites them in place).
train_inputs[numeric_col] = imputer.transform(train_inputs[numeric_col])
val_inputs[numeric_col] = imputer.transform(val_inputs[numeric_col])
test_inputs[numeric_col] = imputer.transform(test_inputs[numeric_col])

In [None]:
# Missing-value counts in the validation inputs after imputation.
val_inputs[numeric_col].isna().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Learn each column's min/max from the training inputs only, so validation and
# test rows do not leak into preprocessing. Their scaled values may therefore
# fall slightly outside [0, 1].
scaler = MinMaxScaler().fit(train_inputs[numeric_col])

In [None]:
# Apply the fitted scaler to the numeric columns of each split (overwrites them in place).
train_inputs[numeric_col] = scaler.transform(train_inputs[numeric_col])
val_inputs[numeric_col] = scaler.transform(val_inputs[numeric_col])
test_inputs[numeric_col] = scaler.transform(test_inputs[numeric_col])

In [None]:
# 'min' and 'max' rows of describe() for the validation inputs.
val_inputs.describe().loc[['min', 'max']]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
train_df[categorical_col].fillna('Unknown').count()

In [None]:
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore').fit(raw_df[categorical_col])

In [None]:
# Names of the one-hot columns produced by the encoder.
encoded_col = list(encoder.get_feature_names_out(categorical_col))

In [None]:
# Add the one-hot columns to each split; the original categorical columns are kept.
train_inputs[encoded_col] = encoder.transform(train_inputs[categorical_col])
val_inputs[encoded_col] = encoder.transform(val_inputs[categorical_col])
test_inputs[encoded_col] = encoder.transform(test_inputs[categorical_col])

In [None]:
# Display the list of encoded column names.
encoded_col

In [None]:
# Display the encoded columns of the test inputs.
test_inputs[encoded_col]

In [None]:
# Model inputs: numeric columns followed by the one-hot encoded columns.
X_train = train_inputs[numeric_col + encoded_col]
X_val = val_inputs[numeric_col + encoded_col]
X_test = test_inputs[numeric_col + encoded_col]

In [None]:
# Display the test feature matrix.
X_test

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with only the random seed specified; fit on the training split.
model = DecisionTreeClassifier(random_state = 42)
model.fit(X_train, train_targets)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Predicted labels for the training inputs.
train_preds = model.predict(X_train)

In [None]:
# Display the predictions.
train_preds

In [None]:
# Count of each predicted label. The top-level pd.value_counts function is
# deprecated in recent pandas, so the Series method is used instead.
pd.Series(train_preds).value_counts()

In [None]:
# Predicted class probabilities for the training inputs.
train_probs = model.predict_proba(X_train)

In [None]:
# Display the probabilities.
train_probs

In [None]:
# Accuracy of the training predictions against the training targets.
accuracy_score(train_targets, train_preds)

In [None]:
# Score of the model on the validation split.
model.score(X_val, val_targets)

In [None]:
# Fraction of each class in the validation targets, as a reference point for the score above.
val_targets.value_counts()/len(val_targets)

In [None]:
from sklearn.tree import plot_tree, export_text

In [None]:
# max_depth=2 limits the drawing to the top levels of the tree.
plt.figure(figsize=(80, 20))
# feature_names is passed as a plain list (as the export_text calls in this
# notebook do) because some scikit-learn releases reject a pandas Index here.
# The trailing ';' suppresses the list of annotation objects in the cell output.
plot_tree(model, feature_names=list(X_train.columns), max_depth=2, filled=True, fontsize=14);

In [None]:
# Depth of the fitted tree.
model.tree_.max_depth

In [None]:
# Text rendering of the tree, limited to max_depth=10.
tree_text = export_text(model, max_depth = 10, feature_names=list(X_train.columns))
print(tree_text)

In [None]:
# Importance value of each input feature, in the column order of X_train.
model.feature_importances_

In [None]:
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
})

In [None]:
importance_df

In [None]:
plt.title('Feature Importance')
sns.barplot(importance_df.head(10), x='importance', y='feature')

In [None]:
?DecisionTreeClassifier

In [None]:
# Same classifier, but with the tree depth capped at 3.
model = DecisionTreeClassifier(max_depth = 3, random_state = 42)

In [None]:
# Fit on the training split.
model.fit(X_train, train_targets)

In [None]:
# Score on the training split.
model.score(X_train, train_targets)

In [None]:
# Score on the validation split.
model.score(X_val, val_targets)

In [None]:
# Class labels known to the fitted model.
model.classes_

In [None]:
plt.figure(figsize=(80, 20))
plot_tree(model, feature_names  = X_train.columns, max_depth =3, filled=True, fontsize=14)


In [None]:
print(export_text(model, feature_names=list(X_train.columns)))

In [None]:
def max_depth_error(md):
  """Train a decision tree capped at depth ``md`` and report its error rates.

  Args:
    md: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.

  Returns:
    dict with keys 'Max Depth', 'Training Error' and 'Validation Error'.
    Each error is ``1 - score`` on ``X_train`` / ``X_val`` respectively.
  """
  model = DecisionTreeClassifier(max_depth=md, random_state= 42)
  model.fit(X_train, train_targets)
  # score() returns accuracy, but the result keys (and the y-axis label
  # 'Prediction Error (1 - Accuracy)' of the plot that consumes them) are
  # errors, so convert accuracy to error here.
  train_error = 1 - model.score(X_train, train_targets)
  val_error = 1 - model.score(X_val, val_targets)
  return {'Max Depth': md, 'Training Error': train_error, 'Validation Error': val_error}

In [None]:
# One row per max_depth value from 1 to 20, built from max_depth_error's result dicts.
error_df = pd.DataFrame([max_depth_error(md) for md in range(1,21)])

In [None]:
# Display the table.
error_df

In [None]:
# Plot the 'Training Error' and 'Validation Error' columns against 'Max Depth'.
plt.figure()
plt.plot(error_df['Max Depth'], error_df['Training Error'])
plt.plot(error_df['Max Depth'], error_df['Validation Error'])
plt.title('Training vs. Validation Error')
plt.xticks(range(0,21))
plt.xlabel('Max. Depth')
plt.ylabel('Prediction Error (1 - Accuracy)')
# Legend entries follow the order of the two plot() calls above.
plt.legend(['Training', 'Validation'])

In [None]:
# Tree with depth capped at 7, fit on the training split.
model = DecisionTreeClassifier(max_depth= 7, random_state = 42).fit(X_train, train_targets)

In [None]:
# Score on the validation split.
model.score(X_val, val_targets)

In [None]:
# Alternative constraint: limit the number of leaf nodes to 128 instead of the depth.
model= DecisionTreeClassifier(max_leaf_nodes = 128, random_state = 42)

In [None]:
# Fit on the training split.
model.fit(X_train, train_targets)

In [None]:
# Score on the training split.
model.score(X_train,train_targets)

In [None]:
# Score on the validation split.
model.score(X_val, val_targets)

In [None]:
# Text rendering of the leaf-limited tree.
model_text = export_text(model, feature_names=list(X_train.columns))
print(model_text)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with a fixed seed; n_jobs=-1 is passed for parallel execution.
model = RandomForestClassifier(n_jobs = -1, random_state = 42)



In [None]:
# Fit on the training split.
model.fit(X_train, train_targets)

In [None]:
# Score on the training split.
model.score(X_train, train_targets)


In [None]:
# Score on the validation split.
model.score(X_val, val_targets)

In [None]:
# Predicted class probabilities for the training inputs.
train_probs = model.predict_proba(X_train)


In [None]:
# Shape of the training feature matrix.
X_train.shape

In [None]:
# Shape of the probability array, for comparison with X_train.shape above.
train_probs.shape

In [None]:
# First individual estimator of the fitted forest.
model.estimators_[0]

In [None]:
# Draw the top levels (max_depth=2) of the first tree in the forest.
plt.figure(figsize=(80,20))
# feature_names and class_names are passed as plain lists because some
# scikit-learn releases reject a pandas Index / NumPy array for these parameters.
plot_tree(model.estimators_[0], max_depth=2, feature_names=list(X_train.columns), filled=True, rounded=True, class_names=list(model.classes_));

In [None]:
# Feature importances of the forest, sorted from most to least important.
importance_df = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_
}).sort_values('importance', ascending=False)

In [None]:
# Display the sorted table.
importance_df

In [None]:
# Bar chart of the first ten rows, i.e. the ten largest importances after the sort above.
sns.barplot(importance_df.head(10), x='importance', y='feature')

In [None]:
?RandomForestClassifier

In [None]:
# Reference forest with only the seed and n_jobs specified.
base_model = RandomForestClassifier(random_state = 42, n_jobs=-1).fit(X_train, train_targets)

In [None]:
# Score of the reference forest on the training split.
base_model.score(X_train, train_targets)


In [None]:
# Score of the reference forest on the validation split.
base_model.score(X_val, val_targets)

In [None]:
# Forest with 10 estimators.
model = RandomForestClassifier(random_state = 42, n_jobs=-1, n_estimators=10).fit(X_train, train_targets)

In [None]:
# (training score, validation score)
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
# Forest with 500 estimators.
model = RandomForestClassifier(random_state = 42, n_jobs=-1, n_estimators=500).fit(X_train, train_targets)

In [None]:
# (training score, validation score)
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
def test_params(**params):
  model = RandomForestClassifier(random_state = 42, n_jobs=-1, **params).fit(X_train, train_targets)
  return model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
# Each call below returns (training score, validation score) for one setting.
test_params(max_features = 'log2')

In [None]:
test_params(max_features = 3)

In [None]:
test_params(min_samples_split=3, min_samples_leaf=2)

In [None]:
test_params(min_impurity_decrease=1e-6)

In [None]:
test_params(bootstrap = False)

In [None]:
test_params(max_samples = 0.9)

In [None]:
# Class labels of the current `model`; the class_weight dict below uses 'No' and 'Yes' as keys.
model.classes_

In [None]:
test_params(class_weight ='balanced')

In [None]:
test_params(class_weight ={'No': 1, 'Yes': 2})

In [None]:
# Forest combining several of the hyperparameters explored above.
model = RandomForestClassifier(n_jobs = -1, random_state = 42, n_estimators= 500, max_features = 7, max_depth = 30, class_weight={'No': 1, 'Yes':1.5})

In [None]:
# Fit on the training split.
model.fit(X_train, train_targets)

In [None]:
# (training score, validation score)
model.score(X_train, train_targets), model.score(X_val, val_targets)

In [None]:
# Score on the test split.
model.score(X_test, test_targets)

In [None]:
def predcit_input(single_input):
  """Predict the target class for a single observation.

  The observation is passed through the notebook's fitted preprocessing
  objects (``imputer`` -> ``scaler`` -> ``encoder``) and then to ``model``.

  Args:
    single_input: Mapping of input column name to raw value for one row. It
      must contain every column in ``numeric_col`` and ``categorical_col``.

  Returns:
    dict with 'Prediction' (the predicted class label) and 'Probability'
    (the model's predicted probability for that label).
  """
  input_df = pd.DataFrame([single_input])
  # The column lists are named numeric_col / categorical_col / encoded_col
  # (singular) elsewhere in this notebook.
  input_df[numeric_col] = imputer.transform(input_df[numeric_col])
  input_df[numeric_col] = scaler.transform(input_df[numeric_col])
  input_df[encoded_col] = encoder.transform(input_df[categorical_col])
  X_input = input_df[numeric_col + encoded_col]
  pred = model.predict(X_input)[0]
  # Look up the probability column that corresponds to the predicted label.
  prob = model.predict_proba(X_input)[0][list(model.classes_).index(pred)]

  return {'Prediction': pred, 'Probability': prob}

# Correctly spelled alias; the original (misspelled) name is kept for compatibility.
predict_input = predcit_input