In [44]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn import metrics

import matplotlib.pyplot as plt
# The bundled seaborn styles were renamed to "seaborn-v0_8-*" in matplotlib 3.6
# and the old names were removed later. Pick whichever name this matplotlib
# installation provides, falling back to the default style if neither exists.
plt.style.use(
    next(
        (
            styleName
            for styleName in ("seaborn-v0_8-ticks", "seaborn-ticks")
            if styleName in plt.style.available
        ),
        "default",
    )
)
%matplotlib inline

In [45]:
# Load the cleaned weather data set (path is relative to the working directory),
# print its (rows, columns) shape and preview the first rows.
df = pd.read_csv("weatherAUS_clean.csv")
print(df.shape)
df.head()

(123710, 29)


Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Day,winter,spring,summer,autumn,winterRain,springRain,summerRain,autumnRain,RainTomorrow
0,13.4,22.9,0.6,4.8,8.6,44.0,20.0,24.0,71.0,22.0,...,1,True,False,False,False,False,False,False,False,0
1,7.4,25.1,0.0,4.8,8.6,44.0,4.0,22.0,44.0,25.0,...,2,True,False,False,False,False,False,False,False,0
2,12.9,25.7,0.0,4.8,8.6,46.0,19.0,26.0,38.0,30.0,...,3,True,False,False,False,False,False,False,False,0
3,9.2,28.0,0.0,4.8,8.6,24.0,11.0,9.0,45.0,16.0,...,4,True,False,False,False,False,False,False,False,0
4,17.5,32.3,1.0,4.8,8.6,41.0,7.0,20.0,82.0,33.0,...,5,True,False,False,False,False,False,False,False,0


In [46]:
# Show the column labels of the loaded frame.
df.columns

Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm', 'RainToday', 'Year', 'Month', 'Day', 'winter',
       'spring', 'summer', 'autumn', 'winterRain', 'springRain', 'summerRain',
       'autumnRain', 'RainTomorrow'],
      dtype='object')

In [47]:
# Name of the column the classifier is trained to predict.
target = "RainTomorrow"

# Columns used as model inputs; the order below is the order passed to the model.
# NOTE(review): "latitude" and "longitude" are not among the columns printed by
# df.columns earlier in this notebook -- confirm the loaded CSV provides them.
features = [
    # daily weather measurements
    "MinTemp", "MaxTemp", "Rainfall", "Evaporation", "Sunshine",
    "WindGustSpeed", "WindSpeed9am", "WindSpeed3pm",
    "Humidity9am", "Humidity3pm",
    "Pressure9am", "Pressure3pm",
    "Cloud9am", "Cloud3pm",
    "Temp9am", "Temp3pm",
    "RainToday",
    # calendar parts
    "Year", "Month", "Day",
    # season columns and season/rain columns
    "winter", "spring", "summer", "autumn",
    "winterRain", "springRain", "summerRain", "autumnRain",
    # location
    "latitude", "longitude",
]

In [48]:
# Wczytanie potrzebnych bibliotek
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import neighbors
from sklearn.metrics import roc_auc_score

# Cross-validation wrapper around a k-nearest-neighbours classifier
def CVTestKNN(nFolds = 5, randomState=2020, debug=False, *args, **kwargs):
    """Cross-validate a ``KNeighborsClassifier`` on the notebook-level data.

    The function reads the notebook-level names ``df`` (data frame),
    ``features`` (list of input columns) and ``target`` (label column).

    Parameters
    ----------
    nFolds : int, default 5
        Number of splits passed to ``KFold``.
    randomState : int, default 2020
        Seed for the shuffled ``KFold`` split.
    debug : bool, default False
        When true, print the estimator and the train/validation ROC-AUC
        of every fold.
    *args, **kwargs
        Forwarded unchanged to ``neighbors.KNeighborsClassifier``. Because
        ``*args`` follows three defaulted parameters, positional values fill
        ``nFolds``, ``randomState`` and ``debug`` first -- pass estimator
        options by keyword (e.g. ``n_neighbors=100``).

    Returns
    -------
    trainResults : list of float
        ROC-AUC on the training part of each fold.
    testResults : list of float
        ROC-AUC on the held-out part of each fold.
    predictions : list of list of float
        Per fold, the second column of ``predict_proba`` for the held-out rows.
    indices : list of list
        Per fold, the ``df`` index labels of the held-out rows, in the same
        order as ``predictions``.
    """
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)

    # Per-fold result containers
    testResults = []
    trainResults = []
    predictions = []
    indices = []

    # Validate the model on consecutive folds
    for train, test in kf.split(df.index.values):
        # Build a fresh estimator for every fold
        clf = neighbors.KNeighborsClassifier(*args, **kwargs)
        if debug:
            print(clf)

        # Slice each partition once; every positional slice copies the rows,
        # so re-slicing for fit, predict and scoring would repeat that work.
        trainPart = df.iloc[train]
        testPart = df.iloc[test]
        xTrain, yTrain = trainPart[features], trainPart[target]
        xTest, yTest = testPart[features], testPart[target]

        # Train the model
        clf.fit(xTrain, yTrain)

        # predict_proba returns one probability column per class;
        # keep the second column for both the training and the held-out rows.
        predsTrain = clf.predict_proba(xTrain)[:, 1]
        preds = clf.predict_proba(xTest)[:, 1]

        # Keep this fold's predictions together with the matching index
        # labels from the original frame (tolist() already builds a new list).
        predictions.append(preds.tolist())
        indices.append(testPart.index.tolist())

        # ROC-AUC for this fold
        trainScore = roc_auc_score(yTrain.astype(int), predsTrain)
        testScore = roc_auc_score(yTest.astype(int), preds)

        trainResults.append(trainScore)
        testResults.append(testScore)

        # Optionally report the scores of every fold while running
        if debug:
            print("Train AUC:", trainScore,
                  "Valid AUC:", testScore)

    return trainResults, testResults, predictions, indices

In [49]:
# Baseline run: 100 neighbours, p=1, all CPU cores, with per-fold debug output.
trainResults, testResults, predictions, indices = CVTestKNN(
    debug=True,
    n_neighbors=100,
    p=1,
    n_jobs=-1,
)
# Mean validation ROC-AUC across the folds
print(np.mean(testResults))

KNeighborsClassifier(n_jobs=-1, n_neighbors=100, p=1)
Train AUC: 0.8872812541637901 Valid AUC: 0.8761392886700614
KNeighborsClassifier(n_jobs=-1, n_neighbors=100, p=1)
Train AUC: 0.887178945536096 Valid AUC: 0.8778564264960391
KNeighborsClassifier(n_jobs=-1, n_neighbors=100, p=1)
Train AUC: 0.8864288649464421 Valid AUC: 0.8809942591309323
KNeighborsClassifier(n_jobs=-1, n_neighbors=100, p=1)
Train AUC: 0.8871687625165386 Valid AUC: 0.8764701609119037
KNeighborsClassifier(n_jobs=-1, n_neighbors=100, p=1)
Train AUC: 0.886887051886097 Valid AUC: 0.8791743008624067
0.8781268872142686


In [50]:
import random

# Random search over the KNN hyper-parameters (n_neighbors, p).
# A dedicated seeded generator makes the sampled parameter sets identical on
# every run; the seed matches the default randomState of CVTestKNN.
searchRng = random.Random(2020)

results=[]
paramList = []
for trial in range(20):
    # n_neighbors drawn from 3..200, p drawn from {1, 2} (both ends inclusive)
    params = (searchRng.randint(3, 200), searchRng.randint(1, 2))
    # n_jobs=-1 as in the baseline run above: the neighbour search is the
    # slow part, and without it every trial runs on a single core.
    trainResults, testResults, predictions, indices = CVTestKNN(
        n_neighbors=params[0],
        p = params[1],
        n_jobs=-1)
    # Mean validation ROC-AUC of this parameter set, computed once
    meanTestScore = np.mean(testResults)
    print(meanTestScore, params)
    results.append(meanTestScore)
    paramList.append(params)

KeyboardInterrupt: 