In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw table and discard the 'text' and 'day_end' columns.
data = pd.read_csv('weather_data_days.csv', sep=',')
data = data.drop(['text', 'day_end'], axis='columns')

# Remove rows whose 'day_start' value is below 1.
invalid_start_rows = data['day_start'] < 1
data = data.drop(data[invalid_start_rows].index)

# Remove rows whose first 61 columns (selected by position) sum to zero,
# with NaN values skipped in the sum.
# NOTE(review): whether this positional slice includes 'day_start' depends on
# the CSV column order -- confirm.
all_zero_rows = data.iloc[:, 0:61].sum(axis=1, skipna=True) == 0
data = data.drop(data[all_zero_rows].index)

In [3]:
# Target: 'day_start' cast to integer; features: every remaining column.
y = data['day_start'].astype('int')
X = data.drop(columns=['day_start'])

In [4]:
# Sanity check: feature matrix and target dimensions.
(X.shape, y.shape)

((11490, 61), (11490,))

In [5]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with a fixed seed so repeated runs give the same result.
dtc = DecisionTreeClassifier(
    random_state=17,
)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the seed is fixed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=17,
)

In [7]:
# Sanity check: sizes of the training and validation feature matrices.
(X_train.shape, X_valid.shape)

((8043, 61), (3447, 61))

In [8]:
from sklearn.model_selection import cross_val_score

# Mean score of the decision tree over 5 cross-validation folds of the
# training split.
dtc_cv_scores = cross_val_score(dtc, X_train, y_train, cv=5)
np.mean(dtc_cv_scores)

0.9829666150292663

In [9]:
%time
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(
    random_state=17,
    n_jobs=-1, max_features=2,
    )
rf.fit(X_train, y_train)

Wall time: 0 ns


RandomForestClassifier(max_features=2, n_jobs=-1, random_state=17)

In [10]:
%time
rf_valid_pred = rf.predict(X_valid)

Wall time: 0 ns


In [11]:
from sklearn.metrics import accuracy_score

# Share of validation rows for which the random forest prediction matches
# the true label.
accuracy_score(y_true=y_valid, y_pred=rf_valid_pred)

0.9878154917319408

In [12]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score

In [13]:
# Stratified 5-fold splitter; rows are shuffled with a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [14]:
# Random forest using all CPU cores, a fixed seed, and out-of-bag scoring
# switched on.
rfc = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    oob_score=True,
)

In [15]:
# Per-fold scores of the forest on the full dataset, using the stratified
# splitter.
results = cross_val_score(estimator=rfc, X=X, y=y, cv=skf)

In [16]:
# Report the fold-averaged score as a percentage.
mean_cv_score_pct = results.mean() * 100
print("CV accuracy score: {:.2f}%".format(mean_cv_score_pct))

CV accuracy score: 98.51%


In [17]:
# Initialise the cross-validation splitter: 5 stratified folds, shuffled with
# a fixed seed.
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [18]:
# Containers for the accuracy measured on the training and test folds.
train_acc, test_acc = [], []
temp_train_acc, temp_test_acc = [], []

# Candidate numbers of trees: 40 through 64.
trees_grid = range(40, 65)

In [19]:
# Train on the training folds: for every candidate forest size, fit one forest
# per fold and record its accuracy on that fold's train and test rows.
for ntrees in trees_grid:
    rfc = RandomForestClassifier(
        n_estimators=ntrees,
        random_state=42,
        n_jobs=-1,
        oob_score=True,
    )
    temp_train_acc, temp_test_acc = [], []
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    # One row of per-fold scores per candidate value.
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

In [20]:
# Turn the nested score lists into 2-D arrays (rows: grid values, columns: folds).
train_acc = np.asarray(train_acc)
test_acc = np.asarray(test_acc)

# Average over folds and report the grid value with the highest mean test accuracy.
mean_test_acc = test_acc.mean(axis=1)
best_position = np.argmax(mean_test_acc)
print("Best accuracy on CV is {:.2f}% with {} trees".format(max(mean_test_acc)*100,
                                                        trees_grid[best_position]))

Best accuracy on CV is 98.53% with 51 trees


In [21]:
# Containers for the accuracy measured on the training and test folds.
train_acc, test_acc = [], []
temp_train_acc, temp_test_acc = [], []

# Candidate tree depths: 10 through 34.
max_depth_grid = range(10, 35)

In [22]:
# Train on the training folds: for every candidate depth, fit one 51-tree
# forest per fold and record its accuracy on that fold's train and test rows.
for max_depth in max_depth_grid:
    rfc = RandomForestClassifier(
        n_estimators=51,
        random_state=42,
        n_jobs=-1,
        oob_score=True,
        max_depth=max_depth,
    )
    temp_train_acc, temp_test_acc = [], []
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    # One row of per-fold scores per candidate value.
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

# Turn the nested score lists into 2-D arrays (rows: grid values, columns: folds).
train_acc = np.asarray(train_acc)
test_acc = np.asarray(test_acc)

# Average over folds and report the depth with the highest mean test accuracy.
mean_test_acc = test_acc.mean(axis=1)
best_position = np.argmax(mean_test_acc)
print("Best accuracy on CV is {:.2f}% with {} max_depth".format(max(mean_test_acc)*100,
                                                        max_depth_grid[best_position]))

Best accuracy on CV is 98.18% with 34 max_depth


In [23]:
# Containers for the accuracy measured on the training and test folds.
train_acc, test_acc = [], []
temp_train_acc, temp_test_acc = [], []

# Candidate minimum leaf sizes.
min_samples_leaf_grid = [
    1, 3, 5, 7, 9, 11,
    13, 15, 17, 20, 22, 24,
]

In [24]:
# Train on the training folds: for every candidate leaf size, fit one 51-tree,
# depth-34 forest per fold and record its accuracy on that fold's train and
# test rows.
for min_samples_leaf in min_samples_leaf_grid:
    rfc = RandomForestClassifier(
        n_estimators=51,
        random_state=17,
        n_jobs=-1,
        oob_score=True,
        min_samples_leaf=min_samples_leaf,
        max_depth=34,
    )
    temp_train_acc, temp_test_acc = [], []
    for train_index, test_index in skf.split(X, y):
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    # One row of per-fold scores per candidate value.
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

# Turn the nested score lists into 2-D arrays (rows: grid values, columns: folds).
train_acc = np.asarray(train_acc)
test_acc = np.asarray(test_acc)

# Average over folds and report the leaf size with the highest mean test accuracy.
mean_test_acc = test_acc.mean(axis=1)
best_position = np.argmax(mean_test_acc)
print("Best accuracy on CV is {:.2f}% with {} min_samples_leaf".format(max(mean_test_acc)*100,
                                                                min_samples_leaf_grid[best_position]))

Best accuracy on CV is 98.15% with 1 min_samples_leaf


In [25]:
# Containers for the accuracy measured on the training and test folds.
train_acc = []
test_acc = []
temp_train_acc = []
temp_test_acc = []
# Candidate numbers of features considered at each split.
max_features_grid = [2, 4, 6, 8, 10, 12, 14, 16]

# Train on the training folds: for every candidate value, fit one forest per
# fold and record its accuracy on that fold's train and test rows.
for max_features in max_features_grid:
    # Pass the loop variable through. The estimator was previously built with
    # a hard-coded max_features=2, so every grid point trained the same model
    # and the reported "best" value was always the first grid entry.
    rfc = RandomForestClassifier(n_estimators=51, random_state=17, n_jobs=-1,
                                 oob_score=True, max_depth=34,
                                 max_features=max_features)
    temp_train_acc = []
    temp_test_acc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train = y.iloc[train_index]
        y_test = y.iloc[test_index]
        rfc.fit(X_train, y_train)
        temp_train_acc.append(rfc.score(X_train, y_train))
        temp_test_acc.append(rfc.score(X_test, y_test))
    # One row of per-fold scores per candidate value.
    train_acc.append(temp_train_acc)
    test_acc.append(temp_test_acc)

# Rows: grid values, columns: folds. Report the value whose fold-averaged test
# accuracy is highest.
train_acc, test_acc = np.asarray(train_acc), np.asarray(test_acc)
print("Best accuracy on CV is {:.2f}% with {} max_features".format(max(test_acc.mean(axis=1))*100, 
                                                        max_features_grid[np.argmax(test_acc.mean(axis=1))]))

Best accuracy on CV is 98.54% with 2 max_features


In [26]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# Only the last fold yielded by the splitter is kept as the train/test
# partition; earlier folds are generated and discarded.
train_index, test_index = list(skf.split(X, y))[-1]
X_train, y_train = X.iloc[train_index], y.iloc[train_index]
X_test, y_test = X.iloc[test_index], y.iloc[test_index]

# Forest configured with 51 trees, depth limit 34 and 2 features per split.
rfc = RandomForestClassifier(
    n_estimators=51,
    random_state=17,
    n_jobs=-1,
    oob_score=True,
    max_depth=34,
    max_features=2,
)

In [27]:
# Sanity check: sizes of the training and test feature matrices.
(X_train.shape, X_test.shape)

((9192, 61), (2298, 61))

In [28]:
# Fit the forest on the current training partition.
rfc.fit(X=X_train, y=y_train)

RandomForestClassifier(max_depth=34, max_features=2, n_estimators=51, n_jobs=-1,
                       oob_score=True, random_state=17)

In [31]:
import csv

In [30]:
# Predict on the held-out fold before scoring. `rfc_valid_pred` is not assigned
# in any visible cell, so on a fresh kernel the scoring line alone raises
# NameError.
rfc_valid_pred = rfc.predict(X_test)

# Share of held-out rows for which the prediction matches the true label.
accuracy_score(y_test, rfc_valid_pred)

0.9882506527415144