In [0]:
# Mount Google Drive at /content/drive so the data files referenced later in
# the notebook are reachable. Mounting prompts for an authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import pandas as pd
import numpy as np
import math
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score


# Directory prefix for the CSV files read and written by this notebook. It is
# combined with file names by plain string concatenation (path + "name.csv"),
# so the trailing slash is required.
path = "/content/drive/My Drive/fire_predict/base_data/"

def f_data(train, val, test):
    """Separate the label and turn the three splits into numeric feature frames.

    The feature frames are stacked so that every object-dtype column is
    integer-encoded with one shared category mapping, then split back into
    their original row ranges. Remaining missing values are filled with -1
    (missing categoricals are already encoded as -1 by ``Categorical.codes``).

    Parameters
    ----------
    train, val : pandas.DataFrame
        Frames that contain the label column ``fr_yn``.
    test : pandas.DataFrame
        Frame to predict on; ``fr_yn`` is dropped if present and may be absent.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test)``.

    Raises
    ------
    KeyError
        If ``train`` or ``val`` has no ``fr_yn`` column.
    """
    label = 'fr_yn'

    # Keyword form: the positional ``axis`` argument of drop() was removed in pandas 2.0.
    X_train = train.drop(columns=[label])
    y_train = train[label]
    X_val = val.drop(columns=[label])
    y_val = val[label]
    X_test = test.drop(columns=[label], errors='ignore')

    df_all = pd.concat([X_train, X_val, X_test])

    categorical_cols = df_all.select_dtypes(['object']).columns
    for col in categorical_cols:
        df_all[col] = pd.Categorical(df_all[col]).codes

    # Split by explicit positions; negative offsets such as [-len(test):]
    # select the wrong rows when a split is empty because -0 == 0.
    n_train = len(X_train)
    n_val = len(X_val)
    X_train = df_all.iloc[:n_train].fillna(-1)
    X_val = df_all.iloc[n_train:n_train + n_val].fillna(-1)
    X_test = df_all.iloc[n_train + n_val:].fillna(-1)

    return X_train, y_train, X_val, y_val, X_test

def f1_rfc(X_train, y_train, X_val, y_val, n_runs=10, base_seed=42):
    """Mean validation F1 of the tuned random forest over several seeds.

    Each repetition fits a fresh forest with seed ``base_seed + run``. Reusing
    one fixed seed for every repetition would fit the same model each time, so
    the mean would just be a single score computed repeatedly.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    n_runs : int, default 10
        Number of forests to fit and average over.
    base_seed : int, default 42
        Seed of the first repetition.

    Returns
    -------
    float
        Arithmetic mean of the per-run F1 scores.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    scores = []
    for run in tqdm(range(n_runs)):
        # "sqrt" is what "auto" meant for classifiers; "auto" itself was
        # removed from scikit-learn in 1.3.
        model = RandomForestClassifier(bootstrap=True, max_depth=80,
                                       max_features="sqrt", min_samples_leaf=4,
                                       min_samples_split=5, n_estimators=200,
                                       random_state=base_seed + run)
        model.fit(X_train, y_train)
        scores.append(f1_score(y_val, model.predict(X_val)))
    return sum(scores) / len(scores)

def f1_rfc_2(X_train, y_train, X_val, y_val, n_runs=10, base_seed=42):
    """Mean validation F1 of a 1000-tree default random forest over several seeds.

    Each repetition fits a fresh forest with seed ``base_seed + run``. Reusing
    one fixed seed for every repetition would fit the same model each time, so
    the mean would just be a single score computed repeatedly.

    Parameters
    ----------
    X_train, y_train : training features and labels.
    X_val, y_val : validation features and labels.
    n_runs : int, default 10
        Number of forests to fit and average over.
    base_seed : int, default 42
        Seed of the first repetition.

    Returns
    -------
    float
        Arithmetic mean of the per-run F1 scores.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    scores = []
    for run in tqdm(range(n_runs)):
        model = RandomForestClassifier(random_state=base_seed + run,
                                       n_estimators=1000)
        model.fit(X_train, y_train)
        scores.append(f1_score(y_val, model.predict(X_val)))
    return sum(scores) / len(scores)

def preprocessing():
    """Load the train/validation/test CSVs and apply the shared cleaning steps.

    Files are read from the module-level ``path`` prefix. Steps, in order:

    1. Map the ``fr_yn`` label from 'N'/'Y' to 0/1 (train and validation only).
    2. Drop ``id`` and ``dt_of_fr``.
    3. Reduce ``dt_of_athrztn`` to a year; years above 3000 become missing.
    4. Shift ``tmprtr`` by 273.15 (Celsius to Kelvin).
    5. Blank out negative ``gas_engry_us_201507`` values (train only).
    6. Drop the columns at positions 16..135 of the cleaned training frame
       (the gas/electricity block) from all three frames.
    7. Append the matching ``df_eg_year*`` frame column-wise.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train_2, df_val_2, df_test_2)``.
    """
    df_train = pd.read_csv(path + "PJT002_train.csv")
    df_val = pd.read_csv(path + "PJT002_validation.csv")
    df_test = pd.read_csv(path + "PJT002_test.csv")

    df_eg_year = pd.read_csv(path + "df_eg_year.csv")
    df_eg_year_val = pd.read_csv(path + "df_eg_year_val.csv")
    df_eg_year_test = pd.read_csv(path + "df_eg_year_test.csv")

    # label -> binary
    binary_y = {'N': 0, 'Y': 1}
    df_train['fr_yn'] = df_train['fr_yn'].map(binary_y)
    df_val['fr_yn'] = df_val['fr_yn'].map(binary_y)

    def year(x):
        """Reduce a date-like value to its leading year digits, or None if unusable."""
        if isinstance(x, str):
            # "NaN" is a textual missing marker; other strings start with YYYY.
            return None if x == "NaN" else int(x[:4])
        # Accept integers as well as floats: a date column without missing
        # values is read as int64 and would otherwise lose every year.
        if isinstance(x, (bool, np.bool_)) or not isinstance(
                x, (int, float, np.integer, np.floating)):
            return None
        if math.isnan(x) or math.isinf(x):
            return None
        whole = int(x)
        # Strip trailing date digits according to the magnitude of the number.
        for lower_bound, divisor in ((10000000, 10000), (1000000, 1000),
                                     (100000, 100), (10000, 10)):
            if whole > lower_bound:
                return x // divisor
        return x if whole > 1000 else None

    def clean(df):
        """Apply the cleaning steps that are identical for every split."""
        df = df.drop(columns=["id", "dt_of_fr"])
        # to_numeric guarantees a numeric column even when every year is
        # missing, so the > 3000 comparison never sees a None object.
        years = pd.to_numeric(df["dt_of_athrztn"].apply(year), errors="coerce")
        df["dt_of_athrztn"] = years.mask(years > 3000)
        # tmprtr c -> k
        df["tmprtr"] = df["tmprtr"] + 273.15
        return df

    df_train_2 = clean(df_train)
    df_val_2 = clean(df_val)
    df_test_2 = clean(df_test)

    # gas_engry_us_201507 x < 0 -> remove
    # NOTE(review): applied to the training split only; confirm whether the
    # validation and test splits should be treated the same way.
    gas_201507 = df_train_2["gas_engry_us_201507"]
    df_train_2["gas_engry_us_201507"] = gas_201507.mask(gas_201507 < 0)

    # drop gas, elec (selected by position in the cleaned training frame)
    gas_elec_columns = list(df_train_2.iloc[:, 16:136].columns)
    df_train_2 = df_train_2.drop(columns=gas_elec_columns)
    df_val_2 = df_val_2.drop(columns=gas_elec_columns)
    df_test_2 = df_test_2.drop(columns=gas_elec_columns)

    # concat eg_year
    df_train_2 = pd.concat([df_train_2, df_eg_year], axis=1)
    df_val_2 = pd.concat([df_val_2, df_eg_year_val], axis=1)
    df_test_2 = pd.concat([df_test_2, df_eg_year_test], axis=1)

    return df_train_2, df_val_2, df_test_2

In [0]:
from sklearn.impute import SimpleImputer

# Build the frames in this cell so it also runs on a fresh kernel; they were
# otherwise only defined by cells further down the notebook.
df_train_2, df_val_2, df_test_2 = preprocessing()

# Numeric training columns that have at least one observed value. Columns that
# are entirely missing have no mean, and SimpleImputer would drop them, which
# would break the column labels of the rebuilt frames.
numeric_cols = [
    col for col in df_train_2.select_dtypes("number").columns
    if df_train_2[col].notna().any()
]

# Learn the column means from the training split only and reuse them for the
# validation split, so validation rows are filled with the same statistics
# the model was trained with.
imp_mean = SimpleImputer(strategy='mean')
imputed_train_df = imp_mean.fit_transform(df_train_2[numeric_cols])
df_train_sinf_1 = pd.DataFrame(imputed_train_df, columns=numeric_cols,
                               index=df_train_2.index)

imputed_val_df = imp_mean.transform(df_val_2[numeric_cols])
df_val_sinf_1 = pd.DataFrame(imputed_val_df, columns=numeric_cols,
                             index=df_val_2.index)

In [46]:
# Rebuild the cleaned frames, then overwrite the numeric columns of the train
# and validation frames with their mean-imputed versions (df_*_sinf_1).
# df_test_2 is not overwritten here.
df_train_2, df_val_2, df_test_2 = preprocessing()
df_train_2[df_train_sinf_1.columns] = df_train_sinf_1
df_val_2[df_val_sinf_1.columns] = df_val_sinf_1
# Encode object columns / fill remaining NaNs, then score the 1000-tree forest
# on the validation split. NOTE(review): the recorded run of this cell ended in
# a KeyboardInterrupt, so no score was captured.
X_train, y_train, X_val, y_val, X_test = f_data(df_train_2, df_val_2, df_test_2)
f1_rfc_2(X_train, y_train, X_val, y_val)

  if self.run_code(code, result):

  0%|          | 0/10 [00:00<?, ?it/s][A
[A

KeyboardInterrupt: ignored

In [0]:
# Rebuild the cleaned frames and swap in the mean-imputed numeric columns.
df_train_2, df_val_2, df_test_2 = preprocessing()
df_train_2[df_train_sinf_1.columns] = df_train_sinf_1
df_val_2[df_val_sinf_1.columns] = df_val_sinf_1

# Fill the test split's numeric gaps with the training means as well. Without
# this, f_data() fills them with -1, a value the model never saw in those
# columns during training. Mean imputation leaves a column's mean unchanged,
# so the means of the imputed training frame are the training means.
train_means = df_train_sinf_1.mean()
shared_cols = [
    col for col in train_means.index
    if col in df_test_2.columns and col != 'fr_yn'
]
df_test_2[shared_cols] = df_test_2[shared_cols].fillna(train_means[shared_cols])

X_train, y_train, X_val, y_val, X_test = f_data(df_train_2, df_val_2, df_test_2)

# Fit the model that is used for the submission and report its validation F1.
model = RandomForestClassifier(random_state=42, n_estimators=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
result = f1_score(y_val, y_pred)
result


  if self.run_code(code, result):


In [0]:
# Predict labels for the test features with the forest fitted above; the bare
# expression on the last line displays the resulting array.
submission_pred = model.predict(X_test)
submission_pred

array([0., 0., 0., ..., 0., 0., 1.])

In [0]:
# Number of predictions produced; the recorded run shows (2957,).
submission_pred.shape

(2957,)

In [0]:
# Load the submission template and fill its label column with the predictions.
submission = pd.read_csv(path + "PJT002_submission.csv")
submission["fr_yn"] = submission_pred
# Translate 0/1 back to "N"/"Y" in the label column only. A frame-wide
# replace would also rewrite any 0 or 1 found in the other columns
# (for example an id column).
submission["fr_yn"] = submission["fr_yn"].replace({0: "N", 1: "Y"})
submission.to_csv(path +"submission_12_13.csv", index=False)