In [1]:
import pandas as pd 
import numpy as np
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.svm import LinearSVR
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
MAX_TOWERS = 6
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
np.random.seed(17)
import random
random.seed(17)
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

  from numpy.core.umath_tests import inner1d


In [2]:
def get_train():
    """Load the training table and left-join its auxiliary tables.

    Reads the main training CSV plus the "frac" and "gdis" auxiliary CSVs
    (all cp1251-encoded) and left-joins both auxiliary tables onto the main
    one by the "Скважина" column. Prints the shape of the merged frame.

    Returns:
        pandas.DataFrame: the merged training records.
    """
    train_main = pd.read_csv("../data/task1/train_1.7.csv", encoding="cp1251")
    train_aux_frac = pd.read_csv("../data/task1_additional/frac_train_1.csv", encoding="cp1251")
    train_aux_gdis = pd.read_csv("../data/task1_additional/gdis_train1.2.csv", encoding="cp1251")
    # The coordinates CSV is intentionally not read: it was never merged into
    # the result, so loading it only cost I/O and added a failure point.

    train_frac_main = pd.merge(train_main, train_aux_frac, how="left", on="Скважина")
    all_recs = pd.merge(train_frac_main, train_aux_gdis, how="left", on="Скважина")
    print(all_recs.shape)
    return all_recs

def get_test():
    """Load the test table and left-join its auxiliary tables.

    Reads the main test CSV plus the "frac" and "gdis" auxiliary test CSVs
    (all cp1251-encoded) and left-joins both auxiliary tables onto the main
    one by the "Скважина" column. Prints the shape of the merged frame.

    Returns:
        pandas.DataFrame: the merged test records.
    """
    test_main = pd.read_csv("../data/task1/test_1.9.csv", encoding="cp1251")
    test_aux_frac = pd.read_csv("../data/task1_additional/frac_test_1.csv", encoding="cp1251")
    test_aux_gdis = pd.read_csv("../data/task1_additional/gdis_test1.2.csv", encoding="cp1251")
    # The coordinates CSV is intentionally not read: the previous code loaded
    # the *train* coordinates file here and never used the result.

    test_frac_main = pd.merge(test_main, test_aux_frac, how="left", on="Скважина")
    all_recs = pd.merge(test_frac_main, test_aux_gdis, how="left", on="Скважина")
    print(all_recs.shape)
    return all_recs

In [3]:
#sort by converted date and group
def convert_and_sort(df):
    """Return a copy of ``df`` with "Дата" parsed to datetimes, sorted by well and date.

    The input frame is left untouched: assigning into it directly modified the
    caller's data and triggered SettingWithCopyWarning when ``df`` was a slice
    of another frame.

    Args:
        df: frame containing the "Скважина" and "Дата" columns.

    Returns:
        pandas.DataFrame: new frame sorted by ["Скважина", "Дата"].
    """
    # Element-wise parsing is kept so each value is parsed independently.
    parsed_dates = df["Дата"].apply(pd.to_datetime)
    converted = df.assign(**{"Дата": parsed_dates})
    return converted.sort_values(by=["Скважина", "Дата"])

def get_non_useful(df):
    """Return the names of the columns of ``df`` in which every value is null."""
    return [name for name in df.columns if df[name].isnull().all()]

def drop_non_useful(train, test):
    """Drop from both frames every column that is entirely null in either one.

    Prints the set of dropped column names.

    Args:
        train: training frame.
        test: test frame.

    Returns:
        tuple: (train, test) without the all-null columns.
    """
    non_useful = set(get_non_useful(train)) | set(get_non_useful(test))
    print("%s dropped"% non_useful)
    to_drop = list(non_useful)
    # errors="ignore": a column that is all-null in one frame may not exist in
    # the other one at all, which used to raise KeyError here.
    return (train.drop(columns=to_drop, errors="ignore"),
            test.drop(columns=to_drop, errors="ignore"))

def get_float(v):
    """Convert a value written with a decimal comma (or point) to ``float``.

    Missing values (``None``, empty/blank strings, any spelling of NaN) are
    returned as ``float("nan")`` so the result is always a float. Previously
    the literal string "NaN" was returned unchanged as a string.

    Args:
        v: number, or string such as "1,5" / "1.5".

    Returns:
        float: the parsed value, NaN when the value is missing.

    Raises:
        ValueError: if the text is not a valid number.
    """
    if v is None:
        return float("nan")
    text = str(v).strip()
    if not text:
        return float("nan")
    # float() itself understands "nan"/"NaN", so no special-casing is needed.
    return float(text.replace(",", "."))

def get_target(df, column="Нефть, т"):
    """Split the target column off ``df``.

    Prints which column was dropped.

    Returns:
        tuple: (``df`` without ``column``, the column's values passed through
        ``get_float``).
    """
    raw_target = df[column]
    print("%s dropped"% column)
    features = df.drop(columns=[column])
    return features, raw_target.apply(get_float)

# drop the train columns that are not present in test
def drop_not_present(train, test):
    """Remove from ``train`` the columns that ``test`` does not have.

    Prints the list of removed columns. ``test`` is returned as-is.
    """
    train_only = set(train.columns) - set(test.columns)
    train_only = list(train_only)
    print("%s dropped"% train_only)
    trimmed_train = train.drop(columns=train_only)
    return trimmed_train, test
    
def show_uniq_test_train(train, test):
    """Print the unique train and test values of every low-cardinality column.

    Only columns with fewer than 100 distinct values in ``train`` are shown;
    useful for spotting columns that hold nothing but zeros and NaNs.
    """
    for name in train.columns:
        train_values = train[name].unique()
        if len(train_values) >= 100:
            continue
        test_values = test[name].unique()
        print("%s ;train: %s; test:%s"%(name, train_values, test_values))

In [4]:
def common_data_pipeline(train, test):
    """Run the cleaning steps shared by every feature group.

    Steps, in order: parse dates and sort ``train``; drop columns that are
    entirely null in either frame; split the target off ``train``; drop the
    ``train`` columns missing from ``test``. Shapes are printed before and
    after.

    Returns:
        tuple: (train, test, y) where ``y`` is the target taken from ``train``.
    """
    for frame in (train, test):
        print(frame.shape)

    # Only train is converted/sorted here; test is passed through as given.
    train = convert_and_sort(train)
    train, test = drop_non_useful(train, test)
    # remove the target from train
    train, y = get_target(train)
    train, test = drop_not_present(train, test)

    for frame in (train, test):
        print(frame.shape)

    return train, test, y

In [5]:
def get_existed(columns, df):
    """Return the names from ``columns`` that exist in ``df``, in a stable order.

    The result follows the order of ``columns`` (duplicates removed). A plain
    set intersection yields a hash-dependent order, which made the column
    order of downstream feature frames differ from run to run.

    Args:
        columns: iterable of candidate column names.
        df: frame whose columns are checked.

    Returns:
        list: candidate names present in ``df.columns``.
    """
    present = set(df.columns)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [name for name in dict.fromkeys(columns) if name in present]

def split_continious_date_categorical_text(df):
    """Split ``df`` column-wise into group-id, continuous, date, categorical and text parts.

    Column membership is defined by the hard-coded name lists below; every
    column of ``df`` that is in none of them is treated as continuous.
    Date, categorical and text names that are absent from ``df`` are skipped
    (via ``get_existed``); the group-id column must be present.

    The continuous columns are returned in the order they appear in ``df``.
    Building that list from set arithmetic gave a hash-dependent order that
    could differ between runs and between the train and test frames.

    Returns:
        tuple: (group_id_df, continuous_df, dates_df, categorical_df, text_df)
    """
    group_id = ["Скважина"]
    # NOTE: "Состояние на конец месяца" is listed under both `text` and
    # `categorical`, so it appears in both returned frames.
    text = ["Причина простоя",
            "Куст",
            "Состояние на конец месяца",
            "Причина простоя.1",
            "Мероприятия",
            "Проппант"]
    categorical = ["Тип испытания",
                   "Тип скважины",
                   "Неустановившийся режим",
                   "ГТМ",
                   "Метод",
                   "Характер работы",
                   "Состояние",
                   "Пласт МЭР", 
                   "Способ эксплуатации", 
                   "Тип насоса", 
                   "Состояние на конец месяца", 
                   "Номер бригады", 
                   "Фонтан через насос", 
                   "Нерентабельная",
                   "Назначение по проекту",
                   "Группа фонда",
                   "Тип дополнительного оборудования",
                   "Марка ПЭД",
                   "Тип ГЗУ",
                   "ДНС",
                   "КНС",
                   # potentially useless
                   "Диаметр плунжера",
                   "Природный газ, м3",
                   "Конденсат, т",
                   "Длина хода плунжера ШГН",
                   "Коэффициент подачи насоса",
                   "Дебит конденсата",
                   "Вязкость воды в пластовых условиях",
                   "Газ из газовой шапки, м3",
                   "Число качаний ШГН",
                   "Коэффициент сепарации",
                   "SKIN",
                   "КН закрепленный",
                   # marked "radically different" by the author
                   # (presumably between train and test — TODO confirm)
                   "Время в работе",
                   "Радиус контура питания",
                   "Время в накоплении",
                   "Время накопления",
                   "Агент закачки"
                   ]
    dates = ["Дата", 
             "Дата ГРП",
             "Время до псевдоуст-ся режима", 
             "Дата запуска после КРС", 
             "Дата пуска", 
             "Дата останова",
             "Дата ввода в эксплуатацию"]
    
    excluded = set(dates) | set(categorical) | set(text) | set(group_id)
    # Keep df's own column order (de-duplicated) for reproducible features.
    continious = [name for name in dict.fromkeys(df.columns) if name not in excluded]
    return (df[group_id],df[continious], df[get_existed(dates,df)], df[get_existed(categorical, df)],
            df[get_existed(text, df)])

In [6]:
def get_object_columns(df):
    """Return the names of the columns of ``df`` whose dtype is not float64.

    These are the columns that still need conversion to float (strings with a
    decimal comma, integers, ...).

    ``np.float64`` replaces ``pd.np.float``: ``pd.np`` and the ``np.float``
    alias no longer exist in current pandas / NumPy releases, and the alias
    meant the 64-bit Python float anyway.
    """
    return [name for name in df.columns if df[name].dtype != np.float64]

def convert_locale_to_float(df):
    """Return a copy of ``df`` with every non-float64 column converted to float.

    Each value of the columns reported by ``get_object_columns`` is passed
    through ``get_float`` (which accepts a decimal comma).

    Args:
        df: frame of numeric-like columns.

    Returns:
        pandas.DataFrame: new frame; ``df`` itself is not modified.
    """
    converted = df.copy()
    for name in get_object_columns(df):
        # Plain column assignment replaces the column, so it gets a float
        # dtype. `converted.loc[:, name] = ...` writes in place on recent
        # pandas and would keep the original object dtype.
        converted[name] = df[name].apply(get_float).astype(float)
    return converted
        
def fill_with_mean(train, test):
    """Fill missing values in both frames with the column means of ``train``.

    Returns:
        tuple: (filled train, filled test).
    """
    train_means = train.mean()
    return train.fillna(train_means), test.fillna(train_means)

# the data is now clean but not yet normalized, so normalize it next
def normalize(train, test):
    """Standardize both frames with a ``StandardScaler`` fitted on ``train``.

    ``test`` is first re-ordered to ``train``'s column order. The scaler works
    on positions, not labels, so a different column order in ``test`` would
    silently scale each feature with another feature's statistics.

    Args:
        train: numeric training frame without missing values.
        test: numeric test frame containing (at least) the same columns.

    Returns:
        tuple: (scaled train, scaled test) as DataFrames that keep the
        original indexes and use ``train``'s column order.

    Raises:
        KeyError: if ``test`` lacks one of ``train``'s columns.
    """
    test = test[train.columns]
    scaler = StandardScaler()
    norm_train = pd.DataFrame(scaler.fit_transform(train), columns=train.columns, index=train.index)
    norm_test = pd.DataFrame(scaler.transform(test), columns=test.columns, index=test.index)
    return norm_train, norm_test

In [7]:
def cont_transform_pipeline(train, test):
    """Prepare the continuous features: to float, fill with train means, standardize.

    Prints whether any null value is left in either result, then both shapes.

    Returns:
        tuple: (train_cont, test_cont).
    """
    filled_train, filled_test = fill_with_mean(
        convert_locale_to_float(train),
        convert_locale_to_float(test),
    )
    train_cont, test_cont = normalize(filled_train, filled_test)

    any_null_left = train_cont.isnull().values.any() or test_cont.isnull().values.any()
    print(any_null_left)

    for frame in (train_cont, test_cont):
        print(frame.shape)
    return train_cont, test_cont

In [8]:
def transform_cats_to_labels(train_cat, test_cat):
    """Label-encode every column of ``train_cat`` and the same column of ``test_cat``.

    Values are cast to ``str`` first. One ``LabelEncoder`` per column is fitted
    on the train and test values together, so both frames share one code
    mapping.

    Returns:
        tuple: (encoded train copy, encoded test copy).
    """
    encoded_train, encoded_test = train_cat.copy(), test_cat.copy()
    for name in train_cat.columns:
        train_values = train_cat[name].astype(str)
        test_values = test_cat[name].astype(str)
        encoder = LabelEncoder().fit(pd.concat([train_values, test_values]))
        encoded_train[name] = encoder.transform(train_values).reshape(-1, 1)
        encoded_test[name] = encoder.transform(test_values).reshape(-1, 1)
    return encoded_train, encoded_test

In [9]:
def cat_transform_pipeline(train, test):
    """Label-encode the categorical features and print the resulting shapes.

    Returns:
        tuple: (train_cat, test_cat).
    """
    train_cat, test_cat = transform_cats_to_labels(train, test)
    for frame in (train_cat, test_cat):
        print(frame.shape)
    return train_cat, test_cat

In [10]:
def clean_non_targeted(train_array, y_train):
    """Drop the rows whose target is null from every frame and from the target.

    Prints the shape of each cleaned object.

    Args:
        train_array: list of DataFrames indexed like ``y_train``. The list is
            not modified (the target used to be appended to it in place).
        y_train: target Series.

    Returns:
        list: the cleaned frames in input order, followed by the cleaned target.
    """
    # labels of the rows with a missing target
    missing_target = y_train[y_train.isnull()].index
    cleaned = []
    for item in list(train_array) + [y_train]:
        kept = item.drop(index=missing_target)
        print(kept.shape)
        cleaned.append(kept)
    return cleaned

In [11]:
def get_preds_for_cats(train, test, y):
    """Get CatBoost predictions from the categorical features.

    Train predictions come from ``cross_val_predict``; test predictions come
    from the same regressor fitted afterwards on all of ``train``.

    Returns:
        tuple: (train predictions as returned by ``cross_val_predict``,
        test predictions as a Series indexed like ``test``).
    """
    model = CatBoostRegressor(logging_level="Silent")
    cv_train_preds = cross_val_predict(model, train, y)
    model.fit(train, y=y)
    test_preds = model.predict(test)
    return cv_train_preds, pd.Series(test_preds, index=test.index)

In [12]:
def get_cont_ensemble():
    """Create fresh base regressors for the continuous features.

    The order matches ``get_cont_ensemble_names``: ridge, random forest,
    linear SVR.
    """
    return [
        Ridge(),
        RandomForestRegressor(n_jobs=-1, n_estimators=50),
        LinearSVR(),
    ]

def get_cont_ensemble_names():
    """Return the column names for the base regressors, in ensemble order."""
    names = ("ridge", "rtree", "svr")
    return list(names)

In [13]:
def get_meta_train_preds(X, y, train_mixture, mix_cols):
    """Build the training meta-features for stacking.

    One column of ``cross_val_predict`` output per base regressor, followed by
    the extra prediction column(s) in ``train_mixture``.

    Args:
        X: continuous training features.
        y: training target; its index becomes the index of the result.
        train_mixture: extra predictions to stack after the base models.
        mix_cols: list of column names for ``train_mixture``.

    Returns:
        pandas.DataFrame: one row per training sample, one column per predictor.
    """
    rows = [cross_val_predict(model, X, y, n_jobs=-1) for model in get_cont_ensemble()]
    rows.append(train_mixture)
    stacked = np.vstack(rows).T
    return pd.DataFrame(stacked, index=y.index, columns=get_cont_ensemble_names()+mix_cols)

def get_meta_test_predict(X_train, y_train, X_test, test_mixture, mix_cols):
    """Build the test meta-features for stacking.

    Each base regressor is fitted on the whole training set and predicts
    ``X_test``; its cross-validated negative MAE scores are printed first as a
    diagnostic. ``test_mixture`` is stacked after the base models.

    Returns:
        pandas.DataFrame: indexed like ``X_test``, one column per predictor.
    """
    rows = []
    for model in get_cont_ensemble():
        fold_scores = cross_val_score(model, X_train, y_train, n_jobs=-1, scoring="neg_mean_absolute_error")
        print(fold_scores)
        fitted = model.fit(X_train, y_train)
        rows.append(fitted.predict(X_test))
    rows.append(test_mixture)
    stacked = np.vstack(rows).T
    return  pd.DataFrame(stacked, index=X_test.index, columns=get_cont_ensemble_names()+mix_cols)

def get_stacked_ensemble_predict(X_meta, y, X_test):
    """Fit an XGBoost regressor on the meta-features and predict ``X_test``.

    Returns:
        pandas.DataFrame: a single column of predictions indexed like ``X_test``.
    """
    meta_model = xgb.XGBRegressor()
    meta_model.fit(X_meta, y)
    predictions = meta_model.predict(X_test)
    return pd.DataFrame(predictions, index=X_test.index)

In [14]:
def get_n_item_index(group_size, df, group):
    """List the index windows of ``group_size`` consecutive rows within each well.

    ``df`` and ``group`` are joined column-wise and grouped by "Скважина".
    For a well with ``k`` rows the windows start at positions
    ``0 .. k - group_size - 1``, i.e. the last possible window is left out.

    Returns:
        list: one pandas Index of row labels per window.
    """
    joined = pd.concat([df, group], axis = 1)
    windows = []
    for _, well_rows in joined.groupby(["Скважина"]):
        labels = well_rows.index
        # a non-positive count gives an empty range, so short wells add nothing
        n_windows = len(labels) - group_size
        windows.extend(labels[start:start + group_size] for start in range(n_windows))
    return windows

def get_timed_ds(meta_size, df, group, y):
    """Extend ``df`` with ``meta_size`` target values taken from a per-well window.

    For ``meta_size == 0`` the frame is returned unchanged. For
    ``meta_size >= 1`` every window from ``get_n_item_index`` yields one row:
    the ``df`` row at the window's first label, plus columns
    ``meta0 .. meta{meta_size-1}`` holding ``y`` at the window's labels.
    Any other ``meta_size`` yields ``None``.
    """
    if meta_size == 0:
        return df
    if not meta_size >= 1:
        return None

    windows = get_n_item_index(meta_size, df, group)
    meta_columns = ["meta%s"%str(i) for i in range(meta_size)]
    first_labels = [window[0] for window in windows]
    window_targets = [list(y.loc[window]) for window in windows]
    metas_df = pd.DataFrame.from_records(window_targets, index=first_labels, columns=meta_columns)
    return pd.concat([df.loc[first_labels], metas_df], axis=1)
    
def get_n_tower_predictions(n, train, y, test, train_group, train_mix, test_mix, mix_col):
    """Produce ``n`` chained ("tower") stacked predictions for the test set.

    Tower ``i`` is trained on the meta-features extended with ``i`` meta
    columns (see ``get_timed_ds``). Its test prediction is stored as column
    ``meta{i}`` of the test meta-features, so later towers use it as input.

    Returns:
        pandas.DataFrame: one column per tower, indexed like ``test``.
    """
    meta_train = get_meta_train_preds(train, y, train_mixture=train_mix, mix_cols=mix_col)
    meta_test = get_meta_test_predict(train, y, test, test_mixture=test_mix, mix_cols=mix_col)
    tower_outputs = []
    for tower in range(n):
        tower_train = get_timed_ds(tower, meta_train, train_group, y)
        # NOTE(review): for tower >= 1 the "meta0" column built by get_timed_ds
        # is y at the same row label that is used as the target here — this
        # looks like target leakage; confirm whether the target was meant to
        # be the value that follows the window.
        tower_target = y.loc[tower_train.index]
        tower_pred = get_stacked_ensemble_predict(tower_train, tower_target, meta_test)
        tower_outputs.append(tower_pred)
        # feed this tower's prediction to the next one
        meta_test["meta%s"%tower] = tower_pred
    return pd.concat(tower_outputs, axis=1)

In [15]:
def create_submission(fname, df, constant=None):
    """Write predictions to a submission CSV with columns ``_ID_`` and ``_VAL_``.

    The previous body read an undefined local (``final_pred``) and ignored the
    ``df`` argument, so every call raised UnboundLocalError.

    Args:
        fname: path of the CSV file to write.
        df: predictions, as a 1-D array-like or a Series. A Series keeps its
            own index as ``_ID_``; otherwise a 0-based range is used.
        constant: optional target mean. When given, the predictions are
            shifted so that their mean equals this value; predictions already
            centred on it stay unchanged up to rounding.
    """
    final_pred = pd.Series(df)
    if constant is not None:
        final_pred = final_pred + (constant - final_pred.mean())
    final_pred.to_csv(fname,header=["_VAL_"],index_label=["_ID_"])

In [16]:
def get_clean_data(train, test):
    """Turn the raw train/test frames into model-ready inputs.

    Runs the common cleaning, splits the columns by kind, transforms the
    continuous and categorical parts, removes training rows without a target
    and computes CatBoost predictions from the categorical features. The date
    and text parts are split off but not used further.

    Returns:
        tuple: (train_cont, y_train, test_cont, train_group,
        train_cat_preds, test_cat_preds).
    """
    train, test, y_train = common_data_pipeline(train, test)

    train_parts = split_continious_date_categorical_text(train)
    test_parts = split_continious_date_categorical_text(test)
    train_group, train_cont, _train_dat, train_cat, _train_text = train_parts
    _test_group, test_cont, _test_dat, test_cat, _test_text = test_parts

    train_cont, test_cont = cont_transform_pipeline(train_cont, test_cont)
    train_cat, test_cat = cat_transform_pipeline(train_cat, test_cat)

    # drop the training rows that have no target value
    train_cont, train_group, train_cat, y_train = clean_non_targeted(
        [train_cont, train_group, train_cat], y_train
    )
    train_cat_preds, test_cat_preds = get_preds_for_cats(train_cat, test_cat, y_train)
    return train_cont, y_train, test_cont, train_group, train_cat_preds, test_cat_preds

In [17]:
def get_prediction(train, test, constant = 701.4750):
    """Predict the target for ``test`` and shift the result to a given mean.

    Six tower predictions per test row are flattened row by row into one 1-D
    array, then a constant offset is added so that the mean of the result
    equals ``constant``. The offset is printed.

    Returns:
        numpy.ndarray: the shifted, flattened predictions.
    """
    (train_cont, y_train, test_cont,
     train_group, train_cat_preds, test_cat_preds) = get_clean_data(train, test)
    tower_preds = get_n_tower_predictions(
        6, train_cont, y_train, test_cont, train_group, train_cat_preds, test_cat_preds, ["catboost"]
    )
    time_serie_pred = np.squeeze(tower_preds.values.reshape(-1, 1))
    shift = constant - np.mean(time_serie_pred)
    print(shift)
    return time_serie_pred + shift

In [18]:
# Train on the full training set and predict for the test set; the default
# `constant` of get_prediction is used for the final mean shift.
preds = get_prediction(get_train(), get_test())

(5735, 147)
(319, 138)
(5735, 147)
(319, 138)
{'Агент закачки', 'Станок-качалка', 'Фирма ГРП', 'Тип газосепаратора', 'Примечание'} dropped
Нефть, т dropped
['ТП(ИДН) Дебит жидкости скорр-ый', 'ТП(ГРП) Дебит жидкости скорр-ый', 'ТП(ГРП) Дебит жидкости', 'ГП - Общий прирост Qн', 'Дебит жидкости', 'ТП(ИДН) Дебит жидкости', 'Жидкость, м3', 'Нефть, м3'] dropped
(5735, 133)
(319, 133)
False
(5735, 83)
(319, 83)
(5735, 37)
(319, 37)
(4764, 83)
(4764, 1)
(4764, 37)
(4764,)
[-124.54067161 -130.03049227 -137.52117274]
[-49.1389539  -52.8103233  -61.15671562]
[ -90.28929007 -121.49943394 -110.63281796]
458.4420562744141


In [None]:
# `preds` is already shifted to the target mean inside get_prediction, so no
# constant is passed here (the extra keyword made this call raise TypeError).
create_submission("all_data_sub.csv", preds)

In [19]:
def get_cleaned_test(test, size=6):
    """Build a validation set shaped like the real test set from labelled rows.

    After splitting off the target and dropping rows without one, only wells
    with exactly ``size`` rows are kept: the first row of each becomes the
    feature row, and the targets of all its rows become the expected values.

    Returns:
        tuple: (feature rows, their targets, mean of those targets), or
        ``None`` when no well has exactly ``size`` rows.
    """
    test, y_test = get_target(test)
    test, y_test = clean_non_targeted([test], y_test)
    first_rows, all_rows = [], []
    for _, well_rows in test.groupby(["Скважина"]):
        if len(well_rows) == size:
            first_rows.append(well_rows.index[0])
            all_rows.extend(well_rows.index)
    if not (first_rows and all_rows):
        return None
    test = test.loc[first_rows]
    y_test = y_test[all_rows]
    constant = np.mean(y_test.values)
    return test, y_test, constant

In [20]:
def cross_validate(train, n_folds = 2):
    """Estimate the MAE of the full prediction pipeline with K-fold CV.

    For each fold the held-out part is reshaped by ``get_cleaned_test``, the
    pipeline is trained on the rest, and the mean absolute error is printed
    together with the running mean over the folds so far.

    Args:
        train: labelled training frame (as returned by ``get_train``).
        n_folds: number of folds.

    Returns:
        list: the per-fold mean absolute errors. They used to be collected
        but discarded.
    """
    errors = []
    # NOTE(review): KFold splits by row, so rows of one well can land in both
    # the training and the held-out part — confirm this is intended.
    k_fold = KFold(n_splits=n_folds)
    for tr_ix, test_ix in k_fold.split(train):
        print("new fold started")
        train_cv = train.iloc[tr_ix]
        test_cv = train.iloc[test_ix]
        test_cv, y_test, constant = get_cleaned_test(test_cv)
        # the fold's own target mean is used for the final shift
        test_preds = get_prediction(train_cv, test_cv, constant=constant)
        error = mean_absolute_error(y_true=y_test, y_pred=test_preds)
        print("fold error:%s" % error)
        errors.append(error)
        print("overall error: %s" % np.mean(errors))
    return errors

In [None]:
%%time
# Cross-validate the whole pipeline on the training data with the default number of folds.
cross_validate(get_train())

(5735, 147)
new fold started
Нефть, т dropped
(2432, 146)
(2432,)
(2867, 147)
(90, 146)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


{'Тип газосепаратора', 'Мероприятия', 'КНС', 'Станок-качалка', 'Фирма ГРП', 'Причина простоя.1', 'Примечание'} dropped
Нефть, т dropped
[] dropped
(2867, 139)
(90, 139)
False
(2867, 91)
(90, 91)
(2867, 37)
(90, 37)
(2332, 91)
(2332, 1)
(2332, 37)
(2332,)
[-4.60681326 -3.56905785 -3.89639142]
[-12.11927481  -4.83143372  -6.69613951]
[-29.06663199 -23.97000996 -28.0306708 ]
-231.75713937717018
fold error:318.18226066193756
overall error: 318.18226066193756
new fold started
Нефть, т dropped
(2332, 146)
(2332,)
(2868, 147)
(58, 146)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


{'Мероприятия', 'Станок-качалка', 'Причина простоя.1', 'Фирма ГРП', 'Тип газосепаратора', 'Примечание'} dropped
Нефть, т dropped
[] dropped
(2868, 140)
(58, 140)
False
(2868, 91)
(58, 91)
(2868, 38)
(58, 38)
(2432, 91)
(2432, 1)
(2432, 38)
(2432,)
