In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA 
from sklearn.preprocessing import Imputer
from scipy.stats import skew, norm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from itertools import repeat
from os import listdir

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
from matplotlib_venn import venn2, venn3
from sklearn.preprocessing import LabelEncoder

In [2]:
def conv(s):
    """Parse a decimal-comma numeric string into a float.

    Used as a ``pd.read_csv`` converter, so ``s`` is the raw text of one cell.

    Parameters
    ----------
    s : str
        Raw cell text, e.g. ``'12,5'``.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when the cell is empty or holds
        only whitespace.

    Raises
    ------
    ValueError
        If the text is non-blank and still not a valid number after the
        comma has been replaced with a dot.
    """
    text = s.strip()
    # A blank cell (including whitespace-only padding) is a missing value,
    # not a parse error.
    if not text:
        return np.nan
    return float(text.replace(',', '.'))

# Columns whose values are parsed with `conv` while the CSV files are read.
columns_to_conv = [
    'Нефть, т',
    'Конденсат, т',
    'Приемистость, м3/сут',
    'Обводненность (вес), %',
    'Нефть, м3',
    'Жидкость, м3',
    'Дебит конденсата',
    'Добыча растворенного газа, м3',
    'Дебит попутного газа, м3/сут',
    'Вода, т',
    'Жидкость, т',
    'Попутный газ, м3',
    'Закачка, м3',
    'Природный газ, м3',
    'Обводненность (масса), %',
]

# Column name -> converter mapping in the form expected by the `converters`
# argument of `pd.read_csv`; every listed column uses the same `conv` function.
convertors = {column: conv for column in columns_to_conv}

def conv_date(x):
    """Snap a date forward to the first day of a month.

    A value whose day is already 1 is returned unchanged; any other value is
    advanced with ``pd.offsets.MonthBegin(1)``, i.e. to the first day of the
    following month.
    """
    already_month_start = x.day == 1
    return x if already_month_start else x + pd.offsets.MonthBegin(1)

def month_to_num(x):
    """Replace Russian month abbreviations in a string with month numbers.

    Every occurrence of a lowercase three-letter abbreviation ('янв' ... 'дек')
    in ``x`` is substituted by the matching month number as text ('1' ... '12').
    All other characters are left as they are, and the new string is returned.
    """
    number_by_abbreviation = {
        'янв': '1', 'фев': '2', 'мар': '3', 'апр': '4',
        'май': '5', 'июн': '6', 'июл': '7', 'авг': '8',
        'сен': '9', 'окт': '10', 'ноя': '11', 'дек': '12',
    }
    converted = x
    for abbreviation, number in number_by_abbreviation.items():
        converted = converted.replace(abbreviation, number)
    return converted

In [3]:
def _read_task3_csv(path):
    """Read one raw task_3 CSV with the options shared by all three files.

    The first row is the header, the file is decoded as cp1251, the 'Дата'
    column is parsed as dates and the columns listed in `convertors` are
    parsed by their converter.
    """
    # NOTE(review): dates are parsed with pandas' default inference; if 'Дата'
    # is stored day-first in the raw files, `dayfirst=True` is needed - confirm.
    return pd.read_csv(path, header=0, encoding='cp1251',
                       parse_dates=['Дата'], converters=convertors)


df_train = _read_task3_csv('input/task_3/train.csv')
df_test_before = _read_task3_csv('input/task_3/test_before.csv')
df_test_after = _read_task3_csv('input/task_3/test_after_X.csv')

In [4]:
# Columns converted to float after `month_to_num` has replaced any Russian
# month abbreviation in their text with the month number.
# NOTE(review): presumably these are decimals that a spreadsheet export turned
# into date-like text (e.g. '1.май' for 1.5) - confirm against the raw files.
month_numeric_cols = ['heff', 'bo', 'kprod_calc']

# The same cleaning is applied to the train and test_before frames;
# df_test_after is not touched here.
for frame in (df_train, df_test_before):
    for col in month_numeric_cols:
        frame[col] = frame[col].astype(str).apply(month_to_num).astype(float)

    # Merge the 'НЕФ/НАГ' label into 'НАГ', then forward-fill the gaps.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling and
    # produces the same result.
    # NOTE(review): the fill runs over the whole frame in row order, not per
    # 'Скважина', so a gap at the start of one well takes the value of the
    # preceding rows - confirm this is intended.
    frame['Характер работы'] = (
        frame['Характер работы'].replace({'НЕФ/НАГ': 'НАГ'}).ffill()
    )

In [5]:
# Encode the 'group' column as integer codes, using one encoder for all frames.
lbl1 = LabelEncoder()

# Fit on every frame that is transformed below: LabelEncoder.transform raises
# ValueError for a label it did not see during fit, so leaving df_test_after
# out of the fit would fail as soon as it contains a group of its own.
# When df_test_after adds no new labels the resulting codes are unchanged.
lbl1.fit(
    pd.concat(
        (df_train['group'], df_test_before['group'], df_test_after['group']),
        axis=0,
    ).values
)

for frame in (df_train, df_test_before, df_test_after):
    frame['group'] = lbl1.transform(frame['group'].values)

# Fill missing values in these columns with the mean of the same column over
# the rows of the same well ('Скважина'). Each frame is filled from its own
# means; df_test_after is not processed here.
# `transform('mean')` returns the per-well mean aligned to the frame's own
# index, so the fill does not depend on the frame having a default RangeIndex
# (the previous set_index/reset_index round trip realigned by position).
# A well with no observed value in a column keeps NaN there.
for frame in (df_train, df_test_before):
    for col in ['bo', 'heff', 'h_vert', 'd_nkt']:
        well_mean = frame.groupby('Скважина')[col].transform('mean')
        frame[col] = frame[col].fillna(well_mean)

In [6]:
# Write the three processed frames to ./input/prepared_data without the index
# column. No encoding is passed, so pandas' default output encoding is used.
prepared_outputs = (
    (df_train, './input/prepared_data/train_proc.csv'),
    (df_test_before, './input/prepared_data/test_before_proc.csv'),
    (df_test_after, './input/prepared_data/test_after_proc.csv'),
)
for frame, out_path in prepared_outputs:
    frame.to_csv(out_path, index=False)