In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from math import sqrt
import datetime
import pickle
import ast
import gc
import re

import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer
from scipy.sparse import csr_matrix, hstack


gc.enable()

In [2]:
def rmsle(h, y): 
    """
    Compute the Root Mean Squared Log Error for hypthesis h and targets y

    Args:
        h - numpy array containing predictions with shape (n_samples, n_targets)
        y - numpy array containing targets with shape (n_samples, n_targets)
    """
    return np.sqrt(np.square(np.log(h + 1) - np.log(y + 1)).mean())

def rlmse_score(y_test, y_hat):
    rlmse = np.sqrt(np.mean((np.log(1+y_test) - np.log(1+y_hat))**2))
    return rlmse

In [3]:
#del df_test
#gc.collect()

In [None]:
df_train = pd.read_pickle('train_hack.pckl.zip')

In [None]:
df_train.head(5)

In [None]:
df_train.info()

In [None]:
# посмотрим количество уникальных значений для каждого столбца
columns = [s for s in df_train.columns if df_train[s].dtypes == 'int64']
for colum in columns:
    print(len(df_train[colum].value_counts()), colum)

In [None]:
columns = [s for s in df_train.columns if df_train[s].dtypes == 'bool']
for colum in columns:
    print(df_train[colum].value_counts())

In [None]:
df_train['can_buy'].value_counts()

In [10]:
df_train[df_train['price'] <= 0].head(10)

Unnamed: 0,can_buy,can_promote,category,contacts_visible,date_created,delivery_available,description,fields,id,images,location,mortgage_available,name,payment_available,price,subcategory,subway
72,False,False,5,True,1497593720,False,Отдам срочно шотландцев (скотиш страйт) девочк...,"[{'field': {'name': 'Животные', 'id': 5, 'slug...",4343d339c45ccdeb87773495,"[{'id': '594376f7f20263d7be2957f3', 'num': 1, ...","{'latitude': 53.433392, 'longitude': 56.058512}",False,Шотландцы,False,0,503,
137,False,False,17,True,1502886003,False,,"[{'field': {'name': 'Хэндмейд', 'id': 17, 'slu...",220cf017752d9e5837834995,"[{'id': '59943873bd36c026dc1a9602', 'num': 1, ...","{'latitude': 56.104621, 'longitude': 40.35235}",False,Куколка,False,0,1704,
211,False,False,22,True,1488269765,False,Отдам за большую шоколадку киндер в хорошем со...,"[{'field': {'name': 'Детский гардероб', 'id': ...",57b54b37a957e9745c135b85,"[{'id': '58b531c5cd3022009704651d', 'num': 1, ...","{'latitude': 55.567398, 'longitude': 42.016585}",False,Обувь,False,0,2209,
214,False,False,9,True,1481826557,False,Отдам даром халаты для дома. Размер 48-50. В н...,"[{'field': {'name': 'Женский гардероб', 'id': ...",e6ad9f053cb47ea8df0e2585,"[{'id': '5852e0fc1c40315d078642fe', 'num': 1, ...","{'latitude': 54.200047, 'longitude': 45.174511}",False,Халаты для дома.,False,0,901,
324,False,False,3,True,1501393626,False,"Отдам бесплатно мягкие игрушки,все что на фото...","[{'field': {'name': 'Детские товары', 'id': 3,...",33a394dbd8e9ba6cad27d795,"[{'id': '597d726b9e94ba8fca33a733', 'num': 1, ...","{'latitude': 56.016354, 'longitude': 92.85442}",False,Мягкие игрушки,False,0,314,
359,False,False,3,True,1479819776,False,Отдам бесплатно. Пюре из цв.капусты выдали как...,"[{'field': {'name': 'Детские товары', 'id': 3,...",5fe4431670d3f35d00244385,"[{'id': '583441f04b5593e8431fdf2c', 'num': 1, ...","{'latitude': 55.815158, 'longitude': 38.986879}",False,"Пюре цв.капуста и трусики ""Кораблик""",False,0,311,
397,False,False,9,True,1484397885,False,"Отдаю вещи,в пакетах есть платья,джинсы,майки,...","[{'field': {'name': 'Женский гардероб', 'id': ...",45778365f8f95540d3d1a785,"[{'id': '587a1c7d96ad844c96885edc', 'num': 1, ...","{'latitude': 47.204112, 'longitude': 39.631697}",False,Пакет вещей,False,0,901,
426,False,False,22,True,1487795454,False,Отдам бу 1 раз,"[{'field': {'name': 'Детский гардероб', 'id': ...",07207002031304c1ef4fda85,"[{'id': '58adf4fe9a64a2de2a2e446d', 'num': 1, ...","{'latitude': 56.227821, 'longitude': 35.635624}",False,Набор для новорожденных,False,0,2201,
519,False,False,3,True,1496826296,False,"Отдам две каши и смесь Беллакт, пюре бебевита ...","[{'field': {'name': 'Детские товары', 'id': 3,...",260dc1ad0cba9a728b1c7395,"[{'id': '5937c10cc3bdd287695f4fc7', 'num': 1, ...","{'latitude': 55.698215, 'longitude': 37.511066}",False,Каши и пюре,False,0,311,
525,False,False,9,True,1492605382,False,3 кофты ( 2 рубашки и блузка) befree и сарафан...,"[{'field': {'name': 'Женский гардероб', 'id': ...",34d5d29ae60008396c957f85,"[{'id': '58f7594a074b3e164d2eaff4', 'num': 1, ...","{'latitude': 59.870759, 'longitude': 29.858937}",False,Вещи пакетом,False,0,907,


In [11]:
df_train['location'].map(loc_to_cord).value_counts()

NameError: name 'loc_to_cord' is not defined

In [None]:
len(df_train[df_train['price'] <= 0])

In [None]:
df_train['free'] = df_train['description'].str.lower().apply(lambda x: 1 if 'отдам' in x else 0)
print(len(df_train.query('free == 1 & price <=0')))
df_train.drop(['free'], axis=1, inplace=True)

In [None]:
ast.literal_eval(str(df_train[df_train['price'] <= 0]['fields'][29]))

In [None]:
def json_parse(f):
    return ast.literal_eval(f)
def parse_field(id_val, f):
    info = ''
    info = info + f[0]['field']['name'] + ' ' + f[1]['field']['name']
    
    for i in range(0, len(f)):
        if f[i]['values']:
            info = info + ' ' + f[i]['field']['name'] + ' ' + f[i]['values'][0]['selected_value']['value']
            #+ ', ' + str() + ' ' + str(f[i]['values'][0]['selected_value']['value'])
    return info

In [None]:
pd.DataFrame([parse_field(df_train['id'].values[i], json_parse(df_train['fields'].values[i])) for i in range(0, 10)])

In [3]:
%%time
df_train = pd.read_pickle(r'E:/data/ds/hak/train_hack.pckl.zip')

Wall time: 3min 14s


In [4]:
def get_date_features(df):
    df['datetime'] = pd.to_datetime(df['date_created'], unit='s')
    df['year'] = df.loc[:, 'datetime'].dt.year
    #df = pd.get_dummies(df, columns=['year',], prefix='year_')
    df['month'] = df.loc[:, 'datetime'].dt.month
    #df = pd.get_dummies(df, columns=['month',], prefix='month_')
    #df['day'] = df.loc[:, 'datetime'].dt.day
    df['hour'] = df.loc[:, 'datetime'].dt.hour
    #df = pd.get_dummies(df, columns=['hour',], prefix='hour_')
    df['weekday'] = df.loc[:, 'datetime'].dt.weekday
    #df = pd.get_dummies(df, columns=['weekday',], prefix='weekday_')
    #df.drop(['datetime',],axis = 1, inplace=True)
    return df

# уберем пунктуацию
def clean_p(data):
    clean_re = re.compile('[^\w\s]')
    cleantext = re.sub(clean_re, ' ', data)
    cleantext = cleantext.replace("  ", " ")
    cleantext = cleantext.replace("   ", " ")
    return cleantext

def loc_to_cord(x):
    lng = str(int(round(x['longitude'],0)))

    lat = str(int(round(x['latitude'],0)))

    return 'lo' + lng + 'la' + lat

#df_sample['coord'] = df_sample['location'].map(loc_to_cord)

def json_parse(f):
    return ast.literal_eval(f)
def parse_field(id_val, f):
    info = ''
    info = info + f[0]['field']['name'] + ' ' + f[1]['field']['name']
    
    for i in range(0, len(f)):
        if f[i]['values']:
            info = info + ' ' + f[i]['field']['name'] + ' ' + f[i]['values'][0]['selected_value']['value']
            #+ ', ' + str() + ' ' + str(f[i]['values'][0]['selected_value']['value'])
    return info

In [5]:
def preproc_data_step_1 (df_output):
    #df_output = df_input.copy() # memory
    
    df_output = get_date_features(df_output)
    
    #df_output['price'] = df_output['price']/100
    
    # good try ;)
    df_output.drop(['can_buy', 'can_promote', 'contacts_visible', 'mortgage_available'], axis = 1, inplace=True)
    
    df_output['delivery_available'] = df_output['delivery_available'].apply(lambda x: 1 if x is True else 0)
    df_output['payment_available'] = df_output['payment_available'].apply(lambda x: 1 if x is True else 0)
    
    #df_output = pd.get_dummies(df_output, columns=['category',], prefix='category_')
    #df_output = pd.get_dummies(df_output, columns=['subcategory',], prefix='subcategory_')
    
    # количество картинок
    df_output['images_len'] = df_output['images'].str.len()
    df_output.drop(['images'], axis=1, inplace=True)
    
    # Lover
    df_output['name'] = df_output['name'].str.lower()
    df_output['description'] = df_output['description'].str.lower()
    # Clean
    df_output['description'] = df_output['description'].apply(lambda x: clean_p(x))
    df_output['name'] = df_output['name'].apply(lambda x: clean_p(x))
    
    # mb zero
    df_output['free_price'] = df_output['description'].apply(lambda x: 1 if 'отдам' in x else 0)
    
    # 
    df_output['loct'] = df_output['location'].map(loc_to_cord)
    
    #df_output['fields_mod'] = pd.DataFrame([parse_field(df_train['id'].values[i], json_parse(df_train['fields'].values[i])) for i in range(0, len(df_train['fields'].values))])
    
    df_output.drop(['date_created', 'datetime', 'fields', 'id', 'location', 'subway',], axis = 1, inplace=True)
    
    # дропаем повторные размещения
    #columns_tmp = list(df_output.columns)
    #columns_tmp.remove('price')
    #df_output.drop_duplicates(subset=columns_tmp, keep='last', inplace=True)
    
    return df_output

In [6]:
%%time
df_preproc = preproc_data_step_1(df_train)
df_preproc.info()
#del df_train
#gc.collect()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1748890 entries, 0 to 99930
Data columns (total 14 columns):
category              int64
delivery_available    int64
description           object
name                  object
payment_available     int64
price                 int64
subcategory           int64
year                  int64
month                 int64
hour                  int64
weekday               int64
images_len            int64
free_price            int64
loct                  object
dtypes: int64(11), object(3)
memory usage: 200.1+ MB
Wall time: 23.3 s


In [None]:
#df_preproc['price'] = df_preproc['price']/100

In [7]:
df_preproc.head(5)

Unnamed: 0,category,delivery_available,description,name,payment_available,price,subcategory,year,month,hour,weekday,images_len,free_price,loct
0,9,0,,сумка dg,1,199900,914,2017,4,13,4,1,0,lo38la56
1,22,0,8 12 лет,комплект,0,35000,2202,2016,10,20,1,1,0,lo38la56
2,22,0,на девочку 1 5 г состояние хорошее,пальтишко демисезонное,0,30000,2204,2016,9,15,6,4,0,lo37la56
3,22,0,размер 135mm euro 22 5 прочная мягкая не сколь...,attipas,1,80000,2209,2016,10,21,2,3,0,lo37la56
5,22,0,,жилет теплый,1,50000,2204,2017,8,11,2,4,0,lo30la60


In [8]:
## TargetEncoder ?

In [9]:
# Train Test 
HOLOUT_LINE = len(df_preproc)-(len(df_preproc)//10)
#X_train = df_preproc[:HOLOUT_LINE]
#X_test = df_preproc[HOLOUT_LINE:]

feature_colnames = ['category',
                    #'subcategory'
                   ]
target = 'price'

for colname in feature_colnames:
    tmp_data = df_preproc[[colname, target]][:HOLOUT_LINE]
    # median
    target_means = tmp_data.groupby(colname).median()
    mapping = target_means.to_dict()[target]
    df_preproc[colname+'_TargetEnc_median'] = df_preproc[colname]
    df_preproc[colname+'_TargetEnc_median'] = df_preproc[colname+'_TargetEnc_median'].map(mapping)
    # std
    target_std = tmp_data.groupby(colname).std()
    mapping_std = target_std.to_dict()[target]
    df_preproc[colname+'_TargetEnc_std'] = df_preproc[colname]
    df_preproc[colname+'_TargetEnc_std'] = df_preproc[colname+'_TargetEnc_std'].map(mapping_std)

In [10]:
y = df_preproc.price.values
X = df_preproc.drop(['price'], axis=1)
X_train = X[:HOLOUT_LINE]
y_train = y[:HOLOUT_LINE]
X_test = X[HOLOUT_LINE:]
y_test = y[HOLOUT_LINE:]

In [11]:
X_train.info(null_counts=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1574001 entries, 0 to 49473
Data columns (total 15 columns):
category                     1574001 non-null int64
delivery_available           1574001 non-null int64
description                  1574001 non-null object
name                         1574001 non-null object
payment_available            1574001 non-null int64
subcategory                  1574001 non-null int64
year                         1574001 non-null int64
month                        1574001 non-null int64
hour                         1574001 non-null int64
weekday                      1574001 non-null int64
images_len                   1574001 non-null int64
free_price                   1574001 non-null int64
loct                         1574001 non-null object
category_TargetEnc_median    1574001 non-null int64
category_TargetEnc_std       1574000 non-null float64
dtypes: float64(1), int64(11), object(3)
memory usage: 192.1+ MB


In [12]:
X_train.head(10)

Unnamed: 0,category,delivery_available,description,name,payment_available,subcategory,year,month,hour,weekday,images_len,free_price,loct,category_TargetEnc_median,category_TargetEnc_std
0,9,0,,сумка dg,1,914,2017,4,13,4,1,0,lo38la56,50000,2558626000.0
1,22,0,8 12 лет,комплект,0,2202,2016,10,20,1,1,0,lo38la56,45000,285110800.0
2,22,0,на девочку 1 5 г состояние хорошее,пальтишко демисезонное,0,2204,2016,9,15,6,4,0,lo37la56,45000,285110800.0
3,22,0,размер 135mm euro 22 5 прочная мягкая не сколь...,attipas,1,2209,2016,10,21,2,3,0,lo37la56,45000,285110800.0
5,22,0,,жилет теплый,1,2204,2017,8,11,2,4,0,lo30la60,45000,285110800.0
6,9,0,новые текстильные босоножки 37 размер в размер...,босоножки сабо,0,902,2017,6,14,5,3,0,lo44la56,50000,2558626000.0
7,15,0,seagate\nмодель st320lt020 \nhdd для ноутбуков...,жесткий диск для ноутбука seagate 320gb,1,1508,2017,3,17,5,1,0,lo39la52,199000,1151533000.0
8,4,0,состояние отличное без дефектов,ковер 324 на 240,1,403,2017,4,10,5,4,0,lo42la45,130000,4903625000.0
10,3,0,продам кенгуру 350 ванночка с горкой 250,кенгуру,0,301,2016,12,16,4,3,0,lo50la53,80000,2696549000.0
11,2,0,,машинка автомат,1,208,2016,11,9,2,2,0,lo85la54,245000,2500472000.0


# VW

In [13]:
def to_vw_format(category, 
                 subcategory, 
                 year,
                 payment_available,
                 #delivery_available,
                 free_price,
                 loct,
                 #category_TargetEnc_median,
                 name, 
                 description,
                 label=None):
      return str(label or '') + ' |category ' + category \
                            + ' |subcategory ' + subcategory \
                            + ' |year ' + year \
                            + ' |payment_available ' + payment_available \
                            + ' |free_price  ' +  free_price \
                            + ' |loct  ' +  loct  \
                            + ' |name ' + ' '.join(re.findall('\w{3,}', name)) \
                            + ' |description ' + ' '.join(re.findall('\w{3,}', description)) \
                            + '\n'

In [14]:
%%time
# Convert and save in vowpal wabbit format
def to_vw_file(X_train, y_train, name='cv_reviews_train.vw'):
    with open(name, 'w', encoding='utf-8') as vw_train_data:
        for category, \
            subcategory, \
            year, \
            payment_available, \
            free_price, \
            loct, \
            name, \
            description, \
            target \
            in zip(X_train['category'], 
                   X_train['subcategory'],
                   X_train['year'], 
                   X_train['payment_available'],
                   #X_train['delivery_available'],
                   X_train['free_price'],
                   X_train['loct'],
                   #X_train['category_TargetEnc_median'],
                   X_train['name'], 
                   X_train['description'], 
                   y_train):
                vw_train_data.write(to_vw_format(str(category), 
                                                 str(subcategory), 
                                                 str(year), 
                                                 str(payment_available), 
                                                 #str(delivery_available), 
                                                 str(free_price),
                                                 str(loct),
                                                 #str(category_TargetEnc_median),
                                                 str(name), 
                                                 str(description), 
                                                 target))
                
to_vw_file(X_train, y_train, name='cv_reviews_train_3.vw')
to_vw_file(X_test, y_train, name='cv_reviews_valid_3.vw')

Wall time: 21.6 s


In [15]:
%%time
# Fitting a logistic regression for predicting the sentiment of a review
!vw cv_reviews_train_3.vw -f cv_reviews_model.vw --random_seed 42 --holdout_off -b 28 \
--sgd --adaptive --normalized --invariant \
--loss_function=quantile \
--nn 100 \
--passes 5 -k -c \
--ngram 2

Wall time: 7min 53s


Generating 2-grams for all namespaces.
final_regressor = cv_reviews_model.vw
Num weight bits = 28
learning rate = 0.5
initial_t = 0
power_t = 0.5
decay_learning_rate = 1
creating cache_file = cv_reviews_train_3.vw.cache
Reading datafile = cv_reviews_train_3.vw
num sources = 1
average  since         example        example  current  current  current
loss     last          counter         weight    label  predict features
99950.000000 99950.000000            1            1.0 199900.0000   0.0000        8
58724.873047 17499.746094            2            2.0 35000.0000   0.5078        9
43112.237549 27499.602051            4            4.0 80000.0000   0.8769      115
48430.825562 53749.413574            8            8.0 200000.0000   1.3747       19
80308.488342 112186.151123           16           16.0 250000.0000   4.6335       59
99869.863388 119431.238434           32           32.0 40000.0000  27.4316       27
88153.223850 76058.628197           64           64.0 100000.0000 118.1327

In [16]:
!vw -i cv_reviews_model.vw -t -d cv_reviews_valid_3.vw -p cv_valid_pred_3.txt --quiet
with open('cv_valid_pred_3.txt') as pred_file:
    validation_prediction = [float(label) for label in pred_file.readlines()]
validation_prediction = np.array(validation_prediction)
print(rmsle(y_test, validation_prediction))
print(rlmse_score(y_test, validation_prediction))

inf
inf


  if __name__ == '__main__':
  if sys.path[0] == '':


In [17]:
np.zeros(len(y_train))

array([0., 0., 0., ..., 0., 0., 0.])

In [18]:
y_train_z = np.zeros(len(y_train))

In [19]:
to_vw_file(X_test, y_train_z, name='cv_reviews_valid.vw')

In [20]:
!vw -i cv_reviews_model.vw -t -d cv_reviews_valid.vw -p cv_valid_pred.txt --quiet
with open('cv_valid_pred.txt') as pred_file:
    validation_prediction = [float(label) for label in pred_file.readlines()]
validation_prediction = np.array(validation_prediction)
print(rmsle(y_test, validation_prediction))
print(rlmse_score(y_test, validation_prediction))

inf
inf


  if __name__ == '__main__':
  if sys.path[0] == '':


# SUBMIT

In [21]:
del df_train, X, X_train
gc.collect()

331

In [35]:
%%time
df_test = pd.read_pickle(r'E:/data/ds/hak/test_hack.pckl')

Wall time: 51.8 s


In [23]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 749525 entries, 4 to 99931
Data columns (total 16 columns):
can_buy               749525 non-null bool
can_promote           749525 non-null bool
category              749525 non-null int64
contacts_visible      749525 non-null bool
date_created          749525 non-null int64
delivery_available    749525 non-null bool
description           749525 non-null object
fields                749525 non-null object
id                    749525 non-null object
images                749525 non-null object
location              749525 non-null object
mortgage_available    749525 non-null bool
name                  749525 non-null object
payment_available     749525 non-null bool
subcategory           749525 non-null int64
subway                1210 non-null object
dtypes: bool(6), int64(3), object(7)
memory usage: 67.2+ MB


In [24]:
%%time
df_preproc = preproc_data_step_1(df_test)
df_preproc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 749525 entries, 4 to 99931
Data columns (total 13 columns):
category              749525 non-null int64
delivery_available    749525 non-null int64
description           749525 non-null object
name                  749525 non-null object
payment_available     749525 non-null int64
subcategory           749525 non-null int64
year                  749525 non-null int64
month                 749525 non-null int64
hour                  749525 non-null int64
weekday               749525 non-null int64
images_len            749525 non-null int64
free_price            749525 non-null int64
loct                  749525 non-null object
dtypes: int64(10), object(3)
memory usage: 80.1+ MB
Wall time: 11.8 s


In [25]:
df_preproc.head(5)

Unnamed: 0,category,delivery_available,description,name,payment_available,subcategory,year,month,hour,weekday,images_len,free_price,loct
4,6,1,стремянка трехсекционная 3 4,стремянка,1,603,2018,1,14,5,3,0,lo56la55
9,2,0,плита эви 5120 работают 3 конфорки и духовка п...,плита электрическая,1,203,2017,10,7,2,3,0,lo30la60
15,1,0,все лоты привезены с площадок и аукционов япон...,диски r17 rays mazdaspeed touring 5х114 3 7j,0,116,2018,1,21,5,4,0,lo37la56
19,10,0,,batman archam knight,1,1009,2017,10,15,0,2,0,lo38la56
20,11,0,отдаю ледобур времён с с с р диаметр 130 мм бе...,ледобур для зимней рыбалки,0,1104,2018,1,8,1,4,0,lo30la60


In [26]:
y_train_z = np.zeros(len(df_preproc))

In [27]:
to_vw_file(df_preproc, y_train_z, name='cv_reviews_valid.vw')

In [28]:
!vw -i cv_reviews_model.vw -t -d cv_reviews_valid.vw -p cv_valid_pred.txt --quiet
with open('cv_valid_pred.txt') as pred_file:
    validation_prediction = [float(label) for label in pred_file.readlines()]
validation_prediction = np.array(validation_prediction)

In [29]:
validation_prediction.min()

-1.0

In [30]:
validation_prediction[[validation_prediction <0]] = 0

In [31]:
validation_prediction

array([40332.503906, 40332.503906, 40311.285156, ..., 40324.140625,
       40332.503906, 40332.503906])

In [None]:
%%time
df_test = pd.read_pickle(r'E:/data/ds/hak/test_hack.pckl')

In [38]:
df_test['price'] = validation_prediction

In [39]:
df_test['price'].head(10)

4     40332.503906
9     40332.503906
15    40311.285156
19    40332.503906
20    31624.546875
21    40332.503906
24    39970.742188
28    40332.503906
29    40332.503906
31    28488.476563
Name: price, dtype: float64

In [40]:
sub_1 = df_test[['id', 'price']]

In [41]:
sub_1.to_csv('sub_vw_v4.csv', index=False)

In [None]:
sub_2 = pd.read_csv('sub_vw_v3.csv')

In [None]:
sub_2

In [None]:
sub_2['SUM'] = sub_1['price'] + sub_2['price']
sub_2['med'] = sub_2['med'].mean()

In [None]:
sub_2['med'] = sub_2['med'].mean()

In [77]:
df_test.shape, sub_1.shape

((749525, 18), (749525, 2))

In [78]:
sub_1.to_csv('sub_vw_v3.csv', index=False)

In [None]:
sub_1.to_csv('sub_vw_v4.csv', index=False)