In [1]:
import pandas as panda
from matplotlib import pyplot as plot
import numpy as np
import seaborn as sns

In [2]:
# Read the merchants table from disk into the working frame.
train_data = panda.read_csv(filepath_or_buffer='all_new/merchants.csv')

In [3]:
# Compare the row count with the number of distinct merchant ids; a smaller
# second number means some merchant_id values occur on more than one row.
(train_data.shape, train_data['merchant_id'].unique().shape)

((334696, 22), (334633,))

In [4]:
# Per-column summary (numeric and non-numeric), one row per column.
train_data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
merchant_id,334696,334633.0,M_ID_6464db3b45,4.0,,,,,,,
merchant_group_id,334696,,,,31028.7,31623.0,1.0,3612.0,19900.0,51707.2,112586.0
merchant_category_id,334696,,,,423.132,252.898,-1.0,222.0,373.0,683.0,891.0
subsector_id,334696,,,,25.1164,9.80737,-1.0,19.0,27.0,33.0,41.0
numerical_1,334696,,,,0.0114764,1.09815,-0.0574706,-0.0574706,-0.0574706,-0.0475558,183.735
numerical_2,334696,,,,0.00810311,1.0705,-0.0574706,-0.0574706,-0.0574706,-0.0475558,182.079
category_1,334696,2.0,N,327657.0,,,,,,,
most_recent_sales_range,334696,5.0,E,177104.0,,,,,,,
most_recent_purchases_range,334696,5.0,E,175309.0,,,,,,,
avg_sales_lag3,334683,,,,13.833,2395.49,-82.13,0.88,1.0,1.16,851845.0


##### We can observe the following from merchants data

1. avg_purchases_lag12, avg_purchases_lag6 and avg_purchases_lag3 all contain np.inf values, which will require handling.
2. state_id, city_id, subsector_id and merchant_category_id all have a minimum value of -1. These values indicate missing data.

The columns with object dtype (besides merchant_id) are:

1. category_4

2. category_1

3. most_recent_sales_range

4. most_recent_purchases_range


In [7]:
# Two-column lookup table: column name -> dtype. It is a snapshot taken at this
# point, so columns added to train_data later are not listed in it.
train_data_dtypes = (
    train_data.dtypes
    .rename_axis('col_name')
    .reset_index(name='col_type')
)
train_data_dtypes

Unnamed: 0,col_name,col_type
0,merchant_id,object
1,merchant_group_id,int64
2,merchant_category_id,int64
3,subsector_id,int64
4,numerical_1,float64
5,numerical_2,float64
6,category_1,object
7,most_recent_sales_range,object
8,most_recent_purchases_range,object
9,avg_sales_lag3,float64


In [8]:
# Names of the object-dtype columns.
# The builtin `object` is used instead of the `np.object` alias: the alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
train_data_dtypes[train_data_dtypes.col_type == object].col_name

0                     merchant_id
6                      category_1
7         most_recent_sales_range
8     most_recent_purchases_range
18                     category_4
Name: col_name, dtype: object

In [9]:
# Count missing values in every object-dtype column (merchant_id, category_1,
# most_recent_sales_range, most_recent_purchases_range, category_4).
# The dtype test uses the builtin `object`; the `np.object` alias was removed
# in NumPy 1.24 and would raise AttributeError there.
train_data[
    train_data_dtypes.loc[train_data_dtypes.col_type == object, 'col_name'].tolist()
].isnull().sum()

merchant_id                    0
category_1                     0
most_recent_sales_range        0
most_recent_purchases_range    0
category_4                     0
dtype: int64

In [10]:
# Frequency of each category_1 value.
train_data['category_1'].value_counts()

N    327657
Y      7039
Name: category_1, dtype: int64

In [11]:
# Encode category_1 as two int64 indicator columns (1 where the value matches,
# 0 otherwise). A vectorized comparison replaces the per-row Python lambda;
# the resulting values and dtype are the same.
train_data['merchant_category_1_yes'] = (train_data['category_1'] == 'Y').astype(np.int64)
train_data['merchant_category_1_no'] = (train_data['category_1'] == 'N').astype(np.int64)

In [12]:
# Frequency of each category_4 value.
train_data['category_4'].value_counts()

N    238596
Y     96100
Name: category_4, dtype: int64

In [13]:
# Encode category_4 as two int64 indicator columns (1 where the value matches,
# 0 otherwise). A vectorized comparison replaces the per-row Python lambda;
# the resulting values and dtype are the same.
train_data['merchant_category_4_yes'] = (train_data['category_4'] == 'Y').astype(np.int64)
train_data['merchant_category_4_no'] = (train_data['category_4'] == 'N').astype(np.int64)

In [14]:
# Frequency of each most_recent_sales_range level.
train_data['most_recent_sales_range'].value_counts()

E    177104
D    117475
C     34075
B      5037
A      1005
Name: most_recent_sales_range, dtype: int64

In [15]:
# One int64 indicator column per most_recent_sales_range level.
# The levels are listed in the order E, D, C, B, A so the new columns are
# appended in the same order as before. A vectorized comparison replaces the
# five copy-pasted per-row lambdas; values and dtype are unchanged.
for sales_level in ['E', 'D', 'C', 'B', 'A']:
    train_data['most_sales_range_' + sales_level] = (
        train_data['most_recent_sales_range'] == sales_level
    ).astype(np.int64)

In [16]:
# Frequency of each most_recent_purchases_range level.
train_data['most_recent_purchases_range'].value_counts()

E    175309
D    119187
C     34144
B      5046
A      1010
Name: most_recent_purchases_range, dtype: int64

In [17]:
# One int64 indicator column per most_recent_purchases_range level.
# The levels are listed in the order E, D, C, B, A so the new columns are
# appended in the same order as before. A vectorized comparison replaces the
# five copy-pasted per-row lambdas; values and dtype are unchanged.
for purchases_level in ['E', 'D', 'C', 'B', 'A']:
    train_data['most_recent_purchases_range_' + purchases_level] = (
        train_data['most_recent_purchases_range'] == purchases_level
    ).astype(np.int64)

In [18]:

# Names of the int64 columns according to the train_data_dtypes snapshot.
# The snapshot was built before the indicator columns were added above, so
# those int64 indicator columns are not part of this list.
int_col_names = train_data_dtypes.loc[
    train_data_dtypes['col_type'] == np.int64, 'col_name'
].tolist()
int_col_names

['merchant_group_id',
 'merchant_category_id',
 'subsector_id',
 'active_months_lag3',
 'active_months_lag6',
 'active_months_lag12',
 'city_id',
 'state_id']

In [19]:
# Number of integer columns containing at least one null; 0 means none do.
train_data[int_col_names].isna().any().sum()

0

In [34]:
# -1 may mark missing/outlier entries in these id columns, so count the rows
# that carry it: per column first, then for merchant_category_id and
# subsector_id together, and finally list the affected merchant_id values.
# NOTE(review): the fifth and sixth entries are the same expression; the
# duplicate is kept so the displayed tuple is unchanged -- confirm whether a
# different column pair was intended for one of them.
(
    train_data[train_data['merchant_category_id'] == -1].shape,
    train_data[train_data['subsector_id'] == -1].shape,
    train_data[train_data['state_id'] == -1].shape,
    train_data[train_data['city_id'] == -1].shape,
    train_data[(train_data['merchant_category_id'] == -1) & (train_data['subsector_id'] == -1)].shape,
    train_data[(train_data['merchant_category_id'] == -1) & (train_data['subsector_id'] == -1)].shape,
    train_data.loc[
        (train_data['merchant_category_id'] == -1) & (train_data['subsector_id'] == -1),
        'merchant_id',
    ],
)

((1, 36),
 (1, 36),
 (11887, 36),
 (105184, 36),
 (1, 36),
 (1, 36),
 2465    M_ID_a2cfe4149a
 Name: merchant_id, dtype: object)

I have decided not to treat the -1 values in the state_id, city_id, merchant_category_id and subsector_id fields as missing data; they are left unchanged.


In [35]:
# Distinct merchant_group_id values, in order of first appearance.
train_data['merchant_group_id'].unique()

array([  8353,   3184,    447, ..., 112528, 112393, 107283], dtype=int64)

In [36]:
# Distinct merchant_category_id values, in order of first appearance.
train_data['merchant_category_id'].unique()

array([792, 840, 690, 222,  87, 529, 813,  81, 369, 427,  63,  45,   2,
       278, 298, 497, 309, 705, 511, 437, 456, 781, 703, 360, 796, 412,
       332,  80, 630,  68, 591, 606, 683, 891, 560, 178, 290, 544, 823,
       692, 779,  33, 432, 818, 819, 842, 411,  19, 771,  34, 419, 884,
       184, 111, 434, 356, 367, 373, 631, 573, 652, 669, 531, 195, 130,
       836, 637, 670, 307, 215, 387, 506, 443, 774, 414, 166, 108, 607,
       451, 763, 806, 471, 518, 105,  60, 391, 557, 302, 154, 623, 333,
       847, 384,  78, 299, 342, 363, 101, 783,  90, 489, 737, 650, 261,
       556, 209, 454, 462,   9, 769, 561, 661, 289, 474, 180, 259,  14,
        40, 317, 273, 115, 507, 667, 216, 498, 422, 206, 613, 430, 110,
       574, 536, 653, 357, 274, 793, 171, 385, 472, 651,  56, 312, 119,
       201, 247, 554, 114, 873, 671, 190, 854, 530, 843, 527, 761, 320,
       276, 348, 743, 109, 315, 645,  16, 834, 415, 642,  83, 568,  49,
       223, 695, 696, 172, 702, 458, 748, 416, 330, 706, 340,  2

In [37]:
# Count the distinct merchant_category_id values recorded for each merchant_id,
# then show only the merchants whose count is not exactly 1, i.e. the ids that
# are not tied to a single category.
temp = (
    train_data
    .groupby('merchant_id')['merchant_category_id']
    .nunique()
    .reset_index()
)
temp.loc[temp['merchant_category_id'].ne(1)]

Unnamed: 0,merchant_id,merchant_category_id
130891,M_ID_645a6af169,2


In [38]:
# Redisplay the integer column list built earlier, as a reference for the
# per-column checks in the following cells.
int_col_names

['merchant_group_id',
 'merchant_category_id',
 'subsector_id',
 'active_months_lag3',
 'active_months_lag6',
 'active_months_lag12',
 'city_id',
 'state_id']

In [39]:
# Distinct values of the three remaining id columns, returned as a tuple of
# arrays in the order subsector_id, city_id, state_id.
tuple(
    train_data[id_col].unique()
    for id_col in ('subsector_id', 'city_id', 'state_id')
)

(array([ 9, 20,  1, 21, 27, 29, 18, 37, 10, 33,  7, 15, 34,  2, 36, 17, 19,
        25, 12, 31, 38, 40, 32, 16, 41,  5, 30, 13, 22,  4, 23, 39,  8, 14,
         3, 26, 35, 24, -1, 11, 28], dtype=int64),
 array([242,  22,  -1, 160,  60, 248,  88, 158,  11, 143, 282, 123, 126,
        272,  85, 140,  62,  63, 323, 200, 310,  69, 239,  17,   4, 231,
        124,   3, 301,  87, 186, 150, 156, 161, 209,  25, 303, 101,  12,
        147,  44, 254,  58,  32,  53,  96, 213, 214, 125, 229, 168,  30,
        204,  94, 210, 137, 165, 187, 223, 238, 205, 157, 114, 276,  76,
        233, 183, 277, 212, 247, 167, 274, 343, 251, 283,   9,  21,  84,
          7, 297, 291,  33, 246, 169, 228, 201, 181, 253, 159, 172,  47,
          6, 113, 206,  51,  82,  80,  36, 215, 259, 314,  98, 295, 174,
        211, 339, 250, 307,  24,  48,  89, 341, 275,  66, 224, 189, 230,
        100, 142, 342, 135, 105, 115,  64, 153, 245, 299,  14, 184, 208,
        235, 219, 108, 119, 329, 163, 139, 220,  52,  10, 285, 337,

In [40]:
# Number of distinct values in each active_months_lag* column (3, 6, 12).
tuple(
    train_data[lag_col].nunique()
    for lag_col in ('active_months_lag3', 'active_months_lag6', 'active_months_lag12')
)

(3, 6, 12)

In [41]:

# Names of the float64 columns according to the train_data_dtypes snapshot.
float_col_names = train_data_dtypes.loc[
    train_data_dtypes['col_type'] == np.float64, 'col_name'
].tolist()
float_col_names

['numerical_1',
 'numerical_2',
 'avg_sales_lag3',
 'avg_purchases_lag3',
 'avg_sales_lag6',
 'avg_purchases_lag6',
 'avg_sales_lag12',
 'avg_purchases_lag12',
 'category_2']

In [42]:
# Missing-value count for each float column.
train_data[float_col_names].isna().sum()

numerical_1                0
numerical_2                0
avg_sales_lag3            13
avg_purchases_lag3         0
avg_sales_lag6            13
avg_purchases_lag6         0
avg_sales_lag12           13
avg_purchases_lag12        0
category_2             11887
dtype: int64

In [43]:
# Mean of each avg_sales_lag* column over its non-null entries only; these
# values are the fill values used for the missing entries in the next cell.
mean_avg_sales_3 = np.mean(train_data['avg_sales_lag3'].dropna().values)
mean_avg_sales_6 = np.mean(train_data['avg_sales_lag6'].dropna().values)
mean_avg_sales_12 = np.mean(train_data['avg_sales_lag12'].dropna().values)

In [44]:
# Fill the missing avg_sales_lag* entries with the column means computed above.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on an attribute-accessed column: that chained in-place
# call warns on recent pandas and, with Copy-on-Write enabled, modifies a
# temporary object and leaves train_data unfilled.
train_data['avg_sales_lag3'] = train_data['avg_sales_lag3'].fillna(mean_avg_sales_3)
train_data['avg_sales_lag6'] = train_data['avg_sales_lag6'].fillna(mean_avg_sales_6)
train_data['avg_sales_lag12'] = train_data['avg_sales_lag12'].fillna(mean_avg_sales_12)

In [45]:
# Distinct values of the two numerical_* columns.
# The meaning of the negative values is not known.
(train_data['numerical_1'].unique(), train_data['numerical_2'].unique())

(array([-5.74706500e-02, -7.89613000e-03,  4.08529850e-01, -4.75557500e-02,
        -3.76408400e-02, -2.77259400e-02,  1.88585060e+00, -1.78110400e-02,
         2.56717061e+01,  2.01877000e-03,  1.19336800e-02,  4.16783900e-02,
         9.14189970e-01,  1.24138182e+00,  3.17634900e-02,  2.18485800e-02,
         6.15082000e-02,  1.85610589e+00,  1.11082720e-01,  1.50742340e-01,
         8.13380100e-02,  1.30912530e-01,  5.15932900e-02,  9.93509210e-01,
         2.59806290e-01,  7.14231000e-02,  5.37423610e-01,  1.01167820e-01,
         1.15231376e+01,  1.40827430e-01,  2.79636090e-01,  2.12380830e+00,
         3.88700040e-01,  2.00316860e-01,  1.90401960e-01,  2.42439599e+01,
         1.36827699e+02,  1.60657240e-01,  1.20997630e-01,  6.16742840e-01,
         1.80487050e-01,  9.12529100e-02,  3.29210620e-01,  2.39976480e-01,
         2.10231760e-01,  3.78785140e-01,  4.18444760e-01,  2.20146670e-01,
         2.69721190e-01,  3.19295710e-01,  3.58955330e-01,  1.28628073e+02,
         2.8

In [46]:
# Re-check missing values per float column after filling avg_sales_lag*.
train_data[float_col_names].isna().sum()

numerical_1                0
numerical_2                0
avg_sales_lag3             0
avg_purchases_lag3         0
avg_sales_lag6             0
avg_purchases_lag6         0
avg_sales_lag12            0
avg_purchases_lag12        0
category_2             11887
dtype: int64

In [47]:
# Fill missing category_2 entries with the most frequent observed value.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on an attribute-accessed column: that chained in-place
# call warns on recent pandas and, with Copy-on-Write enabled, modifies a
# temporary object and leaves train_data unfilled.
most_appearing = train_data['category_2'].value_counts().idxmax()
train_data['category_2'] = train_data['category_2'].fillna(most_appearing)

In [48]:
# Summary statistics again, now that the missing values have been filled.
train_data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
merchant_id,334696,334633.0,M_ID_6464db3b45,4.0,,,,,,,
merchant_group_id,334696,,,,31028.7,31623.0,1.0,3612.0,19900.0,51707.2,112586.0
merchant_category_id,334696,,,,423.132,252.898,-1.0,222.0,373.0,683.0,891.0
subsector_id,334696,,,,25.1164,9.80737,-1.0,19.0,27.0,33.0,41.0
numerical_1,334696,,,,0.0114764,1.09815,-0.0574706,-0.0574706,-0.0574706,-0.0475558,183.735
numerical_2,334696,,,,0.00810311,1.0705,-0.0574706,-0.0574706,-0.0574706,-0.0475558,182.079
category_1,334696,2.0,N,327657.0,,,,,,,
most_recent_sales_range,334696,5.0,E,177104.0,,,,,,,
most_recent_purchases_range,334696,5.0,E,175309.0,,,,,,,
avg_sales_lag3,334696,,,,13.833,2395.44,-82.13,0.88,1.0,1.16,851845.0


In [49]:
# Remove the two raw range columns now that indicator columns exist for them.
# The frame is modified in place, so running this cell a second time raises a
# KeyError because the columns are already gone.
train_data.drop(
    ['most_recent_sales_range', 'most_recent_purchases_range'],
    axis=1,
    inplace=True,
)

In [50]:
# Mean of each avg_purchases_lag* column with the +inf entries left out.
# Only positive infinity is excluded by this filter.
avg_purchases_lag3_mean = np.mean(
    train_data.loc[train_data['avg_purchases_lag3'] != np.inf, 'avg_purchases_lag3'].values
)
avg_purchases_lag6_mean = np.mean(
    train_data.loc[train_data['avg_purchases_lag6'] != np.inf, 'avg_purchases_lag6'].values
)
avg_purchases_lag12_mean = np.mean(
    train_data.loc[train_data['avg_purchases_lag12'] != np.inf, 'avg_purchases_lag12'].values
)
(avg_purchases_lag3_mean, avg_purchases_lag6_mean, avg_purchases_lag12_mean)

(1.5907620965243672, 1.8875678157761595, 2.0791954108497936)

In [52]:
# Replace the +inf entries in avg_purchases_lag* with the finite means computed
# in the previous cell.
# The result is assigned back to the frame instead of calling
# replace(inplace=True) on an attribute-accessed column: that chained in-place
# call warns on recent pandas and, with Copy-on-Write enabled, modifies a
# temporary object and leaves the inf values in train_data.
train_data['avg_purchases_lag3'] = train_data['avg_purchases_lag3'].replace(np.inf, avg_purchases_lag3_mean)
train_data['avg_purchases_lag6'] = train_data['avg_purchases_lag6'].replace(np.inf, avg_purchases_lag6_mean)
train_data['avg_purchases_lag12'] = train_data['avg_purchases_lag12'].replace(np.inf, avg_purchases_lag12_mean)

In [53]:
# Final summary after dropping the range columns and replacing the inf values.
train_data.describe(include='all').transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
merchant_id,334696,334633.0,M_ID_6464db3b45,4.0,,,,,,,
merchant_group_id,334696,,,,31028.7,31623.0,1.0,3612.0,19900.0,51707.2,112586.0
merchant_category_id,334696,,,,423.132,252.898,-1.0,222.0,373.0,683.0,891.0
subsector_id,334696,,,,25.1164,9.80737,-1.0,19.0,27.0,33.0,41.0
numerical_1,334696,,,,0.0114764,1.09815,-0.0574706,-0.0574706,-0.0574706,-0.0475558,183.735
numerical_2,334696,,,,0.00810311,1.0705,-0.0574706,-0.0574706,-0.0574706,-0.0475558,182.079
category_1,334696,2.0,N,327657.0,,,,,,,
avg_sales_lag3,334696,,,,13.833,2395.44,-82.13,0.88,1.0,1.16,851845.0
avg_purchases_lag3,334696,,,,1.59076,107.187,0.333495,0.92365,1.01667,1.14652,61851.3
active_months_lag3,334696,,,,2.99411,0.0952475,1.0,3.0,3.0,3.0,3.0


In [54]:
# Write the cleaned merchants frame to a new CSV, without the index column.
train_data.to_csv(path_or_buf='all_new/merchants_2.csv', index=False)