In [1]:
import featuretools as ft
import numpy as np
import pandas as pd

train = pd.read_csv("train_v9rqX0R.csv")
test = pd.read_csv("test_AbJTz2l.csv")

In [2]:
# saving identifiers
test_Item_Identifier = test['Item_Identifier']
test_Outlet_Identifier = test['Outlet_Identifier']
sales = train['Item_Outlet_Sales']
train.drop(['Item_Outlet_Sales'], axis=1, inplace=True)

In [6]:
train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 732.6+ KB


In [3]:
combi = train.append(test, ignore_index=True)

In [4]:
combi.isnull().sum()

Item_Identifier                 0
Item_Weight                  2439
Item_Fat_Content                0
Item_Visibility                 0
Item_Type                       0
Item_MRP                        0
Outlet_Identifier               0
Outlet_Establishment_Year       0
Outlet_Size                  4016
Outlet_Location_Type            0
Outlet_Type                     0
dtype: int64

In [5]:
# imputing missing data
combi['Item_Weight'].fillna(combi['Item_Weight'].mean(), inplace = True)
combi['Outlet_Size'].fillna("missing", inplace = True)

In [8]:
combi['Item_Fat_Content'].value_counts()

Low Fat    8485
Regular    4824
LF          522
reg         195
low fat     178
Name: Item_Fat_Content, dtype: int64

In [9]:
# dictionary to replace the categories
fat_content_dict = {'Low Fat':0, 'Regular':1, 'LF':0, 'reg':1, 'low fat':0}

combi['Item_Fat_Content'] = combi['Item_Fat_Content'].replace(fat_content_dict, regex=True)

In [11]:
combi['Item_Identifier'].value_counts() , combi['Outlet_Identifier'].value_counts()

(FDI20    10
 NCS29    10
 FDV21    10
 FDC53    10
 DRP47    10
          ..
 FDH58     7
 FDL50     7
 FDR51     7
 FDO33     7
 DRN11     7
 Name: Item_Identifier, Length: 1559, dtype: int64,
 OUT027    1559
 OUT013    1553
 OUT035    1550
 OUT046    1550
 OUT049    1550
 OUT045    1548
 OUT018    1546
 OUT017    1543
 OUT010     925
 OUT019     880
 Name: Outlet_Identifier, dtype: int64)

In [12]:
combi['id'] = combi['Item_Identifier'] + combi['Outlet_Identifier']
combi.drop(['Item_Identifier'], axis=1, inplace=True)

In [13]:
# creating and entity set 'es'
es = ft.EntitySet(id = 'sales')

# adding a dataframe 
es.entity_from_dataframe(entity_id = 'bigmart', dataframe = combi, index = 'id')

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 11]
  Relationships:
    No relationships

In [14]:
es.normalize_entity(base_entity_id='bigmart', new_entity_id='outlet', index = 'Outlet_Identifier', 
additional_variables = ['Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type'])

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier

In [15]:
print(es)

Entityset: sales
  Entities:
    bigmart [Rows: 14204, Columns: 7]
    outlet [Rows: 10, Columns: 5]
  Relationships:
    bigmart.Outlet_Identifier -> outlet.Outlet_Identifier


In [16]:
feature_matrix, feature_names = ft.dfs(entityset=es, 
target_entity = 'bigmart', 
max_depth = 2, 
verbose = 1, 
n_jobs = 3)

Built 37 features
EntitySet scattered to 3 workers in 8 seconds                                                                          
Elapsed: 00:02 | Progress: 100%|███████████████████████████████████████████████████████████████████████████████████████


In [17]:
feature_matrix.columns

Index(['Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type',
       'Item_MRP', 'Outlet_Identifier', 'outlet.Outlet_Establishment_Year',
       'outlet.Outlet_Size', 'outlet.Outlet_Location_Type',
       'outlet.Outlet_Type', 'outlet.COUNT(bigmart)',
       'outlet.MAX(bigmart.Item_Fat_Content)', 'outlet.MAX(bigmart.Item_MRP)',
       'outlet.MAX(bigmart.Item_Visibility)',
       'outlet.MAX(bigmart.Item_Weight)',
       'outlet.MEAN(bigmart.Item_Fat_Content)',
       'outlet.MEAN(bigmart.Item_MRP)', 'outlet.MEAN(bigmart.Item_Visibility)',
       'outlet.MEAN(bigmart.Item_Weight)',
       'outlet.MIN(bigmart.Item_Fat_Content)', 'outlet.MIN(bigmart.Item_MRP)',
       'outlet.MIN(bigmart.Item_Visibility)',
       'outlet.MIN(bigmart.Item_Weight)', 'outlet.MODE(bigmart.Item_Type)',
       'outlet.NUM_UNIQUE(bigmart.Item_Type)',
       'outlet.SKEW(bigmart.Item_Fat_Content)',
       'outlet.SKEW(bigmart.Item_MRP)', 'outlet.SKEW(bigmart.Item_Visibility)',
       'outlet.SKEW(bi

In [18]:
feature_matrix.head()

Unnamed: 0_level_0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.SKEW(bigmart.Item_Visibility),outlet.SKEW(bigmart.Item_Weight),outlet.STD(bigmart.Item_Fat_Content),outlet.STD(bigmart.Item_MRP),outlet.STD(bigmart.Item_Visibility),outlet.STD(bigmart.Item_Weight),outlet.SUM(bigmart.Item_Fat_Content),outlet.SUM(bigmart.Item_MRP),outlet.SUM(bigmart.Item_Visibility),outlet.SUM(bigmart.Item_Weight)
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FDA15OUT049,9.3,0,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,...,0.790782,0.099024,0.478027,62.144594,0.043924,4.650796,547,218802.9588,91.450099,19844.655
DRC01OUT018,5.92,1,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,...,0.783017,0.102602,0.478308,62.022851,0.044489,4.650874,547,217987.3906,92.723425,19794.425
FDN15OUT049,17.5,0,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,...,0.790782,0.099024,0.478027,62.144594,0.043924,4.650796,547,218802.9588,91.450099,19844.655
FDX07OUT010,19.2,1,0.0,Fruits and Vegetables,182.095,OUT010,1998,missing,Tier 3,Grocery Store,...,0.776902,0.112759,0.479301,62.010835,0.073604,4.67507,330,130572.7618,94.293418,11768.655
NCD19OUT013,8.93,0,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,...,0.759033,0.104392,0.478213,62.140848,0.044005,4.650214,549,219172.4492,93.555174,19859.98


In [19]:
feature_matrix = feature_matrix.reindex(index=combi['id'])
feature_matrix = feature_matrix.reset_index()

### MOdel Building

In [20]:
from catboost import CatBoostRegressor

In [21]:
categorical_features = np.where(feature_matrix.dtypes == 'object')[0]

for i in categorical_features:
    feature_matrix.iloc[:,i] = feature_matrix.iloc[:,i].astype('str')

In [22]:
feature_matrix.drop(['id'], axis=1, inplace=True)
train = feature_matrix[:8523]
test = feature_matrix[8523:]

In [23]:
# removing uneccesary variables
train.drop(['Outlet_Identifier'], axis=1, inplace=True)
test.drop(['Outlet_Identifier'], axis=1, inplace=True)

In [25]:
# identifying categorical features
categorical_features = np.where(train.dtypes == 'object')[0]
categorical_features

array([ 3,  6,  7,  8, 22], dtype=int64)

In [26]:
from sklearn.model_selection import train_test_split

# splitting train data into training and validation set
xtrain, xvalid, ytrain, yvalid = train_test_split(train, sales, test_size=0.25, random_state=11)

In [29]:
model_cat = CatBoostRegressor(iterations=200, learning_rate=0.3, depth=6, eval_metric='RMSE', random_seed=7)

# training model
model_cat.fit(xtrain, ytrain, cat_features=categorical_features, use_best_model=True)

You should provide test set for use best model. use_best_model parameter has been switched to false value.


0:	learn: 1452.8590984	total: 20.7ms	remaining: 4.12s
1:	learn: 1299.1044258	total: 44.7ms	remaining: 4.42s
2:	learn: 1206.4313081	total: 66.1ms	remaining: 4.34s
3:	learn: 1147.9485392	total: 92.4ms	remaining: 4.53s
4:	learn: 1116.2629645	total: 115ms	remaining: 4.5s
5:	learn: 1094.1346113	total: 137ms	remaining: 4.44s
6:	learn: 1082.6767621	total: 164ms	remaining: 4.51s
7:	learn: 1077.1103816	total: 190ms	remaining: 4.55s
8:	learn: 1073.1205735	total: 212ms	remaining: 4.5s
9:	learn: 1069.6594792	total: 240ms	remaining: 4.57s
10:	learn: 1066.7326891	total: 262ms	remaining: 4.51s
11:	learn: 1066.0171874	total: 285ms	remaining: 4.47s
12:	learn: 1063.2888862	total: 310ms	remaining: 4.46s
13:	learn: 1063.2809422	total: 316ms	remaining: 4.2s
14:	learn: 1061.1235559	total: 334ms	remaining: 4.12s
15:	learn: 1058.1475726	total: 353ms	remaining: 4.06s
16:	learn: 1057.3021612	total: 366ms	remaining: 3.94s
17:	learn: 1056.2975498	total: 384ms	remaining: 3.88s
18:	learn: 1054.9213591	total: 405ms	

160:	learn: 905.6572809	total: 3.26s	remaining: 790ms
161:	learn: 904.6132141	total: 3.28s	remaining: 770ms
162:	learn: 903.7102607	total: 3.31s	remaining: 751ms
163:	learn: 902.5413397	total: 3.32s	remaining: 730ms
164:	learn: 901.7232842	total: 3.35s	remaining: 710ms
165:	learn: 900.0529571	total: 3.37s	remaining: 690ms
166:	learn: 899.1112622	total: 3.4s	remaining: 673ms
167:	learn: 898.4533338	total: 3.43s	remaining: 654ms
168:	learn: 896.0880976	total: 3.46s	remaining: 636ms
169:	learn: 895.0880412	total: 3.49s	remaining: 616ms
170:	learn: 894.0790482	total: 3.51s	remaining: 595ms
171:	learn: 891.9886768	total: 3.54s	remaining: 576ms
172:	learn: 891.2147958	total: 3.55s	remaining: 555ms
173:	learn: 890.3315929	total: 3.58s	remaining: 535ms
174:	learn: 888.6550888	total: 3.6s	remaining: 515ms
175:	learn: 888.2754247	total: 3.63s	remaining: 495ms
176:	learn: 887.5914245	total: 3.65s	remaining: 474ms
177:	learn: 887.3273748	total: 3.68s	remaining: 454ms
178:	learn: 886.8439708	total:

<catboost.core.CatBoostRegressor at 0x244349377c8>

In [30]:
# validation score
model_cat.score(xvalid, yvalid)

0.5728788704354562

In [45]:
pred = model_cat.predict(test)

In [32]:
sub =pd.read_csv('sample_submission_8RXa3c6.csv')
sub.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1000
1,FDW14,OUT017,1000
2,NCN55,OUT010,1000
3,FDQ58,OUT017,1000
4,FDY38,OUT027,1000


In [36]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5681 entries, 8523 to 14203
Data columns (total 36 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Item_Weight                            5681 non-null   float64
 1   Item_Fat_Content                       5681 non-null   int64  
 2   Item_Visibility                        5681 non-null   float64
 3   Item_Type                              5681 non-null   object 
 4   Item_MRP                               5681 non-null   float64
 5   outlet.Outlet_Establishment_Year       5681 non-null   int64  
 6   outlet.Outlet_Size                     5681 non-null   object 
 7   outlet.Outlet_Location_Type            5681 non-null   object 
 8   outlet.Outlet_Type                     5681 non-null   object 
 9   outlet.COUNT(bigmart)                  5681 non-null   int64  
 10  outlet.MAX(bigmart.Item_Fat_Content)   5681 non-null   int64  
 11  

In [43]:
feature_matrix[8523:].head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,outlet.Outlet_Establishment_Year,outlet.Outlet_Size,outlet.Outlet_Location_Type,outlet.Outlet_Type,...,outlet.SKEW(bigmart.Item_Visibility),outlet.SKEW(bigmart.Item_Weight),outlet.STD(bigmart.Item_Fat_Content),outlet.STD(bigmart.Item_MRP),outlet.STD(bigmart.Item_Visibility),outlet.STD(bigmart.Item_Weight),outlet.SUM(bigmart.Item_Fat_Content),outlet.SUM(bigmart.Item_MRP),outlet.SUM(bigmart.Item_Visibility),outlet.SUM(bigmart.Item_Weight)
8523,20.75,0,0.007565,Snack Foods,107.8622,OUT049,1999,Medium,Tier 1,Supermarket Type1,...,0.790782,0.099024,0.478027,62.144594,0.043924,4.650796,547,218802.9588,91.450099,19844.655
8524,8.3,1,0.038428,Dairy,87.3198,OUT017,2007,missing,Tier 2,Supermarket Type1,...,0.774783,0.106563,0.477922,62.295513,0.044152,4.655234,544,217561.35,94.34221,19722.75
8525,14.6,0,0.099575,Others,241.7538,OUT010,1998,missing,Tier 3,Grocery Store,...,0.776902,0.112759,0.479301,62.010835,0.073604,4.67507,330,130572.7618,94.293418,11768.655
8526,7.315,0,0.015388,Snack Foods,155.034,OUT017,2007,missing,Tier 2,Supermarket Type1,...,0.774783,0.106563,0.477922,62.295513,0.044152,4.655234,544,217561.35,94.34221,19722.75
8527,12.792854,1,0.118599,Dairy,234.23,OUT027,1985,Medium,Tier 3,Supermarket Type3,...,0.774028,0.0,0.478189,62.05966,0.044228,0.0,551,219838.2488,94.075671,19944.059742


In [38]:
sub.head()

Unnamed: 0,Item_Identifier,Outlet_Identifier,Item_Outlet_Sales
0,FDW58,OUT049,1000
1,FDW14,OUT017,1000
2,NCN55,OUT010,1000
3,FDQ58,OUT017,1000
4,FDY38,OUT027,1000


In [46]:
sub['Item_Outlet_Sales'] = pred

In [53]:
sub.to_csv('sub_fe_catboost.csv', index=False)

In [49]:
sub.describe()

Unnamed: 0,Item_Outlet_Sales
count,5681.0
mean,2217.791049
std,1400.73576
min,-382.764119
25%,1103.687728
50%,2104.299333
75%,3084.257148
max,7869.805779


In [50]:
sales.describe()

count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64

In [52]:
sub['Item_Outlet_Sales'] = np.where(sub['Item_Outlet_Sales']<0,0,sub['Item_Outlet_Sales'])