In [45]:
# !pip install tensorflow

### Loading the Dataset

In [2]:
# importing necessary library, rest we will import when required
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse, r2_score

In [3]:
# Avoiding warning diplay
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Loading datasets 
items=pd.read_csv('items.csv')
sample_submission=pd.read_csv('sample_submission.csv')
item_categories=pd.read_csv('item_categories.csv')
sales_train=pd.read_csv('sales_train.csv')
shops=pd.read_csv('shops.csv')
test=pd.read_csv('test.csv')

In [5]:
# Storing names of all dataset in a list
listdf=[items,item_categories,shops,sales_train,test,sample_submission]

In [6]:
# Reading test data
test.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [7]:
ID=test['ID']

In [8]:
test.shape

(214200, 3)

>Test data has 214200 rows and 3 columns.
Only shop_id and item_id columns has been taken for prediction, and will leave all other columns untouched.

In [9]:
# Reading first 3 rows of item dataset
items.head(3)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40


In [10]:
# items.drop('item_name',axis=1,inplace=True)

In [11]:
# Reading Item Categories
item_categories.head(3)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2


In [22]:
# Merging items and item_categories

items_item_categories=pd.merge(items,item_categories, on='item_category_id')

In [23]:
items_item_categories.isnull().sum()

item_name             0
item_id               0
item_category_id      0
item_category_name    0
dtype: int64

In [24]:
items_item_categories.shape

(22170, 4)

In [None]:
# Shop

In [15]:
shops.head(3)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2


In [16]:
shops.shape

(60, 2)

In [17]:
shops.isnull().sum()

shop_name    0
shop_id      0
dtype: int64

In [18]:
sales_train.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


In [20]:
# merging sales train and shops

sales_train_shops=pd.merge(sales_train,shops, on='shop_id')
sales_train_shops.shape

(2935849, 7)

In [21]:
# Checking for null values
sales_train_shops.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
shop_name         0
dtype: int64

> Now we have **sales_train_shops** and **items_item_categories** as two main dataframe, let's explorer what can be done with these 2 data sets

In [25]:
# we can merge these two datasets also, on item_id

final_df=pd.merge(sales_train_shops,items_item_categories, on='item_id')
final_df.head(2)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day,shop_name,item_name,item_category_id,item_category_name
0,02.01.2013,0,59,22154,999.0,1.0,"Ярославль ТЦ ""Альтаир""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray
1,02.01.2013,0,25,22154,999.0,1.0,"Москва ТРК ""Атриум""",ЯВЛЕНИЕ 2012 (BD),37,Кино - Blu-Ray


In [26]:
final_df.shape

(2935849, 10)

In [28]:
final_df.isnull().sum()

date                  0
date_block_num        0
shop_id               0
item_id               0
item_price            0
item_cnt_day          0
shop_name             0
item_name             0
item_category_id      0
item_category_name    0
dtype: int64

In [31]:
# Extracting input and output values
x=final_df.drop(['date','date_block_num','item_price','item_cnt_day','shop_name','item_name',
                'item_category_id','item_category_name'],axis=1)
y=final_df.item_cnt_day

In [32]:
x[0:5]

Unnamed: 0,shop_id,item_id
0,59,22154
1,25,22154
2,25,22154
3,25,22154
4,25,22154


In [33]:
y[0:5]

0    1.0
1    1.0
2    1.0
3    1.0
4    1.0
Name: item_cnt_day, dtype: float64

### Preprocessing

In [34]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
x=ss.fit_transform(x)

In [35]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.15,random_state=0)

In [37]:
# Linear model
from sklearn.linear_model import LinearRegression
lr=LinearRegression(fit_intercept=True)
lr.fit(xtrain,ytrain)
print('Train Score: ',lr.score(xtrain,ytrain))
print('Test Score: ',lr.score(xtest,ytest))
ypred=lr.predict(xtest)
score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("Accuracy Score: ",score.round(3)) # Score is rounded upto 3 decimal places

Train Score:  0.0003898581899413056
Test Score:  0.00015424430827826185
Accuracy Score:  7.794


### LightGBM

In [39]:
from lightgbm import LGBMRegressor
lgb=LGBMRegressor(n_estimators= 200)
lgb.fit(xtrain,ytrain)
lgb.score(xtrain,ytrain)

0.12916159167636132

In [40]:
lgb.score(xtest,ytest)

0.0452820302263669

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(n_estimators=200)
rf.fit(xtrain,ytrain)
print('Train Score: ',rf.score(xtrain,ytrain))
print('Test Score: ',rf.score(xtest,ytest))
ypred=rf.predict(xtest)
score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

### AdaBoostRegressor

In [32]:
adb=AdaBoostRegressor()
adb.fit(xtrain,ytrain)
ypred=adb.predict(xtest)

# Traning Score
print('AdaBoost Trainig Score: ',adb.score(xtrain,ytrain))
# Test Score
print('AdaBoost Testing Score: ',adb.score(xtest,ytest))

score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

AdaBoost Trainig Score:  -9.470082962366712
AdaBoost Testing Score:  -11.972425169292466
RMSE of Random Forest:  43.151


In [33]:
dt=DecisionTreeRegressor()
dt.fit(xtrain,ytrain)
ypred=dt.predict(xtest)

# Traning Score
print('Trainig Score: ',dt.score(xtrain,ytrain))
# Test Score
print('AdaBoost Testing Score: ',dt.score(xtest,ytest))

score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

Trainig Score:  0.16155127293667015
AdaBoost Testing Score:  0.4346116997520495
RMSE of Random Forest:  1.881


In [34]:
test.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [35]:
testID=test['ID']

In [36]:
testdf=test.drop('ID',axis=1)

In [37]:
ypred=dt.predict(testdf)

In [38]:
dt.score(testdf,ypred)

1.0

In [41]:
submission = pd.DataFrame()
submission['ID']=ID
submission['item_cnt_month']=ypred
submission.head(20)

Unnamed: 0,ID,item_cnt_month
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0
5,5,1.0
6,6,1.0
7,7,1.0
8,8,1.0
9,9,1.0


In [43]:
submission.to_csv('submission.csv')