In [45]:
# !pip install tensorflow

### Loading the Dataset

In [4]:
# importing necessary library, rest we will import when required
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor,RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse, r2_score

In [5]:
# Avoiding warning diplay
import warnings
warnings.filterwarnings('ignore')

In [6]:
# Loading datasets 
items=pd.read_csv('items.csv')
sample_submission=pd.read_csv('sample_submission.csv')
item_categories=pd.read_csv('item_categories.csv')
sales_train=pd.read_csv('sales_train.csv')
shops=pd.read_csv('shops.csv')
test=pd.read_csv('test.csv')

In [7]:
# Storing names of all dataset in a list
listdf=[items,item_categories,shops,sales_train,test,sample_submission]

In [8]:
# Reading test data
test.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [9]:
ID=test['ID']

In [10]:
test.shape

(214200, 3)

>Test data has 214200 rows and 3 columns.
Only shop_id and item_id columns has been taken for prediction, and will leave all other columns untouched.

In [12]:
# Reading first 3 rows of item dataset
items.head(3)

Unnamed: 0,item_name,item_id,item_category_id
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,0,40
1,!ABBYY FineReader 12 Professional Edition Full...,1,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,2,40


In [13]:
items.shape

(22170, 3)

In [14]:
items.drop('item_name',axis=1,inplace=True)

In [15]:
# Reading Item Categories
item_categories.head(3)

Unnamed: 0,item_category_name,item_category_id
0,PC - Гарнитуры/Наушники,0
1,Аксессуары - PS2,1
2,Аксессуары - PS3,2


In [16]:
item_categories.shape

(84, 2)

In [17]:
item_categories.isnull().sum()

item_category_name    0
item_category_id      0
dtype: int64

In [18]:
shops.head(3)

Unnamed: 0,shop_name,shop_id
0,"!Якутск Орджоникидзе, 56 фран",0
1,"!Якутск ТЦ ""Центральный"" фран",1
2,"Адыгея ТЦ ""Мега""",2


In [19]:
shops.shape

(60, 2)

In [20]:
shops.isnull().sum()

shop_name    0
shop_id      0
dtype: int64

In [21]:
sales_train.head(3)

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0


In [22]:
sales_train.shape

(2935849, 6)

>This is the main data that we will use to train the model, as all other datasets are very short, like shop dataset has only 60 rows, and item_categories has only 84 rows. these dataset are very small in comparision to sales_train dataset, which has more than 2.9  millions records. so merging these dataset is almost meaningless.

In [23]:
# Checking for null values
sales_train.isnull().sum()

date              0
date_block_num    0
shop_id           0
item_id           0
item_price        0
item_cnt_day      0
dtype: int64

>WE DON'T HAVE NULL VALUES IN sales_train dataset.

In [24]:
# Extracting input and output values
x=sales_train.drop(['date','date_block_num','item_price','item_cnt_day'],axis=1)
y=sales_train.item_cnt_day

In [25]:
x[0:5]

Unnamed: 0,shop_id,item_id
0,59,22154
1,25,2552
2,25,2552
3,25,2554
4,25,2555


In [26]:
y[0:5]

0    1.0
1    1.0
2   -1.0
3    1.0
4    1.0
Name: item_cnt_day, dtype: float64

****

In [27]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler()
x=ss.fit_transform(x)

In [28]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.15,random_state=0)

In [30]:
# Linear model
from sklearn.linear_model import LinearRegression
lr=LinearRegression()
lr.fit(xtrain,ytrain)
print('Train Score: ',lr.score(xtrain,ytrain))
print('Test Score: ',lr.score(xtest,ytest))
ypred=lr.predict(xtest)
score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("Accuracy Score: ",score.round(3)) # Score is rounded upto 3 decimal places

Train Score:  0.0002907564164054133
Test Score:  0.00041975466303478814
Accuracy Score:  3.325


### Random Forest

In [31]:
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor(n_estimators=100)
rf.fit(xtrain,ytrain)
print('Train Score: ',rf.score(xtrain,ytrain))
print('Test Score: ',rf.score(xtest,ytest))
ypred=rf.predict(xtest)
score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

Train Score:  0.15859428848273482
Test Score:  0.3337789828652741
RMSE of Random Forest:  2.216


### AdaBoostRegressor

In [32]:
adb=AdaBoostRegressor()
adb.fit(xtrain,ytrain)
ypred=adb.predict(xtest)

# Traning Score
print('AdaBoost Trainig Score: ',adb.score(xtrain,ytrain))
# Test Score
print('AdaBoost Testing Score: ',adb.score(xtest,ytest))

score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

AdaBoost Trainig Score:  -9.470082962366712
AdaBoost Testing Score:  -11.972425169292466
RMSE of Random Forest:  43.151


In [33]:
dt=DecisionTreeRegressor()
dt.fit(xtrain,ytrain)
ypred=dt.predict(xtest)

# Traning Score
print('Trainig Score: ',dt.score(xtrain,ytrain))
# Test Score
print('AdaBoost Testing Score: ',dt.score(xtest,ytest))

score=mse(ytest,ypred)**1/2 # Taking root of mean squared error
print("RMSE of Random Forest: ",score.round(3)) 
# Score is rounded upto 3 decimal places

Trainig Score:  0.16155127293667015
AdaBoost Testing Score:  0.4346116997520495
RMSE of Random Forest:  1.881


In [34]:
test.head(2)

Unnamed: 0,ID,shop_id,item_id
0,0,5,5037
1,1,5,5320


In [35]:
testID=test['ID']

In [36]:
testdf=test.drop('ID',axis=1)

In [37]:
ypred=dt.predict(testdf)

In [38]:
dt.score(testdf,ypred)

1.0

In [41]:
submission = pd.DataFrame()
submission['ID']=ID
submission['item_cnt_month']=ypred
submission.head(20)

Unnamed: 0,ID,item_cnt_month
0,0,1.0
1,1,1.0
2,2,1.0
3,3,1.0
4,4,1.0
5,5,1.0
6,6,1.0
7,7,1.0
8,8,1.0
9,9,1.0


In [43]:
submission.to_csv('submission.csv')