
- **`sales_train.csv`** Rows: 2935849 sales (January 2013 -> Octuber 2015)
  - **date**: date in format dd/mm/yyyy.
  - **date_block_num**: a consecutive month number. January 2013 is 0, February 2013 is 1,..., October 2015 is 33
  - **shop_id**: unique identifier of a shop
  - **item_id**: unique identifier of a product
  - **item_price**: current price of an item
  - **item_cnt_day**: number of products sold. You are predicting a monthly amount of this measure.
- **`shops.csv`** Rows: 60 shops
  - **shop_id**
  - **shop_name**: name of shop (RUSSIAN 🇷🇺)
- **`items.csv`** Rows: 22170 products
  - **item_id**
  - **item_name**: name of item (RUSSIAN 🇷🇺)
  - **item_category_id**: unique identifier of item category
- **`item_categories.csv`** Rows: 84 product categories
  - **item_category_id**
  - **item_category_name**: name of item category (RUSSIAN 🇷🇺)
- **`test.csv`** Rows: 214200 pairs combination of (Shop, Item)
  - **ID**: an Id that represents a (Shop, Item) tuple within the test set
  - **shop_id**
  - **item_id**


In [7]:
#!pip install missingno

Collecting missingno
  Downloading missingno-0.4.2-py3-none-any.whl (9.7 kB)
Installing collected packages: missingno
Successfully installed missingno-0.4.2


In [60]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import missingno as m
import seaborn as sns
from sklearn.ensemble import IsolationForest
from scipy import stats
import matplotlib as plt



# Preprecessing

from tqdm import tqdm_notebook as tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


# Machine Learning
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes  import MultinomialNB
from sklearn.naive_bayes  import BernoulliNB
from sklearn.ensemble     import RandomForestClassifier
from xgboost              import XGBClassifier

# Machine Learning Evaluation
from sklearn.metrics         import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score


In [69]:
path = "../../datasets/predict-future-sales/"

train = pd.read_csv(path+"sales_train.csv") # Daily sales  Jan 2013 -> Oct 2015
shops = pd.read_csv(path+"shops-translated.csv")       # Shops    (60)
items = pd.read_csv(path+"items-translated.csv")       # Products  (22170)
oritem = pd.read_csv(path+"items.csv")  
cats  = pd.read_csv(path+"item_categories-translated.csv") # Product categories (84)
test  = pd.read_csv(path+"test.csv", index_col="ID") # predict November 2015
sub   = pd.read_csv(path+"sample_submission.csv", index_col="ID")


In [70]:
items['item_category_id'] = oritem['item_category_id']
pd.set_option('display.max_rows', 15)
items

Unnamed: 0,item_id,item_name_translated,item_category_id
0,0,!! IN THE POWER OF HAPPINESS (PLAST) D,40
1,1,! ABBYY FineReader 12 Professional Edition Ful...,76
2,2,*** IN THE GLORY OF THE GLORY (UNV) D,40
3,3,*** BLUE WAVE (Univ) D,40
4,4,*** BOX (GLASS) D,40
...,...,...,...
22165,22165,"Nuclear Titbit 2 [PC, Digital Version]",31
22166,22166,Query language 1C: Enterprise [Digital version],54
22167,22167,The query language is 1C: Enterprise 8 (+ CD)....,49
22168,22168,Egg for Little Inu,62


In [15]:
pd.set_option('display.max_rows', None)
cats

Unnamed: 0,item_category_id,item_category_name_translated
0,0,PC - Headsets / Headphones
1,1,Accessories - PS2
2,2,Accessories - PS3
3,3,Accessories - PS4
4,4,Accessories - PSP
5,5,Accessories - PSVita
6,6,Accessories - XBOX 360
7,7,Accessories - XBOX ONE
8,8,Tickets (figure)
9,9,Delivery of goods


In [11]:
itemcat = items.groupby(['item_category_id'])['item_category_id'].count()
itemcat

item_category_id
0      4
1      2
2     75
3     34
4     15
      ..
79     1
80     6
81     7
82     8
83    15
Name: item_category_id, Length: 84, dtype: int64

In [35]:
def get_main_category(string):
        string_ret = string.split("-")[0]
        return (string_ret)
    
cats['MainCategory'] = cats['item_category_name_translated'].apply(get_main_category)

In [25]:
cats

Unnamed: 0,item_category_id,item_category_name_translated,MainCategory
0,0,PC - Headsets / Headphones,PC
1,1,Accessories - PS2,Accessories
2,2,Accessories - PS3,Accessories
3,3,Accessories - PS4,Accessories
4,4,Accessories - PSP,Accessories
5,5,Accessories - PSVita,Accessories
6,6,Accessories - XBOX 360,Accessories
7,7,Accessories - XBOX ONE,Accessories
8,8,Tickets (figure),Tickets (figure)
9,9,Delivery of goods,Delivery of goods


In [53]:
def get_sub_category(string):
    if '-' in string:
        string_ret = string.split("-")[1]
        return (string_ret)
    else:
        return('none')
cats['SubCategory'] = cats['item_category_name_translated'].apply(get_sub_category)

In [54]:
cats

Unnamed: 0,item_category_id,item_category_name_translated,MainCategory,SubCategory
0,0,PC - Headsets / Headphones,PC,Headsets / Headphones
1,1,Accessories - PS2,Accessories,PS2
2,2,Accessories - PS3,Accessories,PS3
3,3,Accessories - PS4,Accessories,PS4
4,4,Accessories - PSP,Accessories,PSP
5,5,Accessories - PSVita,Accessories,PSVita
6,6,Accessories - XBOX 360,Accessories,XBOX 360
7,7,Accessories - XBOX ONE,Accessories,XBOX ONE
8,8,Tickets (figure),Tickets (figure),none
9,9,Delivery of goods,Delivery of goods,none


In [71]:
items

Unnamed: 0,item_id,item_name_translated,item_category_id
0,0,!! IN THE POWER OF HAPPINESS (PLAST) D,40
1,1,! ABBYY FineReader 12 Professional Edition Ful...,76
2,2,*** IN THE GLORY OF THE GLORY (UNV) D,40
3,3,*** BLUE WAVE (Univ) D,40
4,4,*** BOX (GLASS) D,40
...,...,...,...
22165,22165,"Nuclear Titbit 2 [PC, Digital Version]",31
22166,22166,Query language 1C: Enterprise [Digital version],54
22167,22167,The query language is 1C: Enterprise 8 (+ CD)....,49
22168,22168,Egg for Little Inu,62


In [72]:
texts = items['item_name_translated']

In [98]:
bow = CountVectorizer()
texts_bow = bow.fit_transform(texts)
tokens = bow.get_feature_names()
df_items_bow = pd.DataFrame(data=texts_bow.toarray(), index=items['item_id'], columns=tokens)

In [99]:
df_items_bow

Unnamed: 0_level_0,00,000,00002,00003,00005,00006,00007,00008,00009h,00010,...,энд,энигма,эпик,эргагон,эстрада,эсхато,яблоко,янка,ёлка,ёрш
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [105]:
one = 10 in df_items_bow.iloc[:, 14504]
print(one)

True


In [106]:
bow = CountVectorizer(ngram_range=(1, 2))
texts_bow = bow.fit_transform(texts)
tokens = bow.get_feature_names()
df_items_bow_2gram = pd.DataFrame(data=texts_bow.toarray(), index=items['item_id'], columns=tokens)

In [107]:
df_items_bow_2gram

Unnamed: 0_level_0,00,00 07,000,000 dawn,000 kill,000 leagues,000 points,000 space,000 years,00002,...,эстрада 12cd,эсхато,эсхато фирма,яблоко,янка,янка home,ёлка,ёрш,ёрш compact,ёрш баня
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
tfidf = TfidfVectorizer()
texts_tfidf = tfidf.fit_transform(texts)
tokens = tfidf.get_feature_names()
df_items_tfidf = pd.DataFrame(data=texts_tfidf.toarray(), index=items['item_id'], columns=tokens)

In [112]:
df_items_tfidf

Unnamed: 0_level_0,00,000,00002,00003,00005,00006,00007,00008,00009h,00010,...,энд,энигма,эпик,эргагон,эстрада,эсхато,яблоко,янка,ёлка,ёрш
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22165,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22166,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22167,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
22168,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [126]:
tfidf = TfidfVectorizer(ngram_range=(1, 2), dtype='float32')
texts_tfidf = tfidf.fit_transform(texts)
tokens = tfidf.get_feature_names()
df_items_tfidf_2gram = pd.DataFrame(data=texts_tfidf.toarray(), index=items['item_id'], columns=tokens)




MemoryError: Unable to allocate 5.28 GiB for an array with shape (22170, 63885) and data type float32

df_items_tfidf_2gram