In [4]:
import gc
import os
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from tqdm import tqdm_notebook

%matplotlib inline


def downcast_dtypes(df):
    """
    Shrink 64-bit numeric columns of `df` to their 32-bit counterparts.

    Columns whose dtype is `float64` are cast to `float32` and columns whose
    dtype is `int64` are cast to `int32`; every other column is left alone.
    The frame is modified in place and the same object is returned.
    """
    # (dtype to look for, dtype to cast to)
    conversions = (("float64", np.float32), ("int64", np.int32))

    for wide_dtype, narrow_dtype in conversions:
        wide_cols = [name for name in df if df[name].dtype == wide_dtype]
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df


In [2]:
# List the files in the data folder. The path is a raw string so the backslashes
# are taken literally: without the `r` prefix, "\U" starts a unicode escape and
# the cell is a SyntaxError on Python 3.
print(os.listdir(r"C:\Users\JaiminPat\Downloads\KaggleCoursera"))


['.ipynb_checkpoints', 'items.csv', 'item_categories.csv', 'KaggleCoursera.ipynb', 'sales_train.csv_2.gz', 'sample_submission.csv.gz', 'shops.csv', 'submission.csv', 'test.csv', 'test.csv.gz', 'Untitled.ipynb']


In [3]:

# Folder holding the competition files. Raw string: backslashes stay literal, so
# neither "\U" (invalid unicode escape on Python 3) nor "\t" (tab) can corrupt the path.
DATA_DIR = r'C:\Users\JaiminPat\Downloads\KaggleCoursera'

# Lookup tables.
items_df = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
shops_df = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
icats_df = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))

# Gzip-compressed training sales and sample submission, plus the plain test file.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv_2.gz'), compression='gzip', header=0, sep=',', quotechar='"')
smpsb_df = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv.gz'), compression='gzip', header=0, sep=',', quotechar='"')
test_df  = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
# Any results you write to the current directory are saved as output.


In [None]:
# (rows, columns) of the training sales table.
train_df.shape

In [None]:
# First few training sales records.
train_df.head()

In [None]:
index_cols = ['date_block_num', 'shop_id', 'item_id']

# Sum `item_cnt_day` into one row per (date_block_num, shop_id, item_id).
# The [(name, func)] form names the result column 'target' under 'item_cnt_day',
# giving the same two-level column as the nested-dict renamer did; the nested
# dict itself was deprecated in pandas 0.20 and raises SpecificationError since 1.0.
train = train_df.groupby(index_cols, as_index=False).agg({'item_cnt_day': [('target', 'sum')]})

In [None]:
# Preview of the per-(date_block_num, shop_id, item_id) totals.
train.head()

In [None]:
# Preview of the test rows.
test_df.head()

In [None]:
dates = train['date_block_num']
to_drop_cols = ['target']

# Blocks before 33 are used for fitting; block 33 is held out for evaluation.
X_train = train.loc[dates < 33]
X_test = train.loc[dates == 33]

# The summed counts sit under the two-level column ('item_cnt_day', 'target').
y_train = X_train.item_cnt_day.target
y_test = X_test.item_cnt_day.target

In [None]:
X_cols = ['shop_id', 'item_id']
lr = LinearRegression()
lr.fit(X_train[X_cols], y_train)
predict = lr.predict(X_test[X_cols])

In [None]:
r2_score(y_test, predict)

In [None]:
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X_train[X_cols], y_train)
clf_predict = clf.predict(X_test[X_cols])

In [None]:
r2_score(y_test, clf_predict)

In [None]:
sales = train_df

# Raw-string folder path: backslashes stay literal, so "\U" cannot be read as a
# unicode escape (a SyntaxError on Python 3).
DATA_DIR = r'C:\Users\JaiminPat\Downloads\KaggleCoursera'
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
item_cats = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))

In [None]:
# The grid/aggregation cell that follows reads `date_block_num` and `item_cnt_day`
# from `sales`, so `sales` must stay bound to the training sales.
# NOTE(review): test_df presumably lacks both columns (it holds the rows to be
# predicted), which made the previous rebinding to test_df fail with KeyError - confirm.
sales = train_df


In [None]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    in_block = sales['date_block_num'] == block_num  # row mask, computed once per month
    cur_shops = sales.loc[in_block, 'shop_id'].unique()
    cur_items = sales.loc[in_block, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sums of `item_cnt_day` at three granularities, each left-joined onto
# the grid; combinations without sales get 0.
# `groupby(...)[col].sum().rename(...)` yields flat column names directly and
# replaces the nested-dict renamer in `.agg`, which pandas removed in 1.0.
aggregates = [
    (index_cols,                    'target'),       # shop-item-month
    (['shop_id', 'date_block_num'], 'target_shop'),  # shop-month
    (['item_id', 'date_block_num'], 'target_item'),  # item-month
]
all_data = grid
for group_keys, total_name in aggregates:
    gb = (sales.groupby(group_keys, as_index=False)['item_cnt_day']
               .sum()
               .rename(columns={'item_cnt_day': total_name}))
    all_data = pd.merge(all_data, gb, how='left', on=group_keys).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
del grid, gb
gc.collect();

In [None]:
# Every non-key column gets lagged copies.
cols_to_rename = list(all_data.columns.difference(index_cols))

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Moving the month forward by `month_shift` means the merge below attaches,
    # to each row, the values observed `month_shift` months earlier.
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows with no counterpart `month_shift` months back get 0 for the lag columns.
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12]

# List of all lagged features. Match the full '_lag_<n>' suffix rather than only
# the last character, so multi-digit lags are recognised for what they are.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

In [None]:
# Names of all columns other than the three key columns in `index_cols`.
list(all_data.columns.difference(index_cols)) 


In [None]:
# Independent copy of the feature table.
train_all_data = all_data.copy()

In [None]:
train_all_data.head()

In [None]:
test = test_df

In [None]:
# Left-join the feature table onto the test rows by (item_id, shop_id); test rows
# without a match get 0 in every added column.
# NOTE(review): train_all_data appears to hold one row per month for each
# shop/item pair, so a test row may be repeated once per month here - confirm
# this fan-out is intended.
test = pd.merge(test, train_all_data, on=['item_id','shop_id'], how='left').fillna(0)


In [None]:
test.head()

In [None]:
# Linear regression on shop and item ids only, using whatever X_train / y_train /
# X_test are currently defined.
X_cols = ['shop_id', 'item_id']
lr = LinearRegression().fit(X_train[X_cols], y_train)  # fit() returns the estimator itself
predict = lr.predict(X_test[X_cols])

In [None]:
dates = all_data['date_block_num']

In [None]:
last_block = 33
X_train = all_data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <  last_block, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

In [None]:
lr = LinearRegression()
lr.fit(X_train, y_train)
predict = lr.predict(X_test)

In [None]:
# NOTE(review): `sub_final` is first assigned much further down in this notebook,
# so on a fresh kernel run top-to-bottom this line raises NameError.
sub_final

In [None]:
# First 15 test rows.
test_df.head(15)

In [None]:
# Total `item_cnt_day` per (shop_id, item_id) over all training rows.
# The [(name, func)] list names the result 'tot_cnt' under 'item_cnt_day', the
# same layout the nested-dict renamer produced; the nested dict was deprecated
# in pandas 0.20 and raises SpecificationError since 1.0.
agr = {
    'item_cnt_day': [('tot_cnt', 'sum')]
}
grp_by = train_df.groupby(['shop_id', 'item_id']).agg(agr).reset_index()
grp_by.head()

In [None]:
# Number of distinct shops in the training sales.
train_df['shop_id'].nunique()

In [None]:
# Number of distinct items in the training sales.
train_df['item_id'].nunique()

In [None]:
# Number of distinct items in the item lookup table.
items_df['item_id'].nunique()

In [None]:
# Number of distinct items in the test rows.
test_df['item_id'].nunique()

In [None]:
not_in_train_items = test_df[~test_df['item_id'].isin(train_df['item_id'])]['item_id']

In [None]:
items_df[items_df.item_id.isin(np.array(not_in_train_items))].head()

In [None]:
train = train_df
test = test_df
#train_df[train_df['shop_id'].isin(test_df['shop_id'])]

In [None]:
train['shop_item'] = train['shop_id'].map(str) + '_' + train['item_id'].map(str)
test['shop_item'] = test['shop_id'].map(str) + '_' + test['item_id'].map(str)


In [None]:
test_in_train = train[train['shop_item'].isin(test['shop_item'])]


In [None]:
test_in_train.head()

In [None]:
# Total `item_cnt_day` per shop/item pair, restricted to pairs present in the test rows.
# The [(name, func)] list names the result 'tot_items' under 'item_cnt_day', the
# same layout the nested-dict renamer produced; the nested dict was deprecated
# in pandas 0.20 and raises SpecificationError since 1.0.
agrr = {
    'item_cnt_day' : [('tot_items', 'sum')]
}
tit = test_in_train.groupby('shop_item').agg(agrr).reset_index()

In [None]:
# Spot check: aggregated total for one shop/item pair.
tit[tit.shop_item == '25_2574']

In [None]:
tit.shape

In [None]:
# Spot check: the test row(s) for one shop/item pair.
test[test.shop_item == '10_10003']

In [None]:
# Number of distinct shop/item pairs among the test rows.
test.shop_item.nunique()

In [None]:
tit.head()

In [None]:
test.head()

In [None]:
# Number of distinct shop/item pairs that have a training total.
tit.shop_item.nunique()

In [None]:
# Put the test rows and the per-pair totals side by side, aligned on the
# `shop_item` index and keeping labels found in either frame (outer join);
# the index is then turned back into an ordinary column.
test_tit_join = pd.concat([test.set_index('shop_item'), tit.set_index('shop_item')], axis=1, join='outer').reset_index()

In [None]:
# Spot check: training rows for one shop/item pair.
train[train.shop_item == '10_10005']

In [None]:
train.shape

In [None]:
# Number of distinct shop/item pairs in the training rows.
train.shop_item.nunique()

In [None]:
# Total `item_cnt_day` per shop/item pair over all training rows.
# The [(name, func)] list keeps the two-level ('item_cnt_day', 'tot') column that
# later cells read as `.item_cnt_day.tot`, without the nested-dict renamer
# (deprecated in pandas 0.20, raises SpecificationError since 1.0).
agre = {
    'item_cnt_day': [('tot', 'sum')]
}
train_mean_item_cnt_shop_item = train.groupby('shop_item').agg(agre).reset_index()

In [None]:
# Mean of the per-pair totals across all shop/item pairs.
train_mean_item_cnt_shop_item.item_cnt_day.tot.mean()

In [None]:
test_tit_join.head()

In [None]:
test.head()

In [None]:
# Align the test rows with the joined table on the shop/item key, keeping only
# labels present in both (inner join), then restore the key as a column.
# NOTE(review): this relies on `reset_index()` having named the key column
# 'index' in test_tit_join - confirm for the pandas version in use.
sub = pd.concat([test.set_index('shop_item'), test_tit_join.set_index('index')], axis=1, join='inner').reset_index()    

In [None]:
# Snapshot of `sub` as a 2-D numpy array, for positional column access below.
arr = np.array(sub)

In [None]:
sub.shape

In [None]:
# Use the value at column position 7 of the snapshot as the prediction.
# NOTE(review): position 7 is presumably the aggregated total from `tit`; this
# depends on the exact column order of `sub` - confirm, or select by name.
sub['pred'] = arr[:,7]

In [None]:
# Rows without a prediction get the mean per-pair total from the training data.
mean = train_mean_item_cnt_shop_item.item_cnt_day.tot.mean()
sub.loc[sub.pred.isnull(), 'pred'] = mean


In [None]:
# Snapshot of `sub` (now including `pred`) as a 2-D numpy array.
sub_final_arr = np.array(sub)

In [None]:
# Start the submission frame from column position 1 of the snapshot.
# NOTE(review): position 1 is presumably the test `ID` column - confirm against
# the column order of `sub`.
data = {'ID' : sub_final_arr[:,1]}
sub_final = pd.DataFrame(data)  


In [None]:
sub.head()

In [None]:
# Attach the predictions; assignment aligns on the row index of both frames.
# NOTE(review): `pred` appears to come from totals summed over all training rows
# of a pair rather than a single month - confirm that is the intended value
# for `item_cnt_month`.
sub_final['item_cnt_month'] = sub['pred']

In [None]:
# Largest value in the `date` column.
# NOTE(review): the CSV was read without date parsing, so this is presumably a
# string comparison rather than the latest calendar date - confirm.
train.date.max()

In [None]:
# Write the submission (ID, item_cnt_month) to the working directory without the row index.
sub_final.to_csv('submission.csv', index=False)