### Problem Statement
> A retail company “ABC Private Limited” wants to understand the customer purchase behaviour (specifically, purchase amount) against various products of different categories. They have shared purchase summary of various customers for selected high volume products from last month.
The data set also contains customer demographics (age, gender, marital status, city_type, stay_in_current_city), product details (product_id and product category) and Total purchase_amount from last month.
Now, they want to build a model to predict the purchase amount of customer against various products which will help them to create personalized offer for customers against different products.

|Variable|Definition|
|:---- |:----
|User_ID|User ID|
|Product_ID|Product ID|
|Gender|Sex of User|
|Age|Age in bins|
|Occupation|Occupation (Masked)|
|City_Category|Category of the City (A,B,C)|
|Stay_In_Current_City_Years|Number of years stay in current city|
|Marital_Status|Marital Status|
|Product_Category_1|Product Category (Masked)|
|Product_Category_2|Product may belongs to other category also (Masked)|
|Product_Category_3|Product may belongs to other category also (Masked)|
|Purchase|Purchase Amount (Target Variable)|

### Evaluation
Submissions are scored on the root mean squared error (RMSE). RMSE is very common and is a suitable general-purpose error metric. Compared to the Mean Absolute Error, RMSE punishes large errors:



In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import pandas_profiling
import dexplot as dxp
from matplotlib import pyplot as plt

In [3]:
import scipy.stats as stats
from scipy.stats import chi2_contingency

In [4]:
pkmn_type_colors = ['#78C850',  # Grass
                    '#F08030',  # Fire
                    '#6890F0',  # Water
                    '#A8B820',  # Bug
                    '#A8A878',  # Normal
                    '#A040A0',  # Poison
                    '#F8D030',  # Electric
                    '#E0C068',  # Ground
                    '#EE99AC',  # Fairy
                    '#C03028',  # Fighting
                    '#F85888',  # Psychic
                    '#B8A038',  # Rock
                    '#705898',  # Ghost
                    '#98D8D8',  # Ice
                    '#7038F8',  # Dragon
                   ]

In [5]:
train_file = Path.cwd().joinpath('datasource/train.csv')
test_file =  Path.cwd().joinpath('datasource/test.csv')

In [6]:
train_df = pd.read_csv(train_file)

In [7]:
test_df = pd.read_csv(test_file)

In [8]:
def extended_describe(dataframe):
    extended_describe_df= dataframe.describe(include='all').T 
    extended_describe_df['null_count']= dataframe.isnull().sum()
    extended_describe_df['unique_count'] = dataframe.apply(lambda x: len(x.unique()))
    return extended_describe_df 

In [9]:
extended_describe(train_df)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count,unique_count
User_ID,550068,,,,1003030.0,1727.59,1000000.0,1001520.0,1003080.0,1004480.0,1006040.0,0,5891
Product_ID,550068,3631.0,P00265242,1880.0,,,,,,,,0,3631
Gender,550068,2.0,M,414259.0,,,,,,,,0,2
Age,550068,7.0,26-35,219587.0,,,,,,,,0,7
Occupation,550068,,,,8.07671,6.52266,0.0,2.0,7.0,14.0,20.0,0,21
City_Category,550068,3.0,B,231173.0,,,,,,,,0,3
Stay_In_Current_City_Years,550068,5.0,1,193821.0,,,,,,,,0,5
Marital_Status,550068,,,,0.409653,0.49177,0.0,0.0,0.0,1.0,1.0,0,2
Product_Category_1,550068,,,,5.40427,3.93621,1.0,1.0,5.0,8.0,20.0,0,20
Product_Category_2,376430,,,,9.84233,5.08659,2.0,5.0,9.0,15.0,18.0,173638,18


In [10]:
extended_describe(test_df)

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max,null_count,unique_count
User_ID,233599,,,,1003030.0,1726.5,1000000.0,1001530.0,1003070.0,1004480.0,1006040.0,0,5891
Product_ID,233599,3491.0,P00265242,829.0,,,,,,,,0,3491
Gender,233599,2.0,M,175772.0,,,,,,,,0,2
Age,233599,7.0,26-35,93428.0,,,,,,,,0,7
Occupation,233599,,,,8.08541,6.52115,0.0,2.0,7.0,14.0,20.0,0,21
City_Category,233599,3.0,B,98566.0,,,,,,,,0,3
Stay_In_Current_City_Years,233599,5.0,1,82604.0,,,,,,,,0,5
Marital_Status,233599,,,,0.41007,0.491847,0.0,0.0,0.0,1.0,1.0,0,2
Product_Category_1,233599,,,,5.27654,3.73638,1.0,1.0,5.0,8.0,18.0,0,18
Product_Category_2,161255,,,,9.84959,5.09494,2.0,5.0,9.0,15.0,18.0,72344,18


In [11]:
check

NameError: name 'check' is not defined

In [19]:
prd_city_mapper  = train_df.groupby(['Product_Category_1','City_Category'])['Purchase'].agg([np.mean]).rank(ascending= False).to_dict()

In [29]:
prd_city_mapper['mean'].get((1,'A'))

22.0

In [20]:
def prd_city(row):
    key = row['Product_Category_1'],row['City_Category']
    print(key)
    return prd_city_mapper.get(key,-999)

In [21]:
c= train_df.sample(1)

In [22]:
c['nf'] = c[['Product_Category_1','City_Category']].apply(prd_city,axis=1)

(1, 'B')


In [25]:
prd_city_mapper.get((1, 'B'))

In [None]:
set(train_df['Occupation'].unique())

In [None]:
dxp.hist(val='Purchase', data=train_df, split='Gender',figsize=(3,2),split_order='desc',bins=20)

In [None]:
dxp.hist(val='Purchase', data=train_df, col='Age',bins=20)

In [None]:
dxp.hist(val='Purchase', data=train_df, col='City_Category',split_order='desc',bins=20)

In [None]:
dxp.hist(val='Purchase', data=train_df, col='Stay_In_Current_City_Years',split_order='desc',bins=20)

In [None]:
dxp.hist(val='Purchase', data=train_df, col='Marital_Status',split_order='desc',bins=20)

In [None]:
dxp.bar(x='Product_Category_1', y='Purchase', data=train_df, aggfunc='mean', size=.7,split='City_Category')

In [None]:
from sklearn.preprocessing import FunctionTransformer

In [None]:
# from sklearn.preprocessing import FunctionTransformer
# from sklearn.pipeline import Pipeline


# class DataSetPreprocessor(object):
    
#     def __init__(self,train_df,test_df):
        
#         self.train_df = train_df
#         self.test_df = test_df
#         self._steps = []
#         self._pipeline= None
        
#     def add_step(self, func, inverse_func=None,validate=False,kw_args=None):
#         """Add step"""
#         next_ft = FunctionTransformer(func,
#                                       inverse_func=inverse_func,
#                                       validate=validate,
#                                       kw_args =kw_args,
#                                      )
#         self._steps.append((func.__name__,next_ft))
    
#     def add_steps(self,funcs):
#         for func in funcs:
#             f,invf,validate,kw_args =func 
#             self.add_step(f,invf,validate,kw_args)
            
#     def fit_steps(self):
#         """ Fits all steps """
#         if not self._pipeline:
#             self._pipeline = Pipeline(self._steps)
        
#         self.train_df = self._pipeline.fit_transform(self.train_df)
#         self.test_df = self._pipeline.transform(self.test_df)
#         return self.train_df ,self.test_df
    
        
        

In [None]:
# def merge_gender_and_city(df):
#     df['Gender&Category'] = df['Gender'] + df['City_Category']
#     return df

# def get_mapper(df,feature,target):
#     return df.groupby([feature])[target].sum().rank(ascending= False).to_dict()

# def age_transformer(df,age_mapper):
#     df['Age_TR']= df['Age'].map(lambda x:age_mapper.get(x,-1)).astype('int')
#     return df

# age_mapper = get_mapper(train_df,'Age','Purchase')

# dsp  = DataSetPreprocessor(train_df,test_df)

# dsp.add_steps([(merge_gender_and_city,None, False,None),
#                (age_transformer,None, False,dict(age_mapper=age_mapper))
#                 ])

# train_df,test_df = dsp.fit_steps()

In [None]:
train_df.head()

In [None]:
test_df.head()