based on : https://github.com/shockwave22/Walmart-Trip-Type-Classification/blob/master/EDA_fiveClasses%20with%20Markdown.ipynb

- ~ 96k store visits, segmented into 38 trip types
- Training and testing data included >1.2 million observations with 6 features:
    - Visit Number, Weekday, UPC, Scan Count, Department Description, Fineline Number
- Using the `6 provided features` the team was tasked with creating the best model to accurately classify the trips into their proper trip type category
- Challenges with the data
    - Each observation represented an `item` rather than a visit
    - Needed to group observations `by visit` to classify the trip
    - Number of unique `UPCs and Fineline` Numbers prevented the creation of dummy variables - `resulting data set was too large to process`
    - Instead, `used the Department Description` to create dummy variables

In [261]:
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pylab as plt
import warnings
warnings.filterwarnings('ignore')

In [262]:
# Load the raw training and test tables from the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [263]:
# Work on a deep copy so the original `train` frame is preserved.
train_df = train.copy(deep=True)
train_df.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
647049,39,191346,Sunday,32390000000.0,1,PHARMACY OTC,1118.0
647050,39,191346,Sunday,7874205000.0,1,FROZEN FOODS,1752.0
647051,39,191346,Sunday,4072.0,1,PRODUCE,4170.0
647052,8,191347,Sunday,4190008000.0,1,DAIRY,1512.0
647053,8,191347,Sunday,3800060000.0,1,GROCERY DRY GOODS,3600.0


In [264]:
# Work on a deep copy so the original `test` frame is preserved.
test_df = test.copy(deep=True)
test_df.tail()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
653641,191348,Sunday,66572110000.0,1,BATH AND SHOWER,1505.0
653642,191348,Sunday,88181390000.0,1,BATH AND SHOWER,1099.0
653643,191348,Sunday,4282557000.0,1,MENS WEAR,8220.0
653644,191348,Sunday,80469190000.0,1,SWIMWEAR/OUTERWEAR,114.0
653645,191348,Sunday,7871536000.0,1,MENS WEAR,4923.0


In [265]:
# Among the product-related columns, only 'DepartmentDescription' is kept.
kept_columns = ['TripType', 'VisitNumber', 'Weekday', 'ScanCount', 'DepartmentDescription']
train_df = train_df[kept_columns]

In [266]:
train_df.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription
647049,39,191346,Sunday,1,PHARMACY OTC
647050,39,191346,Sunday,1,FROZEN FOODS
647051,39,191346,Sunday,1,PRODUCE
647052,8,191347,Sunday,1,DAIRY
647053,8,191347,Sunday,1,GROCERY DRY GOODS


### Encoding the Weekday columns
- We use the `map` method to replace each weekday name with a number, so the column holds values in [1,2,3,4,5,6,7]
    - a numeric encoding makes it possible to compute and analyse the `mode`

In [267]:
# Encode weekday names as integers: Monday -> 1 through Sunday -> 7.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_codes = {day: code for code, day in enumerate(weekday_names, start=1)}
train_df['Weekday'] = train_df['Weekday'].map(weekday_codes)

In [268]:
train_df.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription
647049,39,191346,7,1,PHARMACY OTC
647050,39,191346,7,1,FROZEN FOODS
647051,39,191346,7,1,PRODUCE
647052,8,191347,7,1,DAIRY
647053,8,191347,7,1,GROCERY DRY GOODS


### Encoding the DepartmentDescription column

In [269]:
# Build one indicator column per department, then weight each row by its
# ScanCount (the number of items scanned) so the department column holds the
# quantity for that row instead of a plain 0/1 flag.
ItemNumber = train_df['ScanCount']
dummies_desc = pd.get_dummies(train_df['DepartmentDescription'])
train_df[dummies_desc.columns] = dummies_desc.mul(ItemNumber, axis=0)
train_df.tail()

Unnamed: 0,TripType,VisitNumber,Weekday,ScanCount,DepartmentDescription,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
647049,39,191346,7,1,PHARMACY OTC,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647050,39,191346,7,1,FROZEN FOODS,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647051,39,191346,7,1,PRODUCE,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647052,8,191347,7,1,DAIRY,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
647053,8,191347,7,1,GROCERY DRY GOODS,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Create a binary column, Return_bin, that tells us whether the customer returned (refunded) a product.

In [270]:
# Binarize ScanCount into a `Return` flag: 1.0 when the row is a refund
# (negative ScanCount), 0.0 otherwise.
# The original quantity is not lost: it is already stored in the row's
# department column.
# A single vectorized comparison replaces the two masked assignments, so the
# column is always float regardless of whether any refund is present.
train_df['Return'] = (train_df['ScanCount'] < 0).astype(float)

In [271]:
train_df['Return'].values

array([1., 0., 0., ..., 0., 0., 0.])

### Define a function that builds the per-visit (groupby) DataFrame

In [272]:
def challenge_feature(train_df):
    """Collapse item-level rows into one row per ``VisitNumber``.

    ``TripType``, ``Weekday`` and ``Return`` are reduced with ``max``; every
    department column is reduced with ``sum``.

    Known columns that are absent from ``train_df`` are skipped rather than
    raising, so the function also accepts frames without a ``TripType`` label
    or without every department column. When all known columns are present the
    result has the same columns, in the same order, as before.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Item-level frame containing ``VisitNumber`` plus any of the visit-level
        and department columns listed in the function body.

    Returns
    -------
    pandas.DataFrame
        One row per ``VisitNumber`` (used as the index), visit-level columns
        first, followed by the department columns.

    Raises
    ------
    KeyError
        If none of the known columns is present. A missing ``VisitNumber``
        column is reported by pandas' own ``groupby`` error.
    """
    visit_level_columns = ('TripType', 'Weekday', 'Return')
    department_columns = (
        '1-HR PHOTO', 'ACCESSORIES',
        'AUTOMOTIVE', 'BAKERY', 'BATH AND SHOWER', 'BEAUTY', 'BEDDING',
        'BOOKS AND MAGAZINES', 'BOYS WEAR', 'BRAS & SHAPEWEAR',
        'CAMERAS AND SUPPLIES', 'CANDY, TOBACCO, COOKIES', 'CELEBRATION',
        'COMM BREAD', 'CONCEPT STORES', 'COOK AND DINE', 'DAIRY', 'DSD GROCERY',
        'ELECTRONICS', 'FABRICS AND CRAFTS', 'FINANCIAL SERVICES',
        'FROZEN FOODS', 'FURNITURE', 'GIRLS WEAR, 4-6X  AND 7-14',
        'GROCERY DRY GOODS', 'HARDWARE', 'HEALTH AND BEAUTY AIDS', 'HOME DECOR',
        'HOME MANAGEMENT', 'HORTICULTURE AND ACCESS',
        'HOUSEHOLD CHEMICALS/SUPP', 'HOUSEHOLD PAPER GOODS',
        'IMPULSE MERCHANDISE', 'INFANT APPAREL', 'INFANT CONSUMABLE HARDLINES',
        'JEWELRY AND SUNGLASSES', 'LADIES SOCKS', 'LADIESWEAR',
        'LARGE HOUSEHOLD GOODS', 'LAWN AND GARDEN', 'LIQUOR,WINE,BEER',
        'MEAT - FRESH & FROZEN', 'MEDIA AND GAMING', 'MENS WEAR', 'MENSWEAR',
        'OFFICE SUPPLIES', 'OPTICAL - FRAMES', 'OPTICAL - LENSES',
        'OTHER DEPARTMENTS', 'PAINT AND ACCESSORIES', 'PERSONAL CARE',
        'PETS AND SUPPLIES', 'PHARMACY OTC', 'PHARMACY RX',
        'PLAYERS AND ELECTRONICS', 'PLUS AND MATERNITY', 'PRE PACKED DELI',
        'PRODUCE', 'SEAFOOD', 'SEASONAL', 'SERVICE DELI', 'SHEER HOSIERY',
        'SHOES', 'SLEEPWEAR/FOUNDATIONS', 'SPORTING GOODS',
        'SWIMWEAR/OUTERWEAR', 'TOYS', 'WIRELESS',
    )

    available = set(train_df.columns)
    # String aggregator names select pandas' built-in group reductions;
    # passing np.max / np.sum for this purpose is deprecated in pandas 2.x.
    aggregations = {
        column: 'max' for column in visit_level_columns if column in available
    }
    aggregations.update(
        (column, 'sum') for column in department_columns if column in available
    )
    if not aggregations:
        raise KeyError(
            'train_df contains none of the visit-level or department columns'
        )

    grouped_df = train_df.groupby(by='VisitNumber').agg(aggregations)
    return grouped_df

### Rename the columns so they are easier to identify

In [273]:
train_df = challenge_feature(train_df)
# The grouped frame has no 'ScanCount' or 'Weekday_encoded' column, so `Return`
# is the only column that needs renaming.
train_df = train_df.rename(columns={'Return': 'Return_bin'})
train_df.tail()

Unnamed: 0_level_0,TripType,Weekday,Return_bin,1-HR PHOTO,ACCESSORIES,AUTOMOTIVE,BAKERY,BATH AND SHOWER,BEAUTY,BEDDING,...,SEAFOOD,SEASONAL,SERVICE DELI,SHEER HOSIERY,SHOES,SLEEPWEAR/FOUNDATIONS,SPORTING GOODS,SWIMWEAR/OUTERWEAR,TOYS,WIRELESS
VisitNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
191343,25,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
191344,22,7,0.0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,0,0,0,1
191345,39,7,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
191346,39,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
191347,8,7,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [274]:
train_df.shape

(95674, 71)

In [276]:
# Save the per-visit feature table; the VisitNumber index is written as the first column.
# NOTE(review): the output name has no ".csv" extension — confirm that is intended.
train_df.to_csv('train_new')

# Split train_df into train and hold-out test sets for validation

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# Hold out 20% of the visits (shuffled, fixed seed) for evaluation.
# NOTE: this rebinds `test_df`, replacing the earlier copy of test.csv.
train_df,test_df = train_test_split(train_df,test_size=0.2,shuffle=True,random_state=4)

In [17]:
train_df.shape,test_df.shape

((76539, 71), (19135, 71))

In [18]:
# Total number of missing cells in each split.
train_df.isna().sum().sum(), test_df.isna().sum().sum()

(0, 0)

The zero counts above are expected, because every column of the DataFrame is the result of a `GROUPBY` aggregation.

In [19]:
# Features are every column except the label. Selecting by name avoids relying
# on TripType being the first column, and `drop` returns a new frame so train_X
# is independent of train_df (matching how test_X is built).
train_X = train_df.drop(columns='TripType')
train_y = train_df['TripType'].to_numpy()

In [20]:
# Features are every column except the label. Selecting by name avoids relying
# on TripType being the first column; `drop` returns a new frame, so test_X is
# still an independent copy.
test_X = test_df.drop(columns='TripType')
test_y = test_df['TripType'].to_numpy()