# Import data

In [28]:
#Black Friday data set
    #Desc: Analytics Vidhya (also on Kaggle) competition set - some columns have masked values

#Potential Approaches: 
    #1) Predict checkout size
    #2) Predict age of customer/gender of customer
    #3) Predict product category of purchase

        
# Loading the data from a csv
import pandas as pd 
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
%matplotlib inline

#import csv (already know it is clean)
data = pd.read_csv("/Users/emag3/Documents/Code/Python/Black Friday Competition/BlackFriday.csv") 

#add new column
data['Purchase_Size']=data.Purchase/100

#preview first 5 rows
data.head()

#data.Product_ID.nunique()
#data.describe()


#remove Purchase

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase,Purchase_Size
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370,83.7
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200,152.0
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422,14.22
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057,10.57
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969,79.69


# Deal with Categorical Variables
#Gender: 1 or 0
#Age: ordinal, therefore sortable categories of arbitrary number values
#City_Category: dummy variables (drop_first=True)

#careful w/ dummy variables
    #can lead to multicollinearity in your independent variables (def: 2 or more variables are highly correlated)
    #solved by using 'drop_first=True'

In [29]:
#check data types
data.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
Purchase_Size                 float64
dtype: object

In [31]:
# Do more colu
dummyData = pd.get_dummies(data, columns = ['Gender','Occupation','City_Category','Product_Category_1','Product_Category_2','Product_Category_3'], drop_first=True)
dummyData.head()

Unnamed: 0,User_ID,Product_ID,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Purchase_Size,Gender_M,Occupation_1,Occupation_2,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,P00069042,0-17,2,0,8370,83.7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,P00248942,0-17,2,0,15200,152.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,P00087842,0-17,2,0,1422,14.22,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,P00085442,0-17,2,0,1057,10.57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,P00285442,55+,4+,0,7969,79.69,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
dummyData.Age.unique()
dummyData.Stay_In_Current_City_Years.unique()

array(['2', '4+', '3', '1', '0'], dtype=object)

# Ranked numerical values for ordinal variables

In [37]:
dummyData2 = dummyData

#recode --> Age
#dummyData2 = pd.DataFrame({'Age': ['0-17', '18-25','26-35','36-45','46-50', '51-55', '55+']})
dummyData2['Age'] = dummyData2['Age'].astype('category')
dummyData2['Age'] = dummyData2['Age'].cat.reorder_categories(['0-17', '18-25','26-35','36-45','46-50', '51-55', '55+'], ordered=True)
dummyData2['Age'] = dummyData2['Age'].cat.codes
#print(dummyData2['Age'])

#recode --> stay_in_current_city_years
#dummyData2 = pd.DataFrame({'Stay_In_Current_City_Years': ['0', '1', '2', '3', '4+']})
dummyData2['Stay_In_Current_City_Years'] = dummyData2['Stay_In_Current_City_Years'].astype('category')
dummyData2['Stay_In_Current_City_Years'] = dummyData2['Stay_In_Current_City_Years'].cat.reorder_categories(['0', '1', '2', '3', '4+'], ordered=True)
dummyData2['Stay_In_Current_City_Years'] = dummyData2['Stay_In_Current_City_Years'].cat.codes
#print(dummyData2['Stay_In_Current_City_Years'])

dummyData2.head()

Unnamed: 0,User_ID,Product_ID,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Purchase_Size,Gender_M,Occupation_1,Occupation_2,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,P00069042,0,2,0,8370,83.7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,P00248942,0,2,0,15200,152.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,P00087842,0,2,0,1422,14.22,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,P00085442,0,2,0,1057,10.57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,P00285442,6,4,0,7969,79.69,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
dummyData2.head()

Unnamed: 0,User_ID,Product_ID,Age,Stay_In_Current_City_Years,Marital_Status,Purchase,Purchase_Size,Gender_M,Occupation_1,Occupation_2,...,Product_Category_3_9.0,Product_Category_3_10.0,Product_Category_3_11.0,Product_Category_3_12.0,Product_Category_3_13.0,Product_Category_3_14.0,Product_Category_3_15.0,Product_Category_3_16.0,Product_Category_3_17.0,Product_Category_3_18.0
0,1000001,P00069042,0,2,0,8370,83.7,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1000001,P00248942,0,2,0,15200,152.0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1000001,P00087842,0,2,0,1422,14.22,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1000001,P00085442,0,2,0,1057,10.57,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1000002,P00285442,6,4,0,7969,79.69,1,0,0,...,0,0,0,0,0,0,0,0,0,0


# Split into training (2/3) and testing (1/3) sets

In [39]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

train, test = train_test_split(dummyData2, test_size=0.3)

ModuleNotFoundError: No module named 'sklearn'