# Black Friday 

### Analysis applied to determine a shopper's gender from their Black Friday purchase profile

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline


### Read the CSV file

In [27]:
# Load the raw Black Friday transactions into a DataFrame
csv_path = 'BlackFriday.csv'
df = pd.read_csv(csv_path)

In [28]:
# Preview the first five rows of the raw data
df.head(n=5)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


### Feature engineering: cleaning, label encoding, and binning the purchase amount

In [29]:
# Replace missing values with 0. In the preview above the NaNs sit in the
# Product_Category_2/3 columns, where the author treats a blank as
# "this category does not exist".
df = df.fillna(0)

# Drop the identifier and product-category columns; they are not used as features.
df = df.drop(columns=['User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3'])

# Collapse the open-ended "4+" bucket to 4 and cast the whole column to int.
# Replacing "4+" with the integer 4 alone would leave the column holding a
# mix of strings ('0'..'3') and ints, which is fragile for downstream numeric use.
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].replace('4+', '4').astype(int)
        

In [30]:
# Bucket every purchase amount into the upper bound of its 5000-wide range.
# Upper bounds are inclusive: an amount of exactly 10000 belongs to the 10000
# bucket. (The previous strict "<" / ">" comparisons let amounts of exactly
# 10000, 15000, 20000 or 25000 fall through to the 30000 catch-all bucket.)
PURCHASE_CEILINGS = (5000, 10000, 15000, 20000, 25000)
PURCHASE_OVERFLOW = 30000


def purchase_bucket(amount, ceilings=PURCHASE_CEILINGS, overflow=PURCHASE_OVERFLOW):
    """Return the smallest ceiling that is >= amount, or `overflow` if none is.

    Parameters
    ----------
    amount : number
        Raw purchase amount.
    ceilings : sequence of numbers, ascending
        Inclusive upper bounds of the buckets.
    overflow : number
        Label used for amounts above the largest ceiling.
    """
    for ceiling in ceilings:
        if amount <= ceiling:
            return ceiling
    return overflow


# One bucket label per row, in the same order as df['Purchase'].
col = [purchase_bucket(amount) for amount in df['Purchase']]



In [31]:
# Swap the raw purchase amounts for their bucket labels
df = df.assign(Purchase=col)



In [32]:
# Preview the cleaned frame with the binned Purchase column
df.head(n=5)

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase
0,F,0-17,10,A,2,0,10000
1,F,0-17,10,A,2,0,20000
2,F,0-17,10,A,2,0,5000
3,F,0-17,10,A,2,0,5000
4,M,55+,16,C,4,0,10000


In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

# Define a class that can do LabelEncoder for multiple columns

class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is passed through its own fresh ``LabelEncoder``,
    so integer codes are derived independently per column.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column of X".
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the encoders are fitted in ``transform``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns replaced by integer codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; it is copied, never modified in place.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` in which each column named in ``self.columns`` (or
            every column when ``self.columns`` is None) holds the output of
            ``LabelEncoder().fit_transform`` for that column.

        Notes
        -----
        A new encoder is fitted on every call, so the codes of two calls are
        only comparable when the columns contain the same set of values.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.items() is the supported spelling; iteritems() was
            # removed in pandas 2.0 and raises AttributeError there.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Convenience wrapper: ``fit`` followed by ``transform`` on the same frame."""
        return self.fit(X, y).transform(X)
    
    
        


In [34]:
# Label-encode the categorical columns and the binned purchase amount:
# ['Gender', 'Age', 'City_Category', 'Purchase']
columns_to_encode = ['Gender', 'Age', 'City_Category', 'Purchase']
encoder = MultiColumnLabelEncoder(columns=columns_to_encode)
X = encoder.fit_transform(df)

In [35]:
# Persist the encoded frame, then preview its first five rows
cleaned_path = 'Cleaned_Data.csv'
X.to_csv(cleaned_path)
X.head(n=5)

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Purchase
0,0,0,10,0,2,0,1
1,0,0,10,0,2,0,3
2,0,0,10,0,2,0,0
3,0,0,10,0,2,0,0
4,1,6,16,2,4,0,1


## Classification Models
The classification task predicts gender (male/female) from the provided features: age range, occupation, city category, years in current city, marital status, and purchase amount.

After splitting the data, we fit several classification models and compare their evaluation metrics: confusion matrix, accuracy, precision, recall, and F-score.

In [36]:
# The modified data is already in memory as `X` (also saved to 'Cleaned_Data.csv'), so nothing is re-imported here


In [91]:
# Features by column position: Purchase (6), Age (1), Occupation (2),
# City_Category (3), Stay_In_Current_City_Years (4), Marital_Status (5).
# Target: Gender (0).
feature_positions = [6, 1, 2, 3, 4, 5]
target_position = 0
X_data = X.iloc[:, feature_positions].values
Y_data = X.iloc[:, target_position].values

In [92]:
# train_test_split lives in sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train,X_test,y_train,y_test = train_test_split(X_data,Y_data,test_size=0.20, random_state=0)

### Using K Nearest Neighbors K-NN model 

In [93]:
# Import the KNN classifier from sklearn
from sklearn.neighbors import KNeighborsClassifier

# KNN classifier with 5 neighbours; the Minkowski metric with p=2 is Euclidean distance
knn_params = {'n_neighbors': 5, 'metric': 'minkowski', 'p': 2}
classifier_knn = KNeighborsClassifier(**knn_params)
classifier_knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [94]:
# Predict gender for the held-out test split with the KNN model
y_predict = classifier_knn.predict(X=X_test)

In [95]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the KNN predictions against the true test labels
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [96]:
# Show the KNN confusion matrix
knn_cm_report = f'Confusion Matrix is:\n{cm}'
print(knn_cm_report)

Confusion Matrix is:
[[15965 10726]
 [ 6756 74069]]


In [97]:
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label (KNN)
accuracy = accuracy_score(y_true=y_test, y_pred=y_predict, normalize=True)
print (f'The accuracy of the KNN model is : {accuracy}')

The accuracy of the KNN model is : 0.8374009449756316


In [98]:
from sklearn.metrics import precision_recall_fscore_support

# Macro-averaged precision, recall and F-score for the KNN predictions
pre_re_F = precision_recall_fscore_support(y_true=y_test, y_pred=y_predict, average='macro')

print (f'The preceision of KNN model is: {pre_re_F[0]}\n')
print (f'The recall of KNN model is: {pre_re_F[1]}\n')
print (f'The F-score of KNN model is: {pre_re_F[2]}')

The preceision of KNN model is: 0.7880803125484793

The recall of KNN model is: 0.7572768484699561

The F-score of KNN model is: 0.7703222095685376


### Using Decision Tree to predict Gender

In [99]:
# Import the Decision Tree classifier from sklearn
from sklearn.tree import DecisionTreeClassifier

# Decision tree that splits on entropy; random_state is fixed for repeatable results
dt_params = {'criterion': 'entropy', 'random_state': 0}
classifier_DT = DecisionTreeClassifier(**dt_params)

# Train on the training split
classifier_DT.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [100]:
# Predict gender for the held-out test split with the Decision Tree
y_predict = classifier_DT.predict(X=X_test)

# Confusion matrix of the Decision Tree predictions against the true labels
cm_DT = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [101]:
# Show the Decision Tree confusion matrix
dt_cm_report = f'Confusion Matrix is:\n{cm_DT}'
print(dt_cm_report)

Confusion Matrix is:
[[15009 11682]
 [ 3941 76884]]


In [102]:
# Accuracy of the Decision Tree model on the test split
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_true=y_test, y_pred=y_predict, normalize=True)
print (f'The accuracy of the Decision Tree model is : {accuracy}')

The accuracy of the Decision Tree model is : 0.8546913947691507


In [103]:
# Macro-averaged precision, recall and F-score for the Decision Tree predictions
from sklearn.metrics import precision_recall_fscore_support

pre_re_F = precision_recall_fscore_support(y_true=y_test, y_pred=y_predict, average='macro')

print (f'The preceision of Decision Tree model is: {pre_re_F[0]}\n')
print (f'The recall of Decision Tree model is: {pre_re_F[1]}\n')
print (f'The F-score of Decision Tree model is: {pre_re_F[2]}')

The preceision of Decision Tree model is: 0.8300650147942084

The recall of Decision Tree model is: 0.7567823565296079

The F-score of Decision Tree model is: 0.782733860225489


### Using Random Forest to predict Gender

In [104]:
# Import the Random Forest classifier from sklearn
from sklearn.ensemble import RandomForestClassifier

# Forest of 10 entropy-criterion trees; random_state is fixed for repeatable results
rf_params = {'n_estimators': 10, 'criterion': 'entropy', 'random_state': 0}
classifier_RF = RandomForestClassifier(**rf_params)

# Train on the training split
classifier_RF.fit(X_train, y_train)

# Predict gender for the held-out test split
y_predict = classifier_RF.predict(X=X_test)

# Confusion matrix of the Random Forest predictions against the true labels
cm_RF = confusion_matrix(y_true=y_test, y_pred=y_predict)

rf_cm_report = f'Confusion Matrix is:\n{cm_RF}'
print(rf_cm_report)

Confusion Matrix is:
[[14736 11955]
 [ 3767 77058]]


In [105]:
# Accuracy of the Random Forest model on the test split.
# y_predict holds the Random Forest predictions at this point, so the
# message names that model (it previously said "Decision Tree").
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_predict, normalize=True)
print (f'The accuracy of the Random Forest model is : {accuracy}')

The accuracy of the Decision Tree model is : 0.8537706015848804


In [106]:
# Calculate macro-averaged precision, recall and F-score for Random Forest.
# y_predict holds the Random Forest predictions at this point, so the
# messages name that model (they previously said "Decision Tree").
from sklearn.metrics import precision_recall_fscore_support
pre_re_F = precision_recall_fscore_support(y_test, y_predict, average='macro')

print (f'The preceision of Random Forest model is: {pre_re_F[0]}\n')
print (f'The recall of Random Forest model is: {pre_re_F[1]}\n')
print (f'The F-score of Random Forest model is: {pre_re_F[2]}')

The preceision of Decision Tree model is: 0.8310525839068428

The recall of Decision Tree model is: 0.7527446727595373

The F-score of Decision Tree model is: 0.7797756979230055
