In [1]:
import numpy as np
import matplotlib.pyplot as plt  
import pandas as pd 

In [2]:
# Load the mall-customer data from a CSV file in the working directory.
df = pd.read_csv('Mall_Customers.csv') 

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


# Preprocessing

In [4]:
# Count the missing (null) values in each column.
df.isnull().sum()

CustomerID                0
Genre                     0
Age                       0
Annual Income (k$)        0
Spending Score (1-100)    0
dtype: int64

In [5]:
# Distinct values present in the 'Genre' column.
df['Genre'].unique()

array(['Male', 'Female'], dtype=object)

In [6]:
# Distinct values present in the 'Age' column.
df['Age'].unique()

array([19, 21, 20, 23, 31, 22, 35, 64, 30, 67, 58, 24, 37, 52, 25, 46, 54,
       29, 45, 40, 60, 53, 18, 49, 42, 36, 65, 48, 50, 27, 33, 59, 47, 51,
       69, 70, 63, 43, 68, 32, 26, 57, 38, 55, 34, 66, 39, 44, 28, 56, 41])

In [7]:
# Distinct values present in the 'Annual Income (k$)' column.
df['Annual Income (k$)'].unique()

array([ 15,  16,  17,  18,  19,  20,  21,  23,  24,  25,  28,  29,  30,
        33,  34,  37,  38,  39,  40,  42,  43,  44,  46,  47,  48,  49,
        50,  54,  57,  58,  59,  60,  61,  62,  63,  64,  65,  67,  69,
        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  81,  85,  86,
        87,  88,  93,  97,  98,  99, 101, 103, 113, 120, 126, 137])

In [8]:
# Distinct values present in the 'Spending Score (1-100)' column.
df['Spending Score (1-100)'].unique()

array([39, 81,  6, 77, 40, 76, 94,  3, 72, 14, 99, 15, 13, 79, 35, 66, 29,
       98, 73,  5, 82, 32, 61, 31, 87,  4, 92, 17, 26, 75, 36, 28, 65, 55,
       47, 42, 52, 60, 54, 45, 41, 50, 46, 51, 56, 59, 48, 49, 53, 44, 57,
       58, 43, 91, 95, 11,  9, 34, 71, 88,  7, 10, 93, 12, 97, 74, 22, 90,
       20, 16, 89,  1, 78, 83, 27, 63, 86, 69, 24, 68, 85, 23,  8, 18])

In [9]:
# Summarise each column's dtype and non-null count, plus total memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Genre                   200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB


In [10]:
# Pairwise correlations between the numeric columns only. The frame also holds
# the string column 'Genre'; recent pandas releases no longer drop non-numeric
# columns silently inside DataFrame.corr() and raise instead, so the numeric
# columns are selected explicitly before correlating.
Correlation = df.select_dtypes(include='number').corr()
# Rank the columns by the absolute strength of their correlation with the target.
Correlation['Spending Score (1-100)'].abs().sort_values(ascending = False)

Spending Score (1-100)    1.000000
Age                       0.327227
CustomerID                0.013835
Annual Income (k$)        0.009903
Name: Spending Score (1-100), dtype: float64

In [11]:
# In the correlation ranking above, Age has the strongest absolute correlation
# with the spending score (~0.327) and Annual Income the weakest (~0.010); both
# are used as predictors here.
# Note: the double brackets make y a 2-D column array of shape (n, 1), not a
# flat vector.
x = df[['Age','Annual Income (k$)']].values #independent variable
y = df[['Spending Score (1-100)']].values #dependent variable

In [12]:
# Standardise the features with StandardScaler (used here instead of min-max
# scaling).
# NOTE(review): the scaler is fitted on the full feature matrix before the
# train/test split, so the test rows influence the scaling statistics. Consider
# fitting on the training rows only and calling transform() on the test rows.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x = sc.fit_transform(x)

In [13]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=42)

In [14]:
# Show the scaled training features produced by the split.
print(x_train)

[[ 0.7284319  -0.25039146]
 [-0.49160182  2.49780745]
 [-0.20453507 -0.89927175]
 [ 1.08726535 -1.24279661]
 [ 0.08253169  0.32214998]
 [-0.27630176  2.26879087]
 [-1.13750203 -1.20462718]
 [-0.27630176 -1.43364376]
 [-0.49160182  0.58933599]
 [ 1.15903204 -0.13588317]
 [-1.49633548  0.16947227]
 [-0.20453507  1.00919971]
 [-1.20926872 -1.66266033]
 [ 0.29783176  0.39848884]
 [ 0.7284319   0.16947227]
 [ 1.51786549 -0.40306917]
 [ 1.30256542  0.55116656]
 [ 0.65666521  0.01679455]
 [-0.20453507  1.00919971]
 [ 0.58489852 -0.02137488]
 [-0.49160182  1.00919971]
 [-0.85043527 -0.02137488]
 [-1.28103541 -1.16645776]
 [ 1.37433211 -1.54815205]
 [-1.28103541 -1.05194947]
 [ 0.80019859  0.24581112]
 [ 0.65666521 -0.82293289]
 [-0.20453507  1.61991057]
 [ 2.091999   -0.05954431]
 [ 0.36959845  0.4748277 ]
 [-0.41983513 -0.70842461]
 [ 0.08253169  0.39848884]
 [-1.42456879  0.78018313]
 [-1.06573534 -0.86110232]
 [ 0.87196528  0.24581112]
 [-1.42456879  0.13130284]
 [ 0.44136514 -1.24279661]
 

In [15]:
# Show the training targets produced by the split.
print(y_train)

[[42]
 [74]
 [26]
 [14]
 [58]
 [79]
 [87]
 [98]
 [87]
 [58]
 [48]
 [92]
 [76]
 [35]
 [59]
 [56]
 [ 5]
 [42]
 [10]
 [47]
 [63]
 [50]
 [73]
 [15]
 [81]
 [57]
 [36]
 [85]
 [55]
 [ 7]
 [60]
 [95]
 [ 5]
 [92]
 [43]
 [46]
 [32]
 [72]
 [41]
 [54]
 [ 6]
 [48]
 [55]
 [68]
 [69]
 [46]
 [83]
 [17]
 [23]
 [55]
 [ 5]
 [93]
 [48]
 [92]
 [99]
 [16]
 [73]
 [ 6]
 [61]
 [56]
 [40]
 [ 4]
 [40]
 [97]
 [48]
 [12]
 [14]
 [52]
 [42]
 [36]
 [20]
 [39]
 [18]
 [76]
 [55]
 [91]
 [51]
 [28]
 [93]
 [31]
 [35]
 [43]
 [89]
 [ 1]
 [82]
 [73]
 [39]
 [74]
 [55]
 [75]
 [27]
 [47]
 [42]
 [16]
 [61]
 [10]
 [77]
 [42]
 [60]
 [71]
 [32]
 [14]
 [97]
 [42]
 [51]
 [14]
 [94]
 [52]
 [41]
 [44]
 [15]
 [86]
 [46]
 [ 3]
 [77]
 [46]
 [75]
 [75]
 [66]
 [49]
 [86]
 [ 5]
 [95]
 [88]
 [59]
 [45]
 [46]
 [52]
 [28]
 [51]
 [42]
 [60]
 [73]
 [46]
 [35]
 [ 8]
 [75]
 [73]
 [78]
 [91]
 [81]
 [54]
 [90]
 [ 9]
 [88]
 [55]
 [49]
 [43]
 [55]
 [47]
 [40]
 [83]
 [35]
 [17]
 [42]
 [50]
 [13]
 [49]
 [90]
 [59]]


In [18]:
# Prepend a column of ones to each feature matrix so the first coefficient acts
# as the intercept. The results are bound to new upper-case names (X_train,
# X_test) instead of overwriting x_test: the prediction cell reads X_test, and
# leaving the inputs untouched lets this cell be re-run without stacking an
# extra ones column each time.
X_train = np.c_[np.ones(len(x_train)), x_train]
X_test = np.c_[np.ones(len(x_test)), x_test]

In [19]:
def cost_function(X, Y, B):
    """Return the halved mean squared error of the linear model ``X.dot(B)``.

    Computes ``J = sum((X.dot(B) - Y) ** 2) / (2 * m)`` with ``m = len(Y)``.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (include a column of ones if an intercept is wanted).
    Y : array-like, shape (m,) or (m, 1)
        Observed targets.
    B : array-like, shape (n,) or (n, 1)
        Model coefficients.

    Returns
    -------
    float
        The cost ``J``.
    """
    m = len(Y)
    predictions = np.asarray(X.dot(B))
    targets = np.asarray(Y)
    # A 1-D vector minus an (m, 1) column broadcasts to an (m, m) matrix and
    # silently inflates the cost. When both sides hold the same number of
    # elements but differ in shape, flatten them so the subtraction is
    # element-wise.
    if predictions.shape != targets.shape and predictions.size == targets.size:
        predictions = predictions.ravel()
        targets = targets.ravel()
    J = np.sum((predictions - targets) ** 2) / (2 * m)
    return J

In [20]:
def batch_gradient_descent(X, Y, B, alpha, iterations):
    """Fit linear-regression coefficients with batch gradient descent.

    Each iteration computes the predictions ``X.dot(B)``, the gradient
    ``X.T.dot(predictions - Y) / m`` and moves ``B`` one step of size
    ``alpha`` against that gradient, recording the cost after the step.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (include a column of ones if an intercept is wanted).
    Y : array-like, shape (m,) or (m, 1)
        Observed targets.
    B : array-like, shape (n,) or (n, 1)
        Initial coefficients.
    alpha : float
        Learning rate.
    iterations : int
        Number of gradient steps to take.

    Returns
    -------
    tuple
        ``(B, cost_history)``: the updated coefficients, in the same layout as
        the ``B`` passed in, and a list with the cost after every iteration.
    """
    Y = np.asarray(Y)
    B = np.asarray(B)
    # Give Y the same layout as the predictions X.dot(B). Mixing a 1-D vector
    # with an (m, 1) column would broadcast the residual to an (m, m) matrix,
    # which corrupts the gradient and turns B into an (n, m) matrix.
    if B.ndim == 1 and Y.ndim == 2 and Y.shape[1] == 1:
        Y = Y.ravel()
    elif B.ndim == 2 and B.shape[1] == 1 and Y.ndim == 1:
        Y = Y.reshape(-1, 1)

    cost_history = [0] * iterations
    m = len(Y)

    for iteration in range(iterations):
        # Hypothesis values
        h = X.dot(B)
        # Difference between hypothesis and actual Y
        loss = h - Y
        # Gradient of the cost with respect to B
        gradient = X.T.dot(loss) / m
        # Step B against the gradient
        B = B - alpha * gradient
        # Cost after this step
        cost_history[iteration] = cost_function(X, Y, B)

    return B, cost_history

ValueError: ignored

In [None]:
# Compute predictions as the product of X_test and the coefficient vector newB,
# then print the first five predictions next to the first five test targets.
# NOTE(review): `newB` is not assigned in any visible cell. Presumably it is the
# coefficient vector returned by batch_gradient_descent in a training cell that
# is missing here; confirm before running.
y_pred = X_test.dot(newB)
print (y_pred[:5])
print (y_test[:5])

In [None]:
# Score the predictions against the test targets with mean squared error and R^2.
from sklearn.metrics import mean_squared_error, r2_score
MSE = mean_squared_error(y_test, y_pred)
score = r2_score(y_test, y_pred)
print ("MSE :", MSE)
print ("R2 :", score)