## Using Decision Tree

In [1]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# load data 
file_path = 'http://iali.in/datasets/Social_Network_Ads.csv'
purchase_data = pd.read_csv(file_path)

In [3]:
# sample data 
purchase_data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [4]:
# statistics of given data
purchase_data.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [5]:
#features
featurte_list = purchase_data.columns
featurte_list

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [6]:
X = purchase_data[featurte_list[:-1]]

In [7]:
X.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,Male,19,19000
1,15810944,Male,35,20000
2,15668575,Female,26,43000
3,15603246,Female,27,57000
4,15804002,Male,19,76000


In [8]:
y = purchase_data[featurte_list[-1]]

In [9]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Purchased, dtype: int64

In [10]:
duplicates = pd.get_dummies(X.Gender)
print (duplicates.head(5))

   Female  Male
0       0     1
1       0     1
2       1     0
3       1     0
4       0     1


In [11]:
X = pd.concat( [X, duplicates] , axis = 1)
new_feature_list = ['User ID','EstimatedSalary','Age','Female','Male']
X = X[new_feature_list]
X.head()

Unnamed: 0,User ID,EstimatedSalary,Age,Female,Male
0,15624510,19000,19,0,1
1,15810944,20000,35,0,1
2,15668575,43000,26,1,0
3,15603246,57000,27,1,0
4,15804002,76000,19,0,1


In [12]:
#split data
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state = 42)

In [13]:
# define a model of decision tree type
model = DecisionTreeClassifier()

# fit the training data in model
model.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [14]:
#predicitng on test data
preds = model.predict(X_test)
preds.size

80

In [15]:
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': preds})
comparison

Unnamed: 0,Actual,Predicted
209,0,1
280,1,1
33,0,0
210,1,1
93,0,0
...,...,...
246,0,0
227,1,1
369,1,1
176,0,0


In [16]:
accuracy_score(y_test,preds)

0.8875

## Using SVM

In [17]:
#import libraries
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [18]:
# load data 
file_path = 'http://iali.in/datasets/Social_Network_Ads.csv'
purchase_data = pd.read_csv(file_path)

In [19]:
# sample data 
purchase_data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [20]:
#features
featurte_list = purchase_data.columns
featurte_list

Index(['User ID', 'Gender', 'Age', 'EstimatedSalary', 'Purchased'], dtype='object')

In [21]:
X = purchase_data[featurte_list[:-1]]
y = purchase_data[featurte_list[-1]]
X.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary
0,15624510,Male,19,19000
1,15810944,Male,35,20000
2,15668575,Female,26,43000
3,15603246,Female,27,57000
4,15804002,Male,19,76000


In [22]:
X = pd.get_dummies(X,prefix_sep='_',drop_first= True)
X.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gender_Male
0,15624510,19,19000,1
1,15810944,35,20000,1
2,15668575,26,43000,0
3,15603246,27,57000,0
4,15804002,19,76000,1


In [23]:
X_train,X_test,y_train,y_test=train_test_split(X,y, test_size=0.2, random_state = 42)

model = svm.SVC(gamma=10, C=100)

model.fit(X_train,y_train)

SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=10, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [24]:
preds = model.predict(X_test)

In [25]:
accuracy_score(y_test,preds)

0.65

### Hence, Decision trees performed better than SVM
#### Decison tree accuracy score :- 0.8875
#### SVM accuracy score :- 0.65