In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's 'whitegrid' style to subsequent plots
sns.set_style('whitegrid')

# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
# Read the Social Network Ads CSV into a DataFrame and preview its first rows.
# The path is relative to the notebook's working directory.
dataset_path = '../archive/Social_Network_Ads.csv'
df = pd.read_csv(dataset_path)

df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [3]:
# Setting up the X and y values
# Target: the 'Purchased' column of the loaded frame.
y = df['Purchased']

# Fixing the categorical data
# One-hot encode Gender and keep only the 'Female' indicator column.
# Cast explicitly to int so the column holds 0/1 integers regardless of the
# dtype get_dummies produces (uint8 or bool, depending on the pandas version).
gender_dummies = pd.get_dummies(df['Gender'])['Female'].astype(int)

# Features: every column except the identifier, the target and the raw Gender
# text, plus the numeric Female indicator appended as the last column.
# Built in one expression (no inplace mutation) so re-running the cell is safe.
Xs = pd.concat(
    [df.drop(['User ID', 'Purchased', 'Gender'], axis=1), gender_dummies],
    axis=1,
)
Xs.head()

Unnamed: 0,Age,EstimatedSalary,Female
0,19,19000,0
1,35,20000,0
2,26,43000,1
3,27,57000,1
4,19,76000,0


In [4]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.

from sklearn.model_selection import train_test_split

split_parts = train_test_split(Xs, y, random_state=0, test_size=0.25)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, so nothing
# about the test set leaks into the preprocessing step.
sc_X = StandardScaler().fit(X_train)

# Apply the already-fitted scaler to both splits. The test set is only
# transformed (never fitted) so it is scaled with the training statistics.
# NOTE: X_train / X_test are overwritten here, so re-running this cell without
# re-running the split would scale data that has already been scaled.
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [6]:
# Train a random forest on the scaled training data.

from sklearn.ensemble import RandomForestClassifier

# 10 trees, entropy split criterion, depth capped at 4; the fixed
# random_state keeps the fitted forest repeatable across runs.
forest_params = dict(
    n_estimators=10,
    criterion='entropy',
    max_depth=4,
    random_state=0,
)
classifier = RandomForestClassifier(**forest_params)

# fit() returns the estimator itself, so the cell displays its configuration.
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=4, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [7]:
# Predicting the values of the testing set
# Uses the fitted classifier on X_test; the predictions are stored in y_pred
# for the evaluation step.

y_pred = classifier.predict(X_test)

In [8]:
# Checking for the results
from sklearn.metrics import confusion_matrix

# Pin the label order to [0, 1] so the matrix is always 2x2, even if one of
# the classes happens to be absent from y_test and y_pred. Without this the
# four-way unpack below raises ValueError on a single-class result.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Rows are true labels and columns are predicted labels, so flattening the
# 2x2 matrix row by row gives: tn, fp, fn, tp.
tn, fp, fn, tp = cm.ravel()

print(
    "true pos: {0}\n"
    "false pos: {1}\n"
    "true neg: {2}\n"
    "false neg: {3}\n".format(tp, fp, tn, fn))


true pos: 28
false pos: 4
true neg: 64
false neg: 4

