In [10]:
# import pandas in order to read from file, similar to C++'s iostream:
# converts the provided file into a panda object, which allows for additional data manipulation
import pandas as pd
dataset = pd.read_csv("iphone_purchase_records.csv")

# 'iloc' determines the selection of data from the dataset
# 'values' returns a numpy equivalent of the dataframe
X = dataset.iloc[:,:-1].values    # stores all data aside from "Purchased Iphone" at the rightmost side
y = dataset.iloc[:, 3].values     # stores the data of "Purchased Iphone"

In [2]:
from sklearn.preprocessing import LabelEncoder

# the documentation indicates that LabelEncoder() "Encode target labels with value between 0 and n_classes-1."
# what might n_classes be in this case?
# Further reading may be found here: https://scikit-learn.org/stable/modules/preprocessing_targets.html#preprocessing-targets
# though, for now the finer details will be ignored in favor of understanding for the bigger picture
labelEncoder_gender =  LabelEncoder()

# https://stackoverflow.com/questions/45704226/what-does-the-fit-method-in-scikit-learn-do
# The following trains the model on the given data (in this case, the genders of the data), 
# though how will remain as a blackbox until I have attained a better understandin of ML
X[:,0] = labelEncoder_gender.fit_transform(X[:,0])

In [3]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()      # "Standardize features by removing the mean and scaling to unit variance."
X = sc.fit_transform(X)

In [8]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# creates an array for the algorithms
classification_models = []
classification_models.append(('Logistic Regression', LogisticRegression(solver="liblinear")))
classification_models.append(('K Nearest Neighbor', KNeighborsClassifier(n_neighbors=5, metric="minkowski",p=2)))
classification_models.append(('Kernel SVM', SVC(kernel = 'rbf',gamma='scale')))
classification_models.append(('Naive Bayes', GaussianNB()))
classification_models.append(('Decision Tree', DecisionTreeClassifier(criterion = "entropy")))
classification_models.append(('Random Forest', RandomForestClassifier(n_estimators=100, criterion="entropy")))

# this is the meat of the model, this is where your current level of knowledge ends; come back later!
for name, model in classification_models:
  kfold = KFold(n_splits=10, shuffle=True, random_state=7)
  result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
  print("%s: Mean Accuracy = %.2f%% - SD Accuracy = %.2f%%" % (name, result.mean()*100, result.std()*100))

Logistic Regression: Mean Accuracy = 95.00% - SD Accuracy = 15.00%
K Nearest Neighbor: Mean Accuracy = 75.00% - SD Accuracy = 33.54%
Kernel SVM: Mean Accuracy = 95.00% - SD Accuracy = 15.00%
Naive Bayes: Mean Accuracy = 90.00% - SD Accuracy = 20.00%
Decision Tree: Mean Accuracy = 90.00% - SD Accuracy = 20.00%
Random Forest: Mean Accuracy = 95.00% - SD Accuracy = 15.00%
