# Preprocessing

Importing general libraries

In [None]:
# data analysis
import pandas as pd
import numpy as np

# visualisation
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# metrics and algorithm validation
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.model_selection import KFold, cross_val_score

Process the data

In [None]:
# read the raw CSV into a DataFrame
df = pd.read_csv('Social_Network_Ads.csv')
# features: every column except the first and the last; target: the last column
X, y = df.iloc[:, 1:-1].values, df.iloc[:, -1].values

Fill missing values

In [None]:
# Fill missing values in the numeric columns with the column mean.
# SimpleImputer replaces the Imputer class that used to live in sklearn.preprocessing.
from sklearn.impute import SimpleImputer

# Index of the first numeric column in X -- adjust for your dataset.
# NOTE(review): assumes column 0 is the categorical column (it is label-encoded
# in the next step) and every later column is numeric -- confirm against the CSV.
first_numeric_col = 1

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# Assign the result back into X so the imputed values are actually used;
# `imputer` stays bound to the fitted transformer.
X[:, first_numeric_col:] = imputer.fit_transform(X[:, first_numeric_col:].astype(float))

Encoding data

In [None]:
# replace the categorical values in the first feature column with integer codes
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
# learn the set of categories first, then map the column onto their codes
encoder.fit(X[:, 0])
X[:, 0] = encoder.transform(X[:, 0])

In [None]:
# Create dummy variables for column 0 and pass every other column through unchanged.
# OneHotEncoder no longer accepts `categorical_features`; ColumnTransformer is the
# supported way to encode only a subset of the columns.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

hot_encoder = ColumnTransformer(
    transformers=[('one_hot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,  # always return a dense array
)
# encoded columns come first, followed by the untouched columns
X = np.asarray(hot_encoder.fit_transform(X), dtype=float)
# drop the first dummy column; that category becomes the baseline
X = X[:, 1:]

Backward elimination

In [None]:
# OLS is exposed by statsmodels.api; statsmodels.formula.api is the formula-string interface
import statsmodels.api as sm

# OLS does not add an intercept on its own, so prepend a column of ones.
# The column is sized from X so it works for any number of rows.
X = np.append(arr=np.ones((X.shape[0], 1)).astype(int), values=X, axis=1)

# Backward elimination: repeat the three statements below, each time removing one
# column index from the list, until every remaining p-value in the summary is
# within the desired significance level.
X_opt = X[:, list(range(X.shape[1]))]  # start with every column of X
# ordinary least squares regressor
regressor_OLS = sm.OLS(y, X_opt).fit()
regressor_OLS.summary()

Preparing for algorithm application

In [None]:
# splitting the train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# Standardise the features.
# Needed for K-NN, K-means, logistic regression, SVMs, perceptrons, neural networks, LDA, PCA.
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
# learn the scaling statistics from the training set only ...
sc_X.fit(X_train)
# ... and apply that same scaling to both sets
X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

PCA

In [None]:
from sklearn.decomposition import PCA

# Leave n_components as None on the first pass to explore the variance, then
# rerun with the number of components chosen from the output below.
pca = PCA(n_components=None)

# fit on the training set, then apply the same projection to the test set
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# share of the total variance explained by each principal component
explained_variance = pca.explained_variance_ratio_
# running total of those shares -- use it to pick how many components to keep
cumulative_variance = np.cumsum(explained_variance)
print('Cumulative explained variance:', cumulative_variance)
explained_variance

Kernel PCA

In [None]:
# Kernel PCA: reduce dimensionality using an RBF kernel.
from sklearn.decomposition import KernelPCA
# n_components=None sets no explicit number of components here; choose a value once you know how many to keep
kpca = KernelPCA(n_components=None,kernel='rbf')

# fit on the training set and transform it, then apply the same fitted transform to the test set
X_train = kpca.fit_transform(X_train)
X_test = kpca.transform(X_test)

LDA

In [None]:
# applying LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# LDA can produce at most min(n_features, n_classes - 1) components, so cap the
# requested 2 at that limit (a two-class target allows only one component).
max_lda_components = min(X_train.shape[1], len(np.unique(y_train)) - 1)
lda = LinearDiscriminantAnalysis(n_components=min(2, max_lda_components))

# LDA is supervised: fitting needs the training labels; the test set is only transformed
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)

# Algorithm application

# Regression

In [None]:
# importing libraries
from sklearn.linear_model import LinearRegression as LR
from sklearn.preprocessing import PolynomialFeatures as Poly
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor as RT
from sklearn.ensemble import RandomForestRegressor as RF

In [None]:
# cross validation: score each regressor with 10-fold CV (R^2) on the training set
from sklearn.pipeline import make_pipeline

# (label, estimator) pairs; the hyperparameters shown are starting points to tune
algorithms = [
    ('LR', LR()),
    # PolynomialFeatures is only a transformer, so it has to feed a regressor
    ('Poly', make_pipeline(Poly(degree=2), LR())),
    ('SVR', SVR(kernel='rbf')),
    ('CART', RT()),
    ('RF', RF()),
]

results = []
names = []
scoring = 'r2'

for name, model in algorithms:
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # label: mean score (standard deviation across the folds)
    msg = '%s: %f (%f)' % (name, cv_results.mean(), cv_results.std())
    print(msg)

# one box per algorithm showing the spread of its fold scores
fig = plt.figure(figsize=(22, 5))
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)

# Classification

In [None]:
# importing libraries
from sklearn.linear_model import LogisticRegression as LGR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.tree import DecisionTreeClassifier as CT
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.ensemble import AdaBoostClassifier as ADA
from sklearn.ensemble import GradientBoostingClassifier as GB

In [None]:
# cross validation: score each classifier with 10-fold CV (accuracy) on the training set

# (label, estimator) pairs to compare
algorithms = [
    ('LGR', LGR()),
    ('KNN', KNN()),
    ('SVC', SVC(kernel='rbf')),
    ('NB', NB()),
    ('CART', CT()),
    ('RFC', RFC()),
    ('ADA', ADA()),
    ('GB', GB()),
]

scoring = 'accuracy'
results, names = [], []

for name, model in algorithms:
    cv_results = cross_val_score(model, X_train, y_train, cv=10, scoring=scoring)
    names.append(name)
    results.append(cv_results)
    # label: mean score (standard deviation across the folds)
    msg = f'{name}: {cv_results.mean():f} ({cv_results.std():f})'
    print(msg)

# one box per algorithm showing the spread of its fold scores
fig = plt.figure(figsize=(22, 10))
fig.suptitle('Algorithm Comparison', size='xx-large')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names, size='xx-large')

# Grid Search

In [None]:
# finding the best model and the best hyperparams
# GridSearchCV is imported from sklearn.model_selection, the same module the other
# validation helpers in this notebook come from (sklearn.grid_search was removed).
from sklearn.model_selection import GridSearchCV

# The combinations of hyperparameter values to search over.
# These grids are for an SVC: one with a linear kernel and one with an RBF kernel,
# so the winning kernel also hints at whether the problem is linear or non-linear.
params = [
    {'C': [0.01, 0.03, 0.1, 0.3, 1, 10, 100, 1000],
     'kernel': ['linear']},
    {'C': [0.01, 0.03, 0.1, 0.3, 1, 10, 100, 1000],
     'kernel': ['rbf'],
     'gamma': [0.001, 0.01, 0.1, 0.3, 0.5, 0.7, 1]},
]

# putting it all together
# `classifier` must already hold the estimator to tune (an SVC for the grids above)
# n_jobs = -1 for large datasets
grid_search = GridSearchCV(estimator=classifier, param_grid=params, scoring='accuracy', cv=10, n_jobs=-1)

# fitting
grid_search = grid_search.fit(X_train, y_train)

# best params
# after finding out the best params, you can reassess and try new params
print('Best params: ', grid_search.best_params_)

# best accuracy - mean of the 10 accuracies through 10 CV combinations
print('Best accuracy: ', grid_search.best_score_)

# Visualise results (2D)

If the data has more than two features, use PCA or LDA to reduce it to two dimensions before running this cell. `classifier` must already be trained on those same two features.

In [None]:
from matplotlib.colors import ListedColormap

# Plot the regions predicted by a fitted two-feature `classifier`, with the
# training points drawn on top.
X_set, y_set = X_train, y_train
# the i-th class (in sorted order) gets the i-th colour
class_colors = ListedColormap(('red', 'green'))

plt.figure(figsize=(15, 8))
# grid covering the range of both features plus a margin of 1, in steps of 0.01
X1, X2 = np.meshgrid(np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01),
                     np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01))

# predict a class for every grid point and fill the plane with the result
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
             alpha=0.75, cmap=class_colors)

plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

for i, j in enumerate(np.unique(y_set)):
    # Use `color=` rather than `c=`: a single RGBA tuple passed as `c` can be
    # mistaken for a sequence of values to colour-map, and matplotlib warns about it.
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1],
                color=class_colors(i), label=j)

plt.title('Title')
plt.xlabel('Variable 1')
plt.ylabel('Variable 2')
plt.legend(bbox_to_anchor=(1.05, 1))