In [None]:
# Imports - Basics

import pandas as pd
import numpy as np


In [None]:
# Reading Data from Source

In [None]:
# Read CSV

# Load a header-less CSV straight from a URL: the column names are supplied here,
# the first column becomes the index, and the rows end up ordered by feature_2.
# NOTE(review): this URL carries an access token in its query string - confirm it
# is meant to be shared and is still valid.
url = 'https://raw.githubusercontent.com/upxacademy/ML_with_Python/master/Datasets/glass.data?token=AH0Y7F-9aM-g_uOfpoUhYRF3xm7oJjrzks5ZFEl1wA%3D%3D'
col_names = [
    'index_col_name',
    'feature_1',
    'feature_2',
    'feature_3',
    'feature_4',
    'label_col_name',
]
dataframe = (
    pd.read_csv(url, names=col_names, index_col='index_col_name')
    .sort_values(by='feature_2')
)
#dataframe.head()

# Outer join of 2 sets
# No `on=` is given, so pandas joins on the columns the two frames have in common.
dataframe_1, dataframe_2 = (
    pd.read_csv(csv_path, delimiter=";")
    for csv_path in ("student-mat.csv", "student-por.csv")
)
dataframe_combined = dataframe_1.merge(dataframe_2, how="outer")


# OR read the csv from URL and save as local file
import urllib.request

# `url` is rebound here to the second data source.
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/poker/poker-hand-training-true.data"
urllib.request.urlretrieve(url, "poker_train.csv")


In [None]:
# Read ARFF

from scipy.io.arff import loadarff

# Only the first element of what loadarff returns (the data records) is used below.
data_raw = loadarff("PhishingData.arff")
arff_records = data_raw[0]
# Go through a NumPy array first, for flexibility in cleaning
numpy_data_array = np.array(arff_records)
# Wrap the array in a pandas DataFrame, then cast every column to a numeric type
raw_frame = pd.DataFrame(numpy_data_array)
dataframe = raw_frame.apply(pd.to_numeric)


In [None]:
# Read from URL

import requests

http_request = requests.get('https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv')
# Fail loudly on an HTTP error instead of parsing an error page as if it were data.
http_request.raise_for_status()

# splitlines() copes with both \n and \r\n endings. Blank lines are dropped because the
# file's trailing newline would otherwise become an extra, all-NaN row in the frame.
http_text = [line for line in http_request.text.splitlines() if line.strip()]
# The file is ';'-separated: one list of cell strings per line.
data_array = [line.split(";") for line in http_text]
# The first row is the header; its names are wrapped in double quotes.
col_names = [col.strip('"') for col in data_array[0]]

# Every cell is still a string at this point, so cast each column to a numeric type.
dataframe = pd.DataFrame(data=data_array[1:], columns=col_names).apply(pd.to_numeric)


In [None]:
# Massage of Data

In [None]:
# Handle Categorical Feature - Create dummy columns

# Columns whose dtype is `object` are the ones that get dummy-encoded.
is_object_col = original_dataframe.dtypes == object
col_str = original_dataframe.columns[is_object_col]
# drop_first=True leaves out the first dummy level of every encoded column.
final_dataframe = pd.get_dummies(
    original_dataframe,
    columns=col_str,
    drop_first=True,
)

# step by step
# ============
#dummies = pd.get_dummies(original_dataframe.target_col_name, prefix='target_col_name_prefix')
#dummies.drop(dummies.columns[0], axis=1, inplace=True)
#original_dataframe = pd.concat([original_dataframe, dummies], axis=1)

# Simple Way
# ==========
#dataframe["Sex"] = dataframe["Sex"].apply(lambda sex: 0 if sex == 'male' else 1)


In [None]:
# Handle NaN values - Imputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22; `sklearn.impute.SimpleImputer` replaces it. The legacy class is only used as a
# fallback on installations that predate SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    # SimpleImputer always imputes column-wise, matching the old axis=0.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

feature_cols = ['feature_1','feature_2','feature_3','feature_4']
features_original = dataframe[feature_cols].values

# Replace each NaN with the mean of its column.
features_transformed = imp.fit_transform(features_original)


In [None]:
# Analysing Data by Plotting Graphs

import matplotlib.pyplot as plt

%matplotlib inline

#plt.rcParams['figure.figsize'] = (8, 6)
#plt.rcParams['font.size'] = 14

# Pandas scatter plot
dataframe_name.plot(kind='scatter', x='feature_name', y='label_name', alpha=0.2)
# multiple scatter plots in Pandas
feature_cols = ['feature_1', 'feature_2', 'feature_3', 'feature_4']
fig, axs = plt.subplots(1, len(feature_cols), sharey=True)
for index, feature in enumerate(feature_cols):
    bikes.plot(kind='scatter', x=feature, y='label_name', ax=axs[index], figsize=(16, 3))

    
import seaborn as sns

# Seaborn scatter plot with regression line
sns.lmplot(x='feature_name', y='label_name', data=dataframe_name, aspect=1.5, scatter_kws={'alpha':0.2})
# multiple scattered plot using Seaborn
feature_cols = ['feature_1', 'feature_2', 'feature_3', 'feature_4']
sns.pairplot(dataframe_name, x_vars=feature_cols, y_vars='label_name', kind='reg')

# Box Plot for Logistic
sns.boxplot(x='label_name', y='feature_name', data=dataframe_name)
# Scatter Plot for Logistic
sns.lmplot(x='feature_1', y='feature_2', hue = 'label_name', data=dataframe_name, aspect=1.5, ci = None, fit_reg = False)


In [None]:
# Dimensionality reduction - PCA (keep the highest-variance components)

from sklearn import linear_model  # needed by the cross-validation comparison below
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

label = dataframe["label_name"].values
predictors = dataframe.drop(axis = 1,labels= ["label_name"]).values

# First pass: keep every component PCA can produce so the whole variance profile can be
# inspected. The count is derived from `predictors` (the array actually being fitted);
# PCA accepts at most min(n_samples, n_features) components.
pca = PCA(n_components=min(predictors.shape))
pca.fit(predictors)

# Explained-variance ratios come back sorted descending (highest-variance component
# first), so the cumulative sum shows how much variance the first N components retain.
variance_ratio_cum_sum=np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
print(variance_ratio_cum_sum)
plt.plot(variance_ratio_cum_sum)

# Looking at the plot above, say the first 10 components are enough.
# Capped so the example still runs on data with fewer than 10 features/samples.
n_components_kept = min(10, *predictors.shape)
pca = PCA(n_components=n_components_kept)
# fit_transform fits and projects in one step; a separate fit() beforehand is redundant.
transformed_features = pca.fit_transform(predictors) #can then be used for whatever model, say LinearRegression


# Cross Validation Score to double-check difference between transformed VS original

# Scores on the PCA-transformed features
linear_txf = linear_model.LinearRegression()
score_lr_txf = cross_val_score(linear_txf, transformed_features, label, cv=5) # cv=5 -> 5-fold validation
print("LR Model Cross Validation score : " + str(score_lr_txf))
print("LR Model Cross Validation Mean score : " + str(score_lr_txf.mean()))
# Scores on the original, untransformed predictors
lr = linear_model.LinearRegression()
score_lr = cross_val_score(lr, predictors, label, cv=5)
print("LR Model Cross Validation score : " + str(score_lr))
print("LR Model Cross Validation Mean score : " + str(score_lr.mean()))


In [None]:
# Normalize data (bring the variables of different features onto the same scale)
#
# When features have large and differing scales / ranges / metrics,
# important features can get lost or biased
#
# Except for DecisionTree and RandomForest, this can improve performance / results in certain cases

from sklearn.preprocessing import MinMaxScaler

# Note: despite its name, `scaler_model` holds the rescaled array, not the scaler object.
min_max_scaler = MinMaxScaler()
scaler_model = min_max_scaler.fit_transform(features.values)
# 10 evenly spaced bin edges across [0, 1]; each scaled value is replaced by its bin index.
bins = np.linspace(start=0, stop=1, num=10)
digitized_features = np.digitize(scaler_model, bins=bins)


In [None]:
# Preparation of Data for Training/Testing

In [None]:
# Splitting Data into Training and Testing sets

from sklearn.model_selection import train_test_split

# Names of the input columns and of the target column
feature_columns = ['feature_1', 'feature_2', 'feature_3', 'feature_4']
label_columns = "label_name"

# Pull both out of the frame as plain arrays
features = dataframe[feature_columns].values
labels = dataframe[label_columns].values

# random_state pins the shuffle so the split is the same on every run
split_parts = train_test_split(features, labels, random_state=42)
features_train, features_test, labels_train, labels_test = split_parts
#features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=123)


In [None]:
# Supervised Learning

In [None]:
# Linear Regression

from sklearn import metrics  # imported here so the cell runs on a fresh kernel
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(features_train, labels_train)

print(linreg.intercept_)
# Pair every coefficient with its feature name. The names come from `feature_columns`,
# the list the training matrix was built from. Printed explicitly because a notebook
# only auto-displays the last expression of a cell.
print(list(zip(feature_columns, linreg.coef_)))

labels_predicted = linreg.predict(features_test)

# Root mean squared error on the held-out test set
print(np.sqrt(metrics.mean_squared_error(labels_test, labels_predicted)))


In [None]:
# Logistic Regression

from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1e42) # Set Large C value for low regularization
logreg.fit(features_train, labels_train)

print(logreg.intercept_)
# coef_ is 2-D: one row of per-feature coefficients per class (a single row for a binary
# problem). Zipping feature names against it would pair the first name with a whole row,
# so the feature names are attached as column labels instead.
print(pd.DataFrame(logreg.coef_, columns=feature_columns))

labels_predicted = logreg.predict(features_test) # Categorical / Digital
predicted_proba = logreg.predict_proba(features_test) # Analog (e.g. > 0.5 = true else false)


In [None]:
# Decision Tree

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

decision_tree = DecisionTreeClassifier(max_depth=6, criterion="entropy")
#decision_tree = DecisionTreeClassifier(criterion="gini", max_depth=5)
# fit() trains the estimator in place, so the name keeps pointing at the fitted tree
decision_tree.fit(features_train, labels_train)

print(decision_tree.feature_importances_)

# Score on the very data the tree was trained on
train_score = decision_tree.score(features_train, labels_train)
print(train_score)

predicted_proba = decision_tree.predict_proba(features_test)
# ROC AUC on the test set, computed from the second probability column
second_class_proba = predicted_proba[:, 1] # [:,1] for only 2 classes (e.g. Churn, Not Churn)
print(roc_auc_score(labels_test, second_class_proba))


In [None]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Hyper-parameters gathered in one place; random_state pins the forest's randomness
forest_params = dict(
    criterion="entropy",
    max_depth=10,
    min_samples_split=2,
    n_estimators=100,
    random_state=1,
)
random_forest = RandomForestClassifier(**forest_params)
#random_forest = RandomForestClassifier(criterion="gini", max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1)
# fit() trains the estimator in place, so the name keeps pointing at the fitted forest
random_forest.fit(features_train, labels_train)

print(random_forest.feature_importances_)

# Score on the very data the forest was trained on
train_score = random_forest.score(features_train, labels_train)
print(train_score)

predicted_proba = random_forest.predict_proba(features_test)
# ROC AUC on the test set, computed from the second probability column
second_class_proba = predicted_proba[:, 1] # [:,1] for only 2 classes (e.g. Churn, Not Churn)
print(roc_auc_score(labels_test, second_class_proba))


In [None]:
# kNN : k - Nearest Neighbors (Classification)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

# fit() returns the estimator itself, so construction and training can be chained
knn = KNeighborsClassifier(weights='distance', n_neighbors=10).fit(features_train, labels_train)

# Score on the very data the classifier was trained on
train_score = knn.score(features_train, labels_train)
print(train_score)

predicted_proba = knn.predict_proba(features_test)
# ROC AUC on the test set, computed from the second probability column
second_class_proba = predicted_proba[:, 1] # [:,1] for only 2 classes (e.g. Legitimate or Malicious site)
print(roc_auc_score(labels_test, second_class_proba))


In [None]:
# Naive Bayes (Classification, strong independence between features)

# Feature Extraction - http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

#TFIDF Vectorizer
stopset = set(stopwords.words('english'))
# stop_words is passed as a list: recent scikit-learn releases validate the parameter
# and reject a set.
vectorizer = TfidfVectorizer(use_idf=True, lowercase=True, strip_accents='ascii', stop_words=list(stopset))
features = vectorizer.fit_transform(dataframe.text) # transform sentence text from 1 column to features

# Split the TF-IDF matrix itself. Without this step the classifier below would be trained
# on whatever features_train/labels_train an earlier cell left behind, and the vectorized
# text would never be used.
# NOTE(review): assumes `labels` holds one label per row of `dataframe` - confirm.
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=42)


from sklearn import naive_bayes
from sklearn.metrics import roc_auc_score

#clf = naive_bayes.BernoulliNB() #discrete data, for binary/boolean features
#clf = naive_bayes.GaussianNB() # ?
clf = naive_bayes.MultinomialNB() #discrete data, for occurrence count
clf.fit(features_train, labels_train)

predicted_proba = clf.predict_proba(features_test)
# ROC AUC on the test set, computed from the second probability column
print(roc_auc_score(labels_test, predicted_proba[:,1])) # [:,1] for only 2 classes (e.g. Spam or Ham)


In [None]:
# SVM : Support Vector Machine
# Maximize margin around the separating hyperplane

from sklearn import svm
from sklearn.model_selection import cross_val_score

# Inputs: the binned, min-max-scaled features and the label vector produced by earlier
# cells. np.asarray accepts `label` whether it is already an ndarray or still a pandas
# Series (calling `.values` on an ndarray would raise AttributeError).
svm_features = digitized_features
svm_labels = np.asarray(label)

clf = svm.SVC() #C-Support Vector Classification
clf.fit(X=svm_features, y=svm_labels)

# 4-fold cross-validation of the SVC. The variable keeps its historical name
# `score_knn`, but it holds the SVM scores.
score_knn = cross_val_score(clf, svm_features, svm_labels, cv=4)
print("Cross Validation score : " + str(score_knn))
print("Cross Validation Mean score : " + str(score_knn.mean()))


In [None]:
# Presentation of Result

In [None]:
# Confusion Matrix

from sklearn import metrics

def plot_confusion_matrix(df_confusion, title='Confusion matrix', cmap=plt.cm.Blues, class_names=None):
    """Draw a confusion matrix as an annotated heat map on the current matplotlib figure.

    Parameters
    ----------
    df_confusion : array-like of shape (n_classes, n_classes)
        Confusion matrix with actual classes on the rows and predicted classes on the
        columns (the layout the axis labels below assume).
    title : str
        Text shown above the plot.
    cmap : matplotlib colormap
        Colormap used to shade the cells.
    class_names : sequence of str, optional
        Tick labels, one per class. Defaults to ('False', 'True') for a 2x2 matrix and to
        the class indices otherwise.
    """
    import itertools  # local import: this function is the only place that needs it

    # Accept anything array-like (e.g. a DataFrame) while keeping [i, j] indexing valid.
    df_confusion = np.asarray(df_confusion)
    n_rows, n_cols = df_confusion.shape
    if class_names is None:
        if n_rows == 2:
            class_names = ('False', 'True')
        else:
            class_names = tuple(str(i) for i in range(n_rows))

    plt.imshow(df_confusion, cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.yticks(np.arange(n_rows), class_names)
    plt.xticks(np.arange(n_cols), class_names)
    plt.tight_layout()
    plt.ylabel('Actual')
    plt.xlabel('Predicted')

    # Write each count inside its cell: white text on cells above half the maximum
    # count, black text on the others, so the number stays readable.
    thresh = df_confusion.max() / 2
    for i, j in itertools.product(range(n_rows), range(n_cols)):
        plt.text(j, i, df_confusion[i, j],
                 horizontalalignment="center",
                 color="white" if df_confusion[i, j] > thresh else "black")


# Predict with whichever fitted model is under evaluation, tabulate actual vs.
# predicted labels, then draw the table.
labels_pred = whatever_model.predict(features)
df_confusion = metrics.confusion_matrix(y_true=labels, y_pred=labels_pred)
#df_confusion
plot_confusion_matrix(df_confusion=df_confusion)

In [None]:
# Unsupervised Learning

In [None]:
# Clustering with k-Means

from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Elbow Method - to select the optimum k
#   https://en.wikipedia.org/wiki/Determining_the_number_of_clusters_in_a_data_set
#
# Increasing k without penalty 
#    will always reduce the amount of error in the resulting clustering, 
#    to the extreme case of zero error if each data point is considered its own cluster 
#      (i.e., when k equals the number of data points, n)
# the optimal choice of k will strike a balance between 
#   - maximum compression of the data using a single cluster, and 
#   - maximum accuracy by assigning each data point to its own cluster
#
# As k increases, the data points get closer to the centroids of their clusters
# The improvement shrinks with each additional cluster, at some point sharply, which creates the elbow shape

num_of_clusters=range(1,11) # k-means cluster analysis for 1-10 clusters
mean_dist=[]

for k in num_of_clusters:
    # k-means starts from randomly chosen centroids; a fixed seed makes the elbow curve
    # reproducible from one run to the next.
    model=KMeans(n_clusters=k, random_state=42)
    model.fit(predictors)
    # Distance from every point to its nearest centroid, averaged over all points.
    nearest_centroid_dist = cdist(predictors, model.cluster_centers_, 'euclidean').min(axis=1)
    mean_dist.append(nearest_centroid_dist.mean())

plt.plot(num_of_clusters, mean_dist)
plt.xlabel('Number of clusters')
plt.ylabel('Average distance')
plt.title('Selecting k with the Elbow Method') # pick the fewest number of clusters that reduces the average distance
