# Machine Learning Template

## Importing all required libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn import svm
from sklearn import tree
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

## Loading the dataset

In [None]:
# Read the raw CSV into a DataFrame; the trailing expression shows its column
# labels as the cell output.
df = pd.read_csv(filepath_or_buffer='data.csv')
df.columns

## Assigning Features and target variables

In [None]:
# Select the predictor columns from the loaded DataFrame.
# NOTE: the column names below are placeholders and the `....` is template
# notation, not valid Python -- replace the whole list with the real feature
# column names before running this cell.
features = df[['col1', 'col2', 'col3',....,'coln']]
# Show the shape of the selected feature frame as the cell output.
features.shape

In [None]:
# The label column; pick a different column here if the modelling objective
# changes.
targets = df.loc[:, 'target']
# Show the shape of the selected target as the cell output.
targets.shape

## Splitting the dataset into train and test datasets

In [None]:
# Hold out part of the data for evaluation: test_size=0.2 keeps 80% of the
# rows for training and reserves 20% for testing. The fixed random_state makes
# the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=4,
)

## Logistic Regression Model

In [None]:
# Fit a logistic regression with default settings on the training split and
# report how well it reproduces the training labels.
logreg = LogisticRegression().fit(X_train, y_train)
print('Logistic Regression Results \n\n')
print('Number of observations in the training data:', len(X_train))
print("Training Accuracy of Logit :",round(logreg.score(X_train, y_train),4))

# Hard class predictions on the held-out split, used for accuracy, the
# confusion matrix and the classification report below.
y_pred = logreg.predict(X_test)
print('Number of observations in the test data:',len(X_test))
print("Test Accuracy of Logit :",round(metrics.accuracy_score(y_test, y_pred),4))

#Confusion Matrix
print("\n")
print("Confusion matrix:\n",confusion_matrix(y_test, y_pred))

#Classification Report
print("\n")
report = classification_report(y_test, y_pred)
print("Classification Report:\n" , report)

#AUC score
# ROC AUC measures how well the model ranks samples, so it must be computed
# from continuous scores. Feeding it the hard 0/1 predictions collapses the
# curve to a single threshold and misreports the AUC.
# NOTE(review): assumes a binary target; column 1 of predict_proba is the
# probability of logreg.classes_[1], the label roc_auc_score treats as positive.
from sklearn.metrics import roc_auc_score
y_score = logreg.predict_proba(X_test)[:, 1]
print("AUC score:\n",roc_auc_score(y_test, y_score))

## ROC Curve

In [None]:
# roc_curve needs a continuous score for the class named by pos_label.
# Passing hard predictions (where a larger value means class 1) together with
# pos_label=0 yields an inverted, single-threshold curve, so use the predicted
# probability of the positive class instead.
# NOTE(review): assumes label 0 is one of logreg.classes_ and is the class of
# interest -- change pos_label if the positive class is a different label.
pos_label = 0
pos_column = list(logreg.classes_).index(pos_label)
roc_scores = logreg.predict_proba(X_test)[:, pos_column]
fpr, tpr, thresholds = metrics.roc_curve(y_test, roc_scores, pos_label=pos_label)

# Plot ROC curve
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()

# Print AUC (trapezoidal area under the curve; metrics.auc replaces the
# deprecated np.trapz and the value is now actually printed)
auc = metrics.auc(fpr, tpr)
print("AUC:", round(auc, 4))

## Running and Comparing Multiple Models at Once

In [None]:
from sklearn.preprocessing import StandardScaler

# Convert the feature frame to a float ndarray. DataFrame.as_matrix() was
# removed in pandas 1.0 and the np.float alias in NumPy 1.24; to_numpy with the
# builtin float dtype is the supported equivalent.
X = features.to_numpy(dtype=float)

# Standardise every feature to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before
# cross-validation, so fold statistics leak into training; fit it inside each
# fold if unbiased estimates matter.
scaler = StandardScaler()
X = scaler.fit_transform(X)
y= targets
print("Feature space holds %d observations and %d features" % X.shape)
print("Unique target labels:", np.unique(y))

# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives in
# sklearn.model_selection and yields folds through its split() method.
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    """Return out-of-fold predictions from 5-fold shuffled cross-validation.

    A fresh ``clf_class(**kwargs)`` is fitted on four folds and used to predict
    the remaining fold, until every sample has exactly one prediction.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix; converted to an ndarray so rows are selected by
        position.
    y : pandas.Series or array-like of shape (n_samples,)
        Target labels.
    clf_class : type
        Estimator class exposing ``fit(X, y)`` and ``predict(X)``.
    **kwargs
        Keyword arguments forwarded to ``clf_class``.

    Returns
    -------
    pandas.Series or numpy.ndarray
        Predicted labels aligned with ``y``. A Series (same index and name) is
        returned when ``y`` is a Series, otherwise an ndarray.
    """
    # Work on positional arrays: indexing a Series with the fold indices is
    # label-based and fails (or misaligns) when its index is not 0..n-1.
    X_values = np.asarray(X)
    y_values = np.asarray(y)
    y_pred = y_values.copy()

    # Construct a kfolds object
    kf = KFold(n_splits=5, shuffle=True)

    # Iterate through folds
    for train_index, test_index in kf.split(X_values):
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_values[train_index], y_values[train_index])
        y_pred[test_index] = clf.predict(X_values[test_index])

    # Preserve the caller-visible type: the original returned a copy of y.
    if isinstance(y, pd.Series):
        return pd.Series(y_pred, index=y.index, name=y.name)
    return y_pred

## Running Logistic Regression, Support Vector Machine, Random Forest, and K-Nearest Neighbors with default parameters

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LR

def accuracy(y_true,y_pred):
    """Return the fraction of positions at which y_true and y_pred agree."""
    # Element-wise comparison gives booleans; their mean is the share of
    # matches because True counts as 1 and False as 0.
    matches = y_true == y_pred
    return np.mean(matches)

# Report the cross-validated accuracy of each classifier, all built with their
# default parameters. Each entry pairs the report line's format string with
# the estimator class to evaluate; models run in the listed order.
model_reports = [
    ("Logistic Regression:%.3f", LR),
    ("Support vector machines:%.3f", SVC),
    ("Random forest:%.3f", RF),
    ("K-nearest-neighbors:%.3f", KNN),
]
for report_format, estimator_class in model_reports:
    cv_predictions = run_cv(X, y, estimator_class)
    print(report_format % accuracy(y, cv_predictions))