# Part 1: Using Scikit Learn

## Importing required headers

In [1]:
import pandas
from pandas.tools.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings('ignore')



## Loading the data

In [2]:
def loadData(filePath):
    dataframe= pandas.read_csv(filePath)
    del dataframe['Name']
    del dataframe['Cabin']
    del dataframe['Fare']
    del dataframe['Ticket']
    del dataframe['SibSp']
    del dataframe['PassengerId']
    del dataframe['Embarked']
    del dataframe['Parch']
    return dataframe
train = loadData('./Data/train.csv')

In [3]:
print "Train Data"
train.head()

Train Data


Unnamed: 0,Survived,Pclass,Sex,Age
0,0,3,male,22.0
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
4,0,3,male,35.0


## Splitting Training set 

In [4]:
# Split-out validation dataset

train =train.replace({'male':1,'female':0})

#Replacing the NaN with zeros
train.fillna(0, inplace=True)

array = train.values
#Extracting the data features
X = array[:,1:4]

print X.shape
#Creating label array
Y = array[:,0:1]
print Y.shape

#Splitting the dataset into train and validation sets
validation_size = 0.20
seed = 21
X_train, X_validation, Y_train, Y_validation = cross_validation.train_test_split(X, Y, test_size=validation_size, random_state=seed)

(891, 3)
(891, 1)


## Testing different models

In [5]:
# Test options and evaluation metric
num_folds = 10
num_instances = len(X_train)
seed = 3

models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('Linear Discriminant Analysis', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART Decision Tree', DecisionTreeClassifier()))
models.append(('Naive Bayes', GaussianNB()))
models.append(('SVM', SVC()))

# evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    msg = "model = %s:\n mean = %f std = (%f)\n" % (name, cv_results.mean(), cv_results.std())
    print(msg)

knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_validation)
print(predictions)

model = Logistic Regression:
 mean = 0.786502 std = (0.041145)

model = Linear Discriminant Analysis:
 mean = 0.786502 std = (0.041145)

model = KNN:
 mean = 0.752778 std = (0.044019)

model = CART Decision Tree:
 mean = 0.807551 std = (0.039377)

model = Naive Bayes:
 mean = 0.787911 std = (0.039476)

model = SVM:
 mean = 0.780810 std = (0.036158)

[ 1.  0.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.
  1.  1.  0.  1.  0.  0.  0.  1.  0.  1.  0.  1.  0.  0.  1.  0.  0.  1.
  0.  0.  0.  0.  0.  0.  1.  1.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  1.  0.  1.  0.  0.  0.  1.  1.  1.  1.  0.  0.
  0.  0.  0.  1.  0.  1.  0.  0.  1.  1.  1.  0.  0.  0.  1.  0.  0.  1.
  1.  0.  0.  1.  0.  0.  0.  0.  1.  0.  1.  1.  0.  0.  1.  0.  1.  0.
  1.  0.  0.  0.  0.  1.  1.  0.  0.  0.  1.  0.  1.  1.  0.  0.  0.  0.
  0.  0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.
  0.  0.  1.  0.  0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  1.  1.

# Part 2: Using Apache Spark

## Importing required headers and initializing the environment

In [6]:
import findspark
findspark.init("/usr/local/spark")
import pyspark  
from pyspark import SparkContext
sc =SparkContext()
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import NaiveBayes,NaiveBayesModel
import pyspark.mllib.regression as mllib_reg
import pyspark.mllib.linalg as mllib_lalg
import pyspark.mllib.classification as mllib_class
import pyspark.mllib.tree as mllib_tree

## Loading the dataset

In [7]:
df = (sqlContext.read.format("csv").options(header="true")
    .load("./Data/train.csv"))
df.head()

Row(PassengerId=u'1', Survived=u'0', Pclass=u'3', Name=u'Braund, Mr. Owen Harris', Sex=u'male', Age=u'22', SibSp=u'1', Parch=u'0', Ticket=u'A/5 21171', Fare=u'7.25', Cabin=None, Embarked=u'S')

In [8]:
df.describe()

DataFrame[summary: string, PassengerId: string, Survived: string, Pclass: string, Name: string, Sex: string, Age: string, SibSp: string, Parch: string, Ticket: string, Fare: string, Cabin: string, Embarked: string]

In [9]:
def DisplayColumnNames(dtf):
    for i in dtf:
        print i

In [10]:
df=df.drop('Name').drop('SibSp').drop('Ticket').drop('PassengerId').drop('Fare').drop('Embarked').drop('Parch').drop('Cabin')

In [11]:
DisplayColumnNames(df)

Column<Survived>
Column<Pclass>
Column<Sex>
Column<Age>


## Converting dataframe to RDD

In [12]:
rdd=df.rdd.map(list)

In [13]:
rdd.take(10)

[[u'0', u'3', u'male', u'22'],
 [u'1', u'1', u'female', u'38'],
 [u'1', u'3', u'female', u'26'],
 [u'1', u'1', u'female', u'35'],
 [u'0', u'3', u'male', u'35'],
 [u'0', u'3', u'male', None],
 [u'0', u'1', u'male', u'54'],
 [u'0', u'3', u'male', u'2'],
 [u'1', u'3', u'female', u'27'],
 [u'1', u'2', u'female', u'14']]

## Generating Labelled Points

In [14]:
def Myfunct(line):
    temp=[]
    for i in line:
        #print i
        temp.append(i)
    TargetVariable,Pclass,Sex,Age=temp
    features = [(0 if Age==None else Age) ,(1 if Sex == 'female' else 0),(Pclass)]
    return LabeledPoint(1 if TargetVariable == '1' else 0, features)

In [15]:
labeled_points_rdd = rdd.map(Myfunct)

In [16]:
labeled_points_rdd.take(10)

[LabeledPoint(0.0, [22.0,0.0,3.0]),
 LabeledPoint(1.0, [38.0,1.0,1.0]),
 LabeledPoint(1.0, [26.0,1.0,3.0]),
 LabeledPoint(1.0, [35.0,1.0,1.0]),
 LabeledPoint(0.0, [35.0,0.0,3.0]),
 LabeledPoint(0.0, [0.0,0.0,3.0]),
 LabeledPoint(0.0, [54.0,0.0,1.0]),
 LabeledPoint(0.0, [2.0,0.0,3.0]),
 LabeledPoint(1.0, [27.0,1.0,3.0]),
 LabeledPoint(1.0, [14.0,1.0,2.0])]

## Train and test split

In [17]:
train, test = labeled_points_rdd.randomSplit([0.7, 0.3], seed = 13)

## Training a Naive Bayes Classifier

In [18]:
# parameters:
lamda = 1.0

# initialize classifier:
nbay = mllib_class.NaiveBayes.train(train, lamda)

# Make prediction and test accuracy.
predictionAndLabel = test.map(lambda p : (nbay.predict(p.features), p.label))
testErr = predictionAndLabel.filter(lambda (v, p): v != p).count() / float(test.count())
accuracy = 100.0 * predictionAndLabel.filter(lambda (x, v): x == v).count() / test.count()
print accuracy

80.694980695
