In [1]:
import os
import pprint

import numpy as np
from pandas.io.json import json_normalize
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Connection string for the course cluster. Set the COURSE_CLUSTER_URI
# environment variable to point at a different cluster or to supply your own
# credentials without editing the notebook; when it is unset, the original
# connection string is used so the notebook still runs as before.
# NOTE(review): the fallback embeds a username and password in source --
# presumably a shared, read-only course login; confirm before reusing this
# pattern with real credentials.
course_cluster_uri = os.environ.get(
    "COURSE_CLUSTER_URI",
    "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin",
)
course_client = MongoClient(course_cluster_uri)

In [3]:
# Handle to the 'titanic' collection in the 'coursera-agg' database; every
# aggregation below runs against this collection.
titanic = course_client['coursera-agg']['titanic']

In [4]:
# Stage that determines the possible values for gender: grouping on the field
# yields one result document per distinct value, carried in its _id.
unique_gender_stage = {"$group": {"_id": "$gender"}}

In [5]:
# Keep only documents with a numeric age and a non-empty point_of_embarkation,
# then collapse them to the distinct gender values.
gender_pipeline = [
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_gender_stage,
]
possible_gender_values = titanic.aggregate(gender_pipeline)

In [6]:
# Drain the cursor and pretty-print the distinct values of the gender field
pprint.pprint([document for document in possible_gender_values])

[{'_id': 'female'}, {'_id': 'male'}]


In [7]:
# Stage that determines the possible values for point_of_embarkation: grouping
# on the field yields one result document per distinct value, carried in its _id.
unique_point_of_embarkation_stage = {"$group": {"_id": "$point_of_embarkation"}}

In [8]:
# Keep only documents with a numeric age and a non-empty point_of_embarkation,
# then collapse them to the distinct point_of_embarkation values.
point_of_embarkation_pipeline = [
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_point_of_embarkation_stage,
]
possible_point_of_embarkation_values = titanic.aggregate(point_of_embarkation_pipeline)

In [9]:
# Drain the cursor and pretty-print the distinct values of the point_of_embarkation field
pprint.pprint([document for document in possible_point_of_embarkation_values])

[{'_id': 'Q'}, {'_id': 'C'}, {'_id': 'S'}]


In [14]:
# Stage that converts the gender and point_of_embarkation field values to
# integers so they can be used as numeric features. Each value's integer code
# is its position in the tuple listed for its field:
#   gender:               'female' -> 0, 'male' -> 1
#   point_of_embarkation: 'C' -> 0, 'Q' -> 1, 'S' -> 2
gender_and_point_of_embarkation_conversion_stage = {
    "$addFields": {
        field_name: {
            "$switch": {
                "branches": [
                    {"case": {"$eq": ["$" + field_name, label]}, "then": code}
                    for code, label in enumerate(labels)
                ]
            }
        }
        for field_name, labels in (
            ("gender", ("female", "male")),
            ("point_of_embarkation", ("C", "Q", "S")),
        )
    }
}

In [15]:
# Build the modelling dataset: filter to documents with a numeric age and a
# non-empty point_of_embarkation, convert the categorical fields to integers,
# then exclude the fields that are not used as features.
cursor = titanic.aggregate([
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    gender_and_point_of_embarkation_conversion_stage,
    {"$project": dict.fromkeys(("_id", "ticket_number", "name", "passenger_id", "cabin"), 0)},
])

In [16]:
# Read every document from the cursor into an in-memory list
titanic_data = [document for document in cursor]

In [17]:
# Load our dataset into a DataFrame: json_normalize turns the list of
# aggregation result documents into a table
df = json_normalize(titanic_data)

In [18]:
# Feature matrix: every column except survived (the inputs we correlate
# against the survival outcome)
df_x = df.drop(labels=['survived'], axis='columns')

In [19]:
# Target vector: just the survived column (the value we want to predict)
df_y = df.loc[:, 'survived']

In [20]:
# Create a Least Squares Linear Regression object (untrained until fit() is
# called on it)
reg = linear_model.LinearRegression()

In [21]:
# Hold out 20% of the rows as a test set and train on the remaining 80%.
# random_state is fixed so the same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    df_x,
    df_y,
    test_size=0.2,
    random_state=0,
)

In [22]:
# Fit a linear model to our training data (features x_train, target y_train)
reg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [23]:
# Check our test set against our trained linear model. The predictions are
# continuous regression outputs rather than 0/1 labels, so individual values
# can fall slightly below 0 or above 1.
reg.predict(x_test)

array([ 0.11539484,  0.90271544,  0.57035024,  0.63352057,  0.01172327,
        0.01800834,  0.06378512,  0.62182568,  0.23796354,  0.71916517,
        1.02356223,  0.12115151,  0.55889098,  0.0294754 ,  0.82479006,
        0.45264152,  0.19836704,  0.3759118 ,  0.31355646,  0.22183264,
        0.44336354,  0.83162647,  0.61957421,  0.2443862 ,  0.38782189,
        0.83087677,  0.40299786,  0.24438707,  0.16732825,  0.09493174,
        0.61874023, -0.02928499, -0.08826717,  0.64469654,  0.59732346,
        0.03927108,  0.09652011,  0.14407889,  0.33575427,  0.56339801,
        0.85238449,  0.15566589,  0.64720588,  0.12115151,  0.12113096,
        0.44901703,  0.5885736 ,  0.20252592,  0.1326563 ,  0.31003879,
        0.68068162,  0.64536789,  0.03974376,  0.61607328,  0.00609504,
        0.56856501,  0.08091643,  0.07937081,  0.82713745,  0.950898  ,
        0.57283271,  0.51746255,  0.845916  ,  0.20614194,  0.64480443,
        0.76276758,  0.84160247,  0.11572537,  0.23575847,  0.74

In [24]:
# Calculate the mean squared error of the test-set predictions against the
# true survived values (should be ~0.13-0.15; this is an absolute error value,
# not a percentage)
mean_squared_error(y_test, reg.predict(x_test))

0.13166469521402543

In [25]:
# Hypothetical passenger to score, keyed by feature name. The categorical
# fields use the integer codes assigned in the conversion stage:
# gender 1 is 'male' and point_of_embarkation 0 is 'C'.
fake_passenger_features = {
    "age": 25,
    "class": 1,
    "fare_paid": 45,
    "gender": 1,
    "parents_children": 0,
    "point_of_embarkation": 0,
    "siblings_spouse": 1,
}

# The model was fitted on the columns of df_x, so the values must be supplied
# in exactly that column order. A hand-written positional list only matches
# when the DataFrame columns happen to be alphabetical, which depends on the
# pandas version; fail loudly if the feature names do not line up.
assert set(fake_passenger_features) == set(df_x.columns), (
    "fake passenger features do not match the training columns: "
    + ", ".join(map(str, df_x.columns))
)

fake_passenger = [[fake_passenger_features[column] for column in df_x.columns]]

In [26]:
# Predict the survival score for the hypothetical passenger.
# Use this output to verify your completion of this exercise
reg.predict(fake_passenger)
# Expected output: 0.50169618

array([0.50169618])