In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn
import scipy as sp
import matplotlib as mpl
import seaborn as sns

# Setting up Pandas
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)

# Setting up Seaborn
sns.set_style("whitegrid")
#sns.set_context("poster")

%matplotlib inline

In [2]:
train = pd.read_csv('training_data.csv')
test = pd.read_csv('test_data.csv')

In [3]:
training = train.drop('status_group', axis=1)

In [4]:
training = training.drop('Unnamed: 0', axis=1)

test = test.drop('Unnamed: 0', axis=1)

In [5]:
##Plot function for Confusion Matrix

#plt.rcParams['figure.figsize'] = (6.0, 6.0)
from sklearn.metrics import confusion_matrix
labels=['functional','functional needs repair','non functional']
def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar(shrink=0.7)
    tick_marks = np.arange(len(labels))
    plt.xticks(tick_marks, labels, rotation=45, ha='right', fontsize=12)
    plt.yticks(tick_marks, labels , fontsize=12)
    plt.tight_layout()
    plt.ylabel('True label', fontsize=12)
    plt.xlabel('Predicted label', fontsize=12)

## Because we are having maximum amount of Categorical data let us transform the features, and convert them into list

In [6]:
def transform_feature(df, column_name):
    unique_values = set(df[column_name].tolist())
    transformer_dict = {}
    for index, value in enumerate(unique_values):
        transformer_dict[value] = index
    df[column_name] = df[column_name].apply(lambda y: transformer_dict[y])
    return df

In [7]:
integer_columns = ['days_since_recorded', 'population'] 
columns_to_transform = [col for col in training.columns if col not in integer_columns]
for column in columns_to_transform: 
    training = transform_feature(training, column)
    test = transform_feature(test, column)

## Now That we have Cleaned our dataset and Transformed our features, let us train our Model.

In [8]:
## Converting the Training dataframe into a matrix and predictor as y 
X = training.as_matrix()
y = train["status_group"].tolist()

> Splitting the Data Set with features into Train set  and Test set to train our model.

In [9]:
import sklearn.model_selection 
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, 
                                                                             y, 
                                                                             test_size = 0.3, 
                                                                             random_state = 0)

> As a part of EDA let us traing the data using different algorithms and findout which give us the better result.

## Random Forest Classifier

In [10]:
import sklearn.ensemble

rfc = sklearn.ensemble.RandomForestClassifier(n_estimators=1000, 
                                              min_samples_split=6,
                                              criterion='gini', 
                                              max_features='auto',
                                              oob_score=True,
                                              random_state=1,
                                              n_jobs=-1)


In [11]:
rfc.fit(X_train, y_train)

print('Random Forest Classifier Train Accuracy Score :', rfc.score(X_train, y_train))
print('Random Forest Classifier Test Score :', rfc.score(X_test, y_test))

Random Forest Classifier Train Accuracy Score : 0.904593554594
Random Forest Classifier Test Score : 0.795398428732


## Decision Tree

In [12]:
from sklearn.tree import DecisionTreeClassifier

In [13]:
dtc = DecisionTreeClassifier(criterion='gini',
                            max_depth = 10,
                            max_features = 'auto',
                            random_state = 1,
                            splitter = 'best')

In [14]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [15]:
print("Train Score :", dtc.score(X_train, y_train))
print("Test Score :", dtc.score(X_test, y_test))

Train Score : 0.733020683021
Test Score : 0.721717171717


## Extra Tree Classifier

In [16]:
from sklearn.ensemble import ExtraTreesClassifier

ETC = ExtraTreesClassifier(n_estimators=1000,min_samples_split=10)

In [17]:
ETC.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=10, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [18]:
print('Extra Tree Classifier Training Score :',ETC.score(X_train, y_train))
print('Extra Tree Classifier Test Score :',ETC.score(X_test, y_test))

Extra Tree Classifier Training Score : 0.890091390091
Extra Tree Classifier Test Score : 0.790572390572


## Linear Search CV

In [19]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [20]:
pipe = Pipeline([('scl', StandardScaler()), ('clf', LinearSVC())])

In [21]:
param_grid = {'clf__C':[0.01],
                      'clf__class_weight':[None]}

In [22]:
estimator = GridSearchCV(estimator=pipe,
                                 param_grid=param_grid,
                                 n_jobs=-1)

In [26]:
estimator.fit(training, train.status_group)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('scl', StandardScaler(copy=True, with_mean=True, with_std=True)), ('clf', LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'clf__C': [0.01], 'clf__class_weight': [None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [27]:
#print(estimator.score(X_test,y_train))

estimator.score(training,train.status_group)

0.63154882154882153

## Gradient Boosting

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

In [29]:
param_test1 = {'n_estimators':range(20,81,10)}
estimator = GridSearchCV(estimator = GradientBoostingClassifier(), 
                         param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
estimator.fit(X_train, y_train)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    188         sys.exit(msg)
    189     main_globals = sys.modules["__main__"].__dict__
    190     if alter_argv:
    191         sys.argv[0] = mod_spec.origin
    192     return _run_code(code, main_globals, None,
--> 193                      "__main__", mod_spec)
        mod_spec = ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py')
    194 
    195 def run_module(mod_name, init_globals=None,
    196                run_name=None, alter_sys=False):
    197     """Execute a module's code without importing it

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/runpy.py in _run_code(code=<code object <module> at 0x105e2ad20, file "/Use...3.6/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/dheerajkrishna/anaconda/lib/python3.6/sit...ges/ipykernel/__pycache__/__main__.cpython-36.pyc', '__doc__': None, '__file__': '/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/dheer.../python3.6/site-packages/ipykernel/kernelapp.py'>}, init_globals=None, mod_name='__main__', mod_spec=ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), pkg_name='ipykernel', script_name=None)
     80                        __cached__ = cached,
     81                        __doc__ = None,
     82                        __loader__ = loader,
     83                        __package__ = pkg_name,
     84                        __spec__ = mod_spec)
---> 85     exec(code, run_globals)
        code = <code object <module> at 0x105e2ad20, file "/Use...3.6/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__annotations__': {}, '__builtins__': <module 'builtins' (built-in)>, '__cached__': '/Users/dheerajkrishna/anaconda/lib/python3.6/sit...ges/ipykernel/__pycache__/__main__.cpython-36.pyc', '__doc__': None, '__file__': '/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py', '__loader__': <_frozen_importlib_external.SourceFileLoader object>, '__name__': '__main__', '__package__': 'ipykernel', '__spec__': ModuleSpec(name='ipykernel.__main__', loader=<_f...b/python3.6/site-packages/ipykernel/__main__.py'), 'app': <module 'ipykernel.kernelapp' from '/Users/dheer.../python3.6/site-packages/ipykernel/kernelapp.py'>}
     86     return run_globals
     87 
     88 def _run_module_code(code, init_globals=None,
     89                     mod_name=None, mod_spec=None,

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    653 
    654         If a global instance already exists, this reinitializes and starts it
    655         """
    656         app = cls.instance(**kwargs)
    657         app.initialize(argv)
--> 658         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    659 
    660 #-----------------------------------------------------------------------------
    661 # utility functions, for convenience
    662 #-----------------------------------------------------------------------------

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    469             return self.subapp.start()
    470         if self.poller is not None:
    471             self.poller.start()
    472         self.kernel.start()
    473         try:
--> 474             ioloop.IOLoop.instance().start()
    475         except KeyboardInterrupt:
    476             pass
    477 
    478 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    172             )
    173         return loop
    174     
    175     def start(self):
    176         try:
--> 177             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    178         except ZMQError as e:
    179             if e.errno == ETERM:
    180                 # quietly return on ETERM
    181                 pass

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    882                 self._events.update(event_pairs)
    883                 while self._events:
    884                     fd, events = self._events.popitem()
    885                     try:
    886                         fd_obj, handler_func = self._handlers[fd]
--> 887                         handler_func(fd_obj, events)
        handler_func = <function wrap.<locals>.null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    888                     except (OSError, IOError) as e:
    889                         if errno_from_exception(e) == errno.EPIPE:
    890                             # Happens when the client closes the connection
    891                             pass

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function wrap.<locals>.null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function wrap.<locals>.null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function wrap.<locals>.null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': "param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-06-18T19:41:20.723657', 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'session': '7998EBB4061848128E94D706CF9F07A1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method Kernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = [b'7998EBB4061848128E94D706CF9F07A1']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': "param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-06-18T19:41:20.723657', 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'session': '7998EBB4061848128E94D706CF9F07A1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=[b'7998EBB4061848128E94D706CF9F07A1'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': "param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)", 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2017-06-18T19:41:20.723657', 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'session': '7998EBB4061848128E94D706CF9F07A1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'C8DA3E51050740E39FA68238A739FCE8', 'msg_type': 'execute_request', 'parent_header': {}})
    385         if not silent:
    386             self.execution_count += 1
    387             self._publish_execute_input(code, parent, self.execution_count)
    388 
    389         reply_content = self.do_execute(code, silent, store_history,
--> 390                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    391 
    392         # Flush output before sending the reply.
    393         sys.stdout.flush()
    394         sys.stderr.flush()

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code="param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)", silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    191 
    192         self._forward_input(allow_stdin)
    193 
    194         reply_content = {}
    195         try:
--> 196             res = shell.run_cell(code, store_history=store_history, silent=silent)
        res = undefined
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = "param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)"
        store_history = True
        silent = False
    197         finally:
    198             self._restore_input()
    199 
    200         if res.error_before_exec is not None:

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/ipykernel/zmqshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, *args=("param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)",), **kwargs={'silent': False, 'store_history': True})
    496             )
    497         self.payload_manager.write_payload(payload)
    498 
    499     def run_cell(self, *args, **kwargs):
    500         self._last_traceback = None
--> 501         return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
        self.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        args = ("param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)",)
        kwargs = {'silent': False, 'store_history': True}
    502 
    503     def _showtraceback(self, etype, evalue, stb):
    504         # try to preserve ordering of tracebacks and print statements
    505         sys.stdout.flush()

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell="param_test1 = {'n_estimators':range(20,81,10)}\ne...,iid=False, cv=5)\nestimator.fit(X_train, y_train)", store_history=True, silent=False, shell_futures=True)
   2712                 self.displayhook.exec_result = result
   2713 
   2714                 # Execute the user code
   2715                 interactivity = "none" if silent else self.ast_node_interactivity
   2716                 has_raised = self.run_ast_nodes(code_ast.body, cell_name,
-> 2717                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler object>
   2718                 
   2719                 self.last_execution_succeeded = not has_raised
   2720 
   2721                 # Reset this so later displayed values do not modify the

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-29-cf999616ceed>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler object>, result=<ExecutionResult object at 113369ac8, execution_..._before_exec=None error_in_exec=None result=None>)
   2822                     return True
   2823 
   2824             for i, node in enumerate(to_run_interactive):
   2825                 mod = ast.Interactive([node])
   2826                 code = compiler(mod, cell_name, "single")
-> 2827                 if self.run_code(code, result):
        self.run_code = <bound method InteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x111e12ed0, file "<ipython-input-29-cf999616ceed>", line 4>
        result = <ExecutionResult object at 113369ac8, execution_..._before_exec=None error_in_exec=None result=None>
   2828                     return True
   2829 
   2830             # Flush softspace
   2831             if softspace(sys.stdout, 0):

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x111e12ed0, file "<ipython-input-29-cf999616ceed>", line 4>, result=<ExecutionResult object at 113369ac8, execution_..._before_exec=None error_in_exec=None result=None>)
   2876         outflag = 1  # happens in more places, so it's easier as default
   2877         try:
   2878             try:
   2879                 self.hooks.pre_run_code_hook()
   2880                 #rprint('Running code', repr(code_obj)) # dbg
-> 2881                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x111e12ed0, file "<ipython-input-29-cf999616ceed>", line 4>
        self.user_global_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ETC': ExtraTreesClassifier(bootstrap=False, class_weig...ate=None,
           verbose=0, warm_start=False), 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nimport ma...oster")\n\nget_ipython().magic(\'matplotlib inline\')', "train = pd.read_csv('training_data.csv')\ntest = pd.read_csv('test_data.csv')", "training = train.drop('status_group', axis=1)", "training = training.drop('Unnamed: 0', axis=1)\n\ntest = test.drop('Unnamed: 0', axis=1)", "##Plot function for Confusion Matrix\n\n#plt.rcPar...2)\n    plt.xlabel('Predicted label', fontsize=12)", 'def transform_feature(df, column_name):\n    uniq...pply(lambda y: transformer_dict[y])\n    return df', "integer_columns = ['days_since_recorded', 'popul...olumn)\n    test = transform_feature(test, column)", '## Converting the Training dataframe into a matr...ng.as_matrix()\ny = train["status_group"].tolist()', 'import sklearn.model_selection \nX_train, X_test,...                                random_state = 0)', 'import sklearn.ensemble\n\nrfc = sklearn.ensemble....                                       n_jobs=-1)', "rfc.fit(X_train, y_train)\n\nprint('Random Forest ...ssifier Test Score :', rfc.score(X_test, y_test))", 'from sklearn.tree import DecisionTreeClassifier', "dtc = DecisionTreeClassifier(criterion='gini',\n ...1,\n                            splitter = 'best')", 'dtc.fit(X_train, y_train)', 'print("Train Score :", dtc.score(X_train, y_train))\nprint("Test Score :", dtc.score(X_test, y_test))', 'from sklearn.ensemble import ExtraTreesClassifie...lassifier(n_estimators=1000,min_samples_split=10)', 'ETC.fit(X_train, y_train)', "print('Extra Tree Classifier Training Score :',E...assifier Test Score :',ETC.score(X_test, y_test))", 'from sklearn.svm import LinearSVC\nfrom sklearn.m...\nfrom sklearn.preprocessing import StandardScaler', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {14: DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=1, splitter='best'), 17: ExtraTreesClassifier(bootstrap=False, class_weig...ate=None,
           verbose=0, warm_start=False), 23: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 24: 0.63154882154882153, 25: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 26: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 27: 0.63154882154882153}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, ...}
        self.user_ns = {'DecisionTreeClassifier': <class 'sklearn.tree.tree.DecisionTreeClassifier'>, 'ETC': ExtraTreesClassifier(bootstrap=False, class_weig...ate=None,
           verbose=0, warm_start=False), 'ExtraTreesClassifier': <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>, 'GradientBoostingClassifier': <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>, 'GridSearchCV': <class 'sklearn.model_selection._search.GridSearchCV'>, 'In': ['', 'import pandas as pd\nimport numpy as np\nimport ma...oster")\n\nget_ipython().magic(\'matplotlib inline\')', "train = pd.read_csv('training_data.csv')\ntest = pd.read_csv('test_data.csv')", "training = train.drop('status_group', axis=1)", "training = training.drop('Unnamed: 0', axis=1)\n\ntest = test.drop('Unnamed: 0', axis=1)", "##Plot function for Confusion Matrix\n\n#plt.rcPar...2)\n    plt.xlabel('Predicted label', fontsize=12)", 'def transform_feature(df, column_name):\n    uniq...pply(lambda y: transformer_dict[y])\n    return df', "integer_columns = ['days_since_recorded', 'popul...olumn)\n    test = transform_feature(test, column)", '## Converting the Training dataframe into a matr...ng.as_matrix()\ny = train["status_group"].tolist()', 'import sklearn.model_selection \nX_train, X_test,...                                random_state = 0)', 'import sklearn.ensemble\n\nrfc = sklearn.ensemble....                                       n_jobs=-1)', "rfc.fit(X_train, y_train)\n\nprint('Random Forest ...ssifier Test Score :', rfc.score(X_test, y_test))", 'from sklearn.tree import DecisionTreeClassifier', "dtc = DecisionTreeClassifier(criterion='gini',\n ...1,\n                            splitter = 'best')", 'dtc.fit(X_train, y_train)', 'print("Train Score :", dtc.score(X_train, y_train))\nprint("Test Score :", dtc.score(X_test, y_test))', 'from sklearn.ensemble import ExtraTreesClassifie...lassifier(n_estimators=1000,min_samples_split=10)', 'ETC.fit(X_train, y_train)', "print('Extra Tree Classifier Training Score :',E...assifier Test Score :',ETC.score(X_test, y_test))", 'from sklearn.svm import LinearSVC\nfrom sklearn.m...\nfrom sklearn.preprocessing import StandardScaler', ...], 'LinearSVC': <class 'sklearn.svm.classes.LinearSVC'>, 'Out': {14: DecisionTreeClassifier(class_weight=None, criter...  presort=False, random_state=1, splitter='best'), 17: ExtraTreesClassifier(bootstrap=False, class_weig...ate=None,
           verbose=0, warm_start=False), 23: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 24: 0.63154882154882153, 25: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 26: GridSearchCV(cv=None, error_score='raise',
     ...train_score=True,
       scoring=None, verbose=0), 27: 0.63154882154882153}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'StandardScaler': <class 'sklearn.preprocessing.data.StandardScaler'>, ...}
   2882             finally:
   2883                 # Reset our crash handler in place
   2884                 sys.excepthook = old_excepthook
   2885         except SystemExit as e:

...........................................................................
/Users/dheerajkrishna/Documents/identifying_faulty_pumps/<ipython-input-29-cf999616ceed> in <module>()
      1 
      2 
      3 param_test1 = {'n_estimators':range(20,81,10)}
----> 4 estimator = GridSearchCV(estimator = GradientBoostingClassifier(), 
      5                          param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
      6 estimator.fit(X_train, y_train)
      7 
      8 
      9 
     10 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self=GridSearchCV(cv=5, error_score='raise',
       e..._score=True,
       scoring='roc_auc', verbose=0), X=array([[   0,  286,    2, ...,    0,    6,    5]...      [   0, 1003,    2, ...,    0,    4,    4]]), y=['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], groups=None)
    940 
    941         groups : array-like, with shape (n_samples,), optional
    942             Group labels for the samples used while splitting the dataset into
    943             train/test set.
    944         """
--> 945         return self._fit(X, y, groups, ParameterGrid(self.param_grid))
        self._fit = <bound method BaseSearchCV._fit of GridSearchCV(...score=True,
       scoring='roc_auc', verbose=0)>
        X = array([[   0,  286,    2, ...,    0,    6,    5]...      [   0, 1003,    2, ...,    0,    4,    4]])
        y = ['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...]
        groups = None
        self.param_grid = {'n_estimators': range(20, 81, 10)}
    946 
    947 
    948 class RandomizedSearchCV(BaseSearchCV):
    949     """Randomized search on hyper parameters.

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self=GridSearchCV(cv=5, error_score='raise',
       e..._score=True,
       scoring='roc_auc', verbose=0), X=array([[   0,  286,    2, ...,    0,    6,    5]...      [   0, 1003,    2, ...,    0,    4,    4]]), y=['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], groups=None, parameter_iterable=<sklearn.model_selection._search.ParameterGrid object>)
    559                                   fit_params=self.fit_params,
    560                                   return_train_score=self.return_train_score,
    561                                   return_n_test_samples=True,
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.model_selection._search.ParameterGrid object>
    565           for train, test in cv_iter)
    566 
    567         # if one choose to see train score, "out" will contain train score info
    568         if self.return_train_score:

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=4), iterable=<generator object BaseSearchCV._fit.<locals>.<genexpr>>)
    763             if pre_dispatch == "all" or n_jobs == 1:
    764                 # The iterable was consumed all at once by the above for loop.
    765                 # No need to wait for async callbacks to trigger to
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=4)>
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time
    771             self._print('Done %3i out of %3i | elapsed: %s finished',
    772                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Sun Jun 18 19:41:29 2017
PID: 2373           Python 3.6.0: /Users/dheerajkrishna/anaconda/bin/python
...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        self.items = [(<function _fit_and_score>, (GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False), memmap([[   0,  286,    2, ...,    0,    6,    5...      [   0, 1003,    2, ...,    0,    4,    4]]), ['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], make_scorer(roc_auc_score, needs_threshold=True), array([ 8189,  8190,  8194, ..., 41577, 41578, 41579]), array([   0,    1,    2, ..., 8507, 8519, 8523]), 0, {'n_estimators': 20}), {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True})]
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0=<list_iterator object>)
    126     def __init__(self, iterator_slice):
    127         self.items = list(iterator_slice)
    128         self._size = len(self.items)
    129 
    130     def __call__(self):
--> 131         return [func(*args, **kwargs) for func, args, kwargs in self.items]
        func = <function _fit_and_score>
        args = (GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False), memmap([[   0,  286,    2, ...,    0,    6,    5...      [   0, 1003,    2, ...,    0,    4,    4]]), ['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], make_scorer(roc_auc_score, needs_threshold=True), array([ 8189,  8190,  8194, ..., 41577, 41578, 41579]), array([   0,    1,    2, ..., 8507, 8519, 8523]), 0, {'n_estimators': 20})
        kwargs = {'error_score': 'raise', 'fit_params': {}, 'return_n_test_samples': True, 'return_parameters': True, 'return_times': True, 'return_train_score': True}
    132 
    133     def __len__(self):
    134         return self._size
    135 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator=GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False), X=memmap([[   0,  286,    2, ...,    0,    6,    5...      [   0, 1003,    2, ...,    0,    4,    4]]), y=['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], scorer=make_scorer(roc_auc_score, needs_threshold=True), train=array([ 8189,  8190,  8194, ..., 41577, 41578, 41579]), test=array([   0,    1,    2, ..., 8507, 8519, 8523]), verbose=0, parameters={'n_estimators': 20}, fit_params={}, return_train_score=True, return_parameters=True, return_n_test_samples=True, return_times=True, error_score='raise')
    255                              " numeric value. (Hint: if using 'raise', please"
    256                              " make sure that it has been spelled correctly.)")
    257 
    258     else:
    259         fit_time = time.time() - start_time
--> 260         test_score = _score(estimator, X_test, y_test, scorer)
        test_score = undefined
        estimator = GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False)
        X_test = memmap([[  0, 286,   2, ...,   0,   6,   5],
   ... 1],
       [  0, 405,   5, ...,   0,   1,   1]])
        y_test = ['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...]
        scorer = make_scorer(roc_auc_score, needs_threshold=True)
    261         score_time = time.time() - start_time - fit_time
    262         if return_train_score:
    263             train_score = _score(estimator, X_train, y_train, scorer)
    264 

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator=GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False), X_test=memmap([[  0, 286,   2, ...,   0,   6,   5],
   ... 1],
       [  0, 405,   5, ...,   0,   1,   1]]), y_test=['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], scorer=make_scorer(roc_auc_score, needs_threshold=True))
    283 def _score(estimator, X_test, y_test, scorer):
    284     """Compute the score of an estimator on a given test set."""
    285     if y_test is None:
    286         score = scorer(estimator, X_test)
    287     else:
--> 288         score = scorer(estimator, X_test, y_test)
        score = undefined
        scorer = make_scorer(roc_auc_score, needs_threshold=True)
        estimator = GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False)
        X_test = memmap([[  0, 286,   2, ...,   0,   6,   5],
   ... 1],
       [  0, 405,   5, ...,   0,   1,   1]])
        y_test = ['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...]
    289     if hasattr(score, 'item'):
    290         try:
    291             # e.g. unwrap memmapped scalars
    292             score = score.item()

...........................................................................
/Users/dheerajkrishna/anaconda/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self=make_scorer(roc_auc_score, needs_threshold=True), clf=GradientBoostingClassifier(criterion='friedman_m...      subsample=1.0, verbose=0, warm_start=False), X=memmap([[  0, 286,   2, ...,   0,   6,   5],
   ... 1],
       [  0, 405,   5, ...,   0,   1,   1]]), y=['non functional', 'functional', 'functional', 'functional', 'functional', 'functional needs repair', 'functional', 'functional', 'non functional', 'non functional', 'non functional', 'non functional', 'functional', 'functional', 'functional', 'functional', 'non functional', 'functional', 'non functional', 'non functional', ...], sample_weight=None)
    166         """
    167         super(_ThresholdScorer, self).__call__(clf, X, y,
    168                                                sample_weight=sample_weight)
    169         y_type = type_of_target(y)
    170         if y_type not in ("binary", "multilabel-indicator"):
--> 171             raise ValueError("{0} format is not supported".format(y_type))
        y_type = 'multiclass'
    172 
    173         if is_regressor(clf):
    174             y_pred = clf.predict(X)
    175         else:

ValueError: multiclass format is not supported
___________________________________________________________________________

In [None]:
best_params = estimator.best_params_
print ('Best parameters are:',best_params)
                                 
val_accuracy = estimator.score(X_test, y_test)
print('Gridsearch Accuracy score: ', va1_accuracy)

Best parameters are: {'learning_rate': 0.075, 'max_depth': 7, 'max_features': 1.0, 'min_samples_leaf': 8, 'n_estimators': 200}

Gridsearch Accuracy score:  0.791189674523

## Predictions

In [30]:
predict = ETC.predict(test)

In [31]:
submission = pd.read_csv('SubmissionFormat.csv')

In [32]:
Final_merge = pd.concat([test, submission], axis=1)

In [33]:
Final_merge.head()

Unnamed: 0,amount_tsh,days_since_recorded,funder,installer,basin,population,public_meeting,scheme_management,permit,construction_year,extraction_type_class,payment_type,water_quality,quantity_group,source_type,source_class,waterpoint_type,waterpoint_type_group,id,status_group
0,0,302,2,2,8,321,2,1,2,1,2,4,6,3,2,1,4,4,50785,predicted label
1,0,302,4,1,0,300,2,4,2,4,6,4,6,4,6,0,5,5,51630,predicted label
2,0,305,2,2,8,500,2,4,0,1,2,4,6,4,2,1,4,4,17168,predicted label
3,0,315,2,2,1,250,0,4,2,6,2,1,6,1,5,0,4,4,45559,predicted label
4,65,251,2,2,1,60,0,5,2,4,6,6,6,2,6,0,5,5,49871,predicted label


In [34]:
Final_merge['status_group'] = predict

In [35]:
Final_merge[['id','status_group']].to_csv('submission.csv', index=False)

In [36]:
sub = pd.read_csv('submission.csv')

In [37]:
sub.head()

Unnamed: 0,id,status_group
0,50785,functional
1,51630,functional
2,17168,functional
3,45559,non functional
4,49871,functional
