In [1]:
!pip install --upgrade pip
!pip install python-decouple
!pip install geoalchemy2
!pip install shapely
!pip install scipy
!pip install tpot
!pip3 install xgboost

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/5c/e0/be401c003291b56efc55aeba6a80ab790d3d4cece2778288d65323009420/pip-19.1.1-py2.py3-none-any.whl (1.4MB)
[K    100% |████████████████████████████████| 1.4MB 22.7MB/s ta 0:00:01
[?25hInstalling collected packages: pip
  Found existing installation: pip 10.0.1
    Uninstalling pip-10.0.1:
      Successfully uninstalled pip-10.0.1
Successfully installed pip-19.1.1
Collecting python-decouple
  Downloading https://files.pythonhosted.org/packages/9b/99/ddfbb6362af4ee239a012716b1371aa6d316ff1b9db705bfb182fbc4780f/python-decouple-3.1.tar.gz
Building wheels for collected packages: python-decouple
  Building wheel for python-decouple (setup.py) ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/0f/ee/80/75b684060dc6ecc5a28c07b75ef4063f378aff1a37556f342a
Successfully built python-decouple
Installing collected packages: python-decouple
Successfully installed python-decouple-3.1
Collecting geoal

In [2]:
from sqlalchemy import create_engine, func, text
from sqlalchemy.orm import sessionmaker
from decouple import config
from shapely import wkb, wkt
from shapely.geometry import Point
from geoalchemy2.shape import to_shape 

import pandas as pd
import numpy as np
import random
import json
from datetime import datetime, timedelta
import re
from matplotlib import pyplot as plt

from sklearn.preprocessing import RobustScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.
  from numpy.core.umath_tests import inner1d


In [3]:
"""Contains models for DB."""

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, BigInteger, Integer, String, DateTime, ForeignKey, Float
from sqlalchemy.orm import relationship
from geoalchemy2 import Geometry


BASE = declarative_base()


class City(BASE):
    """City model for DB. Has information of cities."""
    __tablename__ = 'city'
    id            = Column(BigInteger, primary_key=True)
    city          = Column(String, unique=False, nullable=False)
    state         = Column(String, unique=False, nullable=True)
    country       = Column(String, unique=False, nullable=False)
    location      = Column(Geometry(geometry_type='POINT'), nullable=False)
    blocks        = relationship("Blocks", back_populates="city")
    zipcodes      = relationship("ZipcodeGeom", back_populates="city")
    incidents     = relationship("Incident", back_populates="city")


class Blocks(BASE):
    """Block model for DB. Has information of city blocks for a related city
        id."""
    __tablename__ = 'block'
    id            = Column(BigInteger, primary_key=True)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    shape         = Column(Geometry(geometry_type='MULTIPOLYGON'), nullable=False)
    population    = Column(Integer, nullable=False)
    city          = relationship("City", back_populates="blocks")
    incidents     = relationship("Incident", back_populates="block")

class ZipcodeGeom(BASE):
    """Zipcode geometry model for DB. Has information of zipcodes and related
        city id."""
    __tablename__ = 'zipcodegeom'
    id            = Column(BigInteger, primary_key=True)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    zipcode       = Column(String, nullable=False, unique=True)
    shape         = Column(Geometry(geometry_type='MULTIPOLYGON'), nullable=False)
    city          = relationship("City", back_populates="zipcodes")

class Incident(BASE):
    """Incident model for DB. Has information of a specific crime, including
        where it took place, when it took place, and the type of crime that
        occurred."""
    __tablename__ = 'incident'
    id            = Column(BigInteger, primary_key=True)
    crimetypeid   = Column(BigInteger, ForeignKey('crimetype.id'), nullable=False)
    locdescid     = Column(BigInteger, ForeignKey('locdesctype.id'), nullable=False)
    cityid        = Column(BigInteger, ForeignKey('city.id'), nullable=False)
    blockid       = Column(BigInteger, ForeignKey('block.id'), nullable=False)
    location      = Column(Geometry(geometry_type='POINT'), nullable=False)
    datetime      = Column(DateTime, nullable=False)
    hour          = Column(Integer, nullable=False)
    dow           = Column(Integer, nullable=False)
    month         = Column(Integer, nullable=False)
    year          = Column(Integer, nullable=False)
    city          = relationship("City", back_populates="incidents")
    block         = relationship("Blocks", back_populates="incidents")
    crimetype     = relationship("CrimeType", back_populates="incidents")
    locationdesc  = relationship("LocationDescriptionType", back_populates="incidents")

class CrimeType(BASE):
    """CrimeType model for DB. Has information of the types of crime, including
        a general description and the numerical severity of the crime."""
    __tablename__ = 'crimetype'
    id            = Column(BigInteger, primary_key=True)
    category      = Column(String, unique=True, nullable=False)
    severity      = Column(Integer, nullable=False)
    incidents     = relationship("Incident", back_populates="crimetype")


class LocationDescriptionType(BASE):
    """Location description model for DB. Has information on the type of
        location that the crime took place."""
    __tablename__ = 'locdesctype'
    id            = Column(BigInteger, primary_key=True)
    key1          = Column(String, nullable=False)
    key2          = Column(String, nullable=False)
    key3          = Column(String, nullable=False)
    incidents     = relationship("Incident", back_populates="locationdesc")

In [4]:
class GetData(object):
    def go(self, SESSION, start_year, end_year):
        SQL_QUERY = \
            f'''
                SELECT
                    incident.blockid,
                    incident.year,
                    incident.month,
                    incident.dow,
                    incident.hour,
                    SUM(crimetype.severity)/AVG(block.population) AS severity
                FROM incident
                INNER JOIN block ON incident.blockid = block.id
                INNER JOIN crimetype ON incident.crimetypeid = crimetype.id
                    AND block.population > 0
                    AND incident.cityid = 1
                    AND incident.year >= {start_year}
                    AND incident.year <= {end_year}
                GROUP BY
                    incident.blockid,
                    incident.year,
                    incident.month,
                    incident.dow,
                    incident.hour
            '''
        return SESSION.execute(text(SQL_QUERY)).fetchall()

In [5]:
from contextlib import contextmanager

@contextmanager
def session_scope():
    """Provide a transactional scope around a series of operations."""

    DB_URI  = config('DB_URI')
    ENGINE  = create_engine(DB_URI)
    Session = sessionmaker(bind=ENGINE)
    SESSION = Session()
    
    try:
        yield SESSION
        SESSION.commit()
    except:
        SESSION.rollback()
        raise
    finally:
        SESSION.close()


def get_data(training_start_year, training_end_year,
             testing_start_year, testing_end_year):
    
    def compile_data(data):
        result = []
        for r in data:
            result.append((r[0], r[1], r[2], r[3], r[4], r[5]))
        return result

    with session_scope() as session:
        training_data = GetData().go(session,
                                     training_start_year,
                                     training_end_year)
        testing_data = GetData().go(session,
                                     testing_start_year,
                                     testing_end_year)
        
        return compile_data(training_data), compile_data(testing_data)

In [6]:
%%time
training_data, testing_data = get_data(2015, 2017, 2016, 2018)

  """)


CPU times: user 2.3 s, sys: 445 ms, total: 2.75 s
Wall time: 36.2 s


In [7]:
def process_data(data, start_year, end_year):

    # records is the list of rows we get from the query with this order:
    #   blockid, year, month, dow, hour, risk
    #   month is from 1 - 12
    #   `POINT(<lat> <lng>)`
    
    NUM_BLOCKIDS = 801
    START_MONTH = START_MONTH_LAST_YEAR = 1
    DATA_PER_YEAR = 12*7*24
    
    if (end_year - start_year + 1) != 3:
        raise ValueError('3 years must be passed in')

    years_data_X = 2
    years_data_y = 1
    X = np.zeros((NUM_BLOCKIDS, years_data_X*DATA_PER_YEAR+1))
    y = np.zeros((NUM_BLOCKIDS, years_data_y*DATA_PER_YEAR+1))
    
    for r in data:
        if r[1] == end_year:
            y[r[0]-1, (((12*r[1]+r[2]-1)-(end_year*12+START_MONTH_LAST_YEAR-1))*7+r[3])*24+r[4]] = \
                float(r[5])
        else:
            X[r[0]-1, (((12*r[1]+r[2]-1)-(start_year*12+START_MONTH-1))*7+r[3])*24+r[4]] = \
                float(r[5])
    
    X[:, -1] = start_year*12+START_MONTH-1

    def make_columns(num_data_cols):
        cols = []
        for i in range(num_data_cols):
            cols.append('col' + str(i))
        cols.append('month')
        
        return cols

    def fill_df(values, num_blockids, num_years, data_per_year):
        columns = make_columns(num_years*data_per_year)
        df = pd.DataFrame(np.zeros((num_blockids, num_years*data_per_year + 1)),
                          columns=columns)
        
        values_transposed = values.transpose()  # X 801 x 4033 => 4033 x 801
        for row in range(len(values[0] - 1)):
            df[columns[row]] = values_transposed[row]
        
        return df
    
    df_X = fill_df(X, NUM_BLOCKIDS, years_data_X, DATA_PER_YEAR)
    df_X['month'] = (end_year - start_year)*12+START_MONTH-1
    df_y = fill_df(y, NUM_BLOCKIDS, years_data_y, DATA_PER_YEAR)

    return df_X, df_y

In [8]:
X_train, y_train = process_data(training_data, 2015, 2017)
X_test, y_test = process_data(testing_data, 2016, 2018)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((801, 4033), (801, 4033), (801, 2017), (801, 2017))

## Classifier

    #             X                 y
    #     2015:  1, 2, 0, 3, 4      2, 8, 9, 3, 2
    #     2016:  2, 0, 3, 4, 0      1, 3, 4, 0, 1
    #     2017:  1, 1, 0, 2, 1      7, 0, 0, 3, 2

    #     Classifier:
    #         y    (801 x 2016) => nonzero -> 1
    #         X    (801 x 2017) => 1 extra value at the end for 12*yr+month-1
    #              (801 x 4033)


    #     Train on X and y_classif_train
    #     y_classif_train = y.apply(lambda x: 1 if x > 0 else 0)

    #     y is a future year
    #     Prediction output = y_pred_classif



In [9]:
y_train_classif = y_train.drop(columns=['month']).astype(bool).astype(int)
y_test_classif = y_test.drop(columns=['month']).astype(bool).astype(int)
y_test_classif.head()

Unnamed: 0,col0,col1,col2,col3,col4,col5,col6,col7,col8,col9,...,col2006,col2007,col2008,col2009,col2010,col2011,col2012,col2013,col2014,col2015
0,0,1,0,0,1,0,0,0,0,0,...,1,1,0,0,1,1,1,1,1,1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,0,1,0,0,0,1,0,0,...,0,0,0,0,1,0,1,0,1,0
3,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,1,0,...,1,0,0,1,0,1,0,0,0,1


In [10]:
all_uniques = set()
unique_fn = np.vectorize(lambda x: all_uniques.add(x))
y_train_classif.applymap(unique_fn)
all_uniques

{0, 1}

In [11]:
# This same code works when you create the param_grid in the same cell
# that you run GridSearchCV(). It does not work when you pass in the
# param_grid created in another cell.
# So we have to repeat all of this for the classifier and regressor.
#
# def gridsearch_fit_predict(X_train, y_train, X_test, y_test,
#                            model,
#                            param_grid,
#                            scoring, 
#                            model_type):
    
#     gridsearch = GridSearchCV(model,
#                               param_grid=param_grid, 
#                               scoring=scoring, 
#                               cv=3, n_jobs=-1,
#                               return_train_score=True, verbose=10)

#     gridsearch.fit(X_train, y_train)

#     if scoring == 'neg_mean_squared_error':
#         best_training_score = -gridsearch.score(X_train, y_train)
#         best_testing_score  = -gridsearch.score(X_test, y_test)
#         score_type = 'mse'
#     elif scoring == 'accuracy':
#         best_training_score = gridsearch.score(X_train, y_train)
#         best_testing_score  = gridsearch.score(X_test, y_test)
#         score_type = 'accuracy'
#     else:
#         raise ValueError('Cannot handle scoring type:', scoring)
        
#     best_model_params = gridsearch.cv_results_['params'][gridsearch.best_index_]
    
#     print(f'  Best training {score_type} from grid search:', best_training_score)
#     print(f'  Best testing {score_type} from grid search: ', best_testing_score)
#     print('Best Grid Search model:', best_model_params)
    
#     y_pred = gridsearch.predict(X_test)
#     if scoring == 'neg_mean_squared_error':
#         score_value = mean_squared_error(y_test, y_pred)
#         print('MSE:', score_value)
#     elif scoring == 'accuracy':
#         score_value = accuracy_score(y_test, y_pred)
#         print('Accuracy:', score_value)
    
#     return y_pred, score_value, best_training_score, best_testing_score, best_model_params

In [12]:
# x1, x2, x3, x4, x5 = gridsearch_fit_predict(X_train,
#                                             y_train_classif,
#                                             X_test,
#                                             y_test_classif,
#                                             pipeline,
#                                             param_grid,
#                                             'accuracy',
#                                             'classifier')
# y_pred = x1
# print('mse:', x2)
# print('best training mse:', x3)
# print('best testing mse: ', x4)
# print('best model params:', x5)

In [13]:
clf = MultiOutputClassifier(RandomForestClassifier())

# For regressors:
# param_grid = {
#     'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.2, 0.5, 0.7],
#     'n_estimators':  [80, 100, 120, 1000, 3000, 5000, 10000],
#     'max_depth': [2, 3, 4, 5, 6, 7, 8],
# }

# For classifiers:
param_grid = {
    'estimator__n_estimators':  [80, 100, 120],
    'estimator__max_depth': [2, 3, 4, 5, 6, 7, 8],
}

gridsearch = GridSearchCV(clf,
                          param_grid=param_grid, 
                          scoring='accuracy', 
                          cv=3, n_jobs=-1,
                          return_train_score=True, verbose=10)
gridsearch.fit(X_train, y_train_classif)

best_training_score = gridsearch.score(X_train, y_train_classif)
best_testing_score  = gridsearch.score(X_test, y_test_classif)
best_model_params = gridsearch.cv_results_['params'][gridsearch.best_index_]

print(f'  Best training accuracy:', best_training_score)
print(f'  Best testing accuracy: ', best_testing_score)
print('Best Grid Search model:', best_model_params)

y_pred_classif = gridsearch.predict(X_test)
score_value = accuracy_score(y_test_classif, y_pred_classif)
print('Accuracy:', score_value)

Fitting 3 folds for each of 21 candidates, totalling 63 fits
[CV] estimator__max_depth=2, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=2, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=2, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=2, estimator__n_estimators=100 .............
[CV] estimator__max_depth=2, estimator__n_estimators=100 .............
[CV] estimator__max_depth=2, estimator__n_estimators=100 .............
[CV] estimator__max_depth=2, estimator__n_estimators=120 .............
[CV] estimator__max_depth=2, estimator__n_estimators=120 .............
[CV] estimator__max_depth=2, estimator__n_estimators=120 .............
[CV] estimator__max_depth=3, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=3, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=3, estimator__n_estimators=80 ..............
[CV] estimator__max_depth=3, estimator__n_estimators=100 .............
[CV] estimator__

[Parallel(n_jobs=-1)]: Done   7 out of  63 | elapsed: 11.7min remaining: 93.4min


[CV] estimator__max_depth=6, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=2, estimator__n_estimators=100, score=0.003745318352059925, total=11.0min
[CV] estimator__max_depth=6, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=2, estimator__n_estimators=100, score=0.003745318352059925, total=11.2min
[CV] estimator__max_depth=6, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=80, score=0.003745318352059925, total=11.9min
[CV] estimator__max_depth=6, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=80, score=0.00749063670411985, total=11.9min
[CV] estimator__max_depth=6, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=80, score=0.003745318352059925, total=12.2min
[CV] estimator__max_depth=6, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=3, estimator__n_estimators=100, score=0.003745

[Parallel(n_jobs=-1)]: Done  14 out of  63 | elapsed: 13.8min remaining: 48.2min


[CV] estimator__max_depth=7, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=3, estimator__n_estimators=100, score=0.003745318352059925, total=13.2min
[CV] estimator__max_depth=7, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=2, estimator__n_estimators=120, score=0.00749063670411985, total=13.1min
[CV] estimator__max_depth=7, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=5, estimator__n_estimators=80, score=0.00749063670411985, total=13.3min
[CV] estimator__max_depth=7, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=2, estimator__n_estimators=120, score=0.003745318352059925, total=13.2min
[CV] estimator__max_depth=7, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=2, estimator__n_estimators=120, score=0.003745318352059925, total=13.4min
[CV] estimator__max_depth=7, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=5, estimator__n_estimators=80, score=0.003745

[Parallel(n_jobs=-1)]: Done  21 out of  63 | elapsed: 14.4min remaining: 28.9min


[CV] estimator__max_depth=7, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=100, score=0.00749063670411985, total=14.7min
[CV] estimator__max_depth=7, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=100, score=0.003745318352059925, total=15.0min
[CV] estimator__max_depth=8, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=4, estimator__n_estimators=100, score=0.003745318352059925, total=15.1min
[CV] estimator__max_depth=8, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=3, estimator__n_estimators=120, score=0.00749063670411985, total=15.4min
[CV] estimator__max_depth=8, estimator__n_estimators=80 ..............
[CV]  estimator__max_depth=3, estimator__n_estimators=120, score=0.003745318352059925, total=15.8min
[CV] estimator__max_depth=8, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=3, estimator__n_estimators=120, score=0.0037

[Parallel(n_jobs=-1)]: Done  28 out of  63 | elapsed: 17.5min remaining: 21.8min


[CV] estimator__max_depth=8, estimator__n_estimators=100 .............
[CV]  estimator__max_depth=5, estimator__n_estimators=100, score=0.003745318352059925, total=16.9min
[CV] estimator__max_depth=8, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=120, score=0.00749063670411985, total=17.7min
[CV] estimator__max_depth=8, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=120, score=0.003745318352059925, total=17.9min
[CV] estimator__max_depth=8, estimator__n_estimators=120 .............
[CV]  estimator__max_depth=4, estimator__n_estimators=120, score=0.003745318352059925, total=18.2min
[CV]  estimator__max_depth=6, estimator__n_estimators=80, score=0.00749063670411985, total=14.9min
[CV]  estimator__max_depth=6, estimator__n_estimators=80, score=0.003745318352059925, total=14.9min
[CV]  estimator__max_depth=6, estimator__n_estimators=80, score=0.003745318352059925, total=15.2min


[Parallel(n_jobs=-1)]: Done  35 out of  63 | elapsed: 27.3min remaining: 21.8min


[CV]  estimator__max_depth=5, estimator__n_estimators=100, score=0.003745318352059925, total=17.1min
[CV]  estimator__max_depth=7, estimator__n_estimators=80, score=0.00749063670411985, total=15.0min
[CV]  estimator__max_depth=5, estimator__n_estimators=120, score=0.003745318352059925, total=19.1min
[CV]  estimator__max_depth=5, estimator__n_estimators=120, score=0.00749063670411985, total=19.8min
[CV]  estimator__max_depth=7, estimator__n_estimators=80, score=0.003745318352059925, total=15.9min
[CV]  estimator__max_depth=6, estimator__n_estimators=100, score=0.00749063670411985, total=18.4min
[CV]  estimator__max_depth=7, estimator__n_estimators=80, score=0.003745318352059925, total=16.3min


[Parallel(n_jobs=-1)]: Done  42 out of  63 | elapsed: 31.0min remaining: 15.5min


[CV]  estimator__max_depth=5, estimator__n_estimators=120, score=0.003745318352059925, total=19.6min
[CV]  estimator__max_depth=6, estimator__n_estimators=100, score=0.003745318352059925, total=18.7min
[CV]  estimator__max_depth=6, estimator__n_estimators=100, score=0.003745318352059925, total=18.7min
[CV]  estimator__max_depth=6, estimator__n_estimators=120, score=0.003745318352059925, total=19.3min
[CV]  estimator__max_depth=8, estimator__n_estimators=80, score=0.003745318352059925, total=16.2min
[CV]  estimator__max_depth=8, estimator__n_estimators=80, score=0.003745318352059925, total=16.6min
[CV]  estimator__max_depth=8, estimator__n_estimators=80, score=0.00749063670411985, total=16.9min


[Parallel(n_jobs=-1)]: Done  49 out of  63 | elapsed: 33.1min remaining:  9.5min


[CV]  estimator__max_depth=7, estimator__n_estimators=100, score=0.00749063670411985, total=18.9min
[CV]  estimator__max_depth=7, estimator__n_estimators=100, score=0.003745318352059925, total=18.9min
[CV]  estimator__max_depth=7, estimator__n_estimators=100, score=0.003745318352059925, total=19.3min
[CV]  estimator__max_depth=6, estimator__n_estimators=120, score=0.003745318352059925, total=20.0min
[CV]  estimator__max_depth=6, estimator__n_estimators=120, score=0.00749063670411985, total=21.4min
[CV]  estimator__max_depth=8, estimator__n_estimators=100, score=0.003745318352059925, total=17.8min
[CV]  estimator__max_depth=8, estimator__n_estimators=100, score=0.00749063670411985, total=18.3min


[Parallel(n_jobs=-1)]: Done  56 out of  63 | elapsed: 35.4min remaining:  4.4min


[CV]  estimator__max_depth=7, estimator__n_estimators=120, score=0.003745318352059925, total=19.5min
[CV]  estimator__max_depth=7, estimator__n_estimators=120, score=0.00749063670411985, total=20.9min
[CV]  estimator__max_depth=7, estimator__n_estimators=120, score=0.003745318352059925, total=20.9min
[CV]  estimator__max_depth=8, estimator__n_estimators=120, score=0.00749063670411985, total=18.0min
[CV]  estimator__max_depth=8, estimator__n_estimators=100, score=0.003745318352059925, total=19.0min
[CV]  estimator__max_depth=8, estimator__n_estimators=120, score=0.003745318352059925, total=19.7min
[CV]  estimator__max_depth=8, estimator__n_estimators=120, score=0.003745318352059925, total=21.0min


[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed: 40.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  63 out of  63 | elapsed: 40.4min finished


  Best training accuracy: 0.004993757802746567
  Best testing accuracy:  0.004993757802746567


NameError: name 'best_model_params' is not defined

In [None]:
for i in y.columns:
    y_temp = y.loc[y_pred_classif.loc[:,i]>0,[i]]  # y_temp is from the original y's
    X_temp = X.loc[y__pred_classif.loc[:,i]>0,:]

In [14]:
best_model_params = gridsearch.cv_results_['params'][gridsearch.best_index_]
best_model_params

{'estimator__max_depth': 2, 'estimator__n_estimators': 80}

In [15]:
y_pred_classif = gridsearch.predict(X_test)
score_value = accuracy_score(y_test_classif, y_pred_classif)
print('Accuracy:', score_value)

Accuracy: 0.004993757802746567


In [None]:

def nonzeros(df):
    values = df.values
    print('len(values):', len(values), 'len(values[0]):', len(values[0]))
    nonzeros = []
    for i in range(len(values)):
        nonzeros.append([values[i][values[i] != 0]])
        print('number of nonzeros:', len(nonzeros[i]))
    return nonzeros

nonzeros(df_X)


In [None]:
    #             X                 y
    #     2015:  1, 2, 0, 3, 4      2, 8, 9, 3, 2
    #     2016:  2, 0, 3, 4, 0      1, 3, 4, 0, 1
    #     2017:  1, 1, 0, 2, 1      7, 0, 0, 3, 2

    #     Classifier:
    #         y    (801 x 2016) => nonzero -> 1
    #         X    (801 x 2017) => 1 extra value at the end for 12*yr+month-1
    #              (801 x 4033)


    #     Train on X and y_classif_train
    #     y_classif_train = y.apply(lambda x: 1 if x > 0 else 0)

    #     y is a future year
    #     Prediction output = y_classif

    #     for i in y.columns:
    #         y_temp = y.loc[y_classif.loc[:,i]>0,[i]]  # y_temp is from the original y's
    #         X_temp = X.loc[y_classif.loc[:,i]>0,:]

    #     Regressor:
    #         Use y_pred to decide which values to drop in X, and y for regressor.
    #         y is for current year of y_classif
    #     One model per column on y          (?? x 1)
    #     Use all X values (including 0's)   (?? x 2016)
    #                                        (?? x 4032)

    #     for i in y.columns:
    #         y_temp.loc[:,[i]] = y_regress
    #         y_pred.loc[y_temp.index,i] = y_temp.loc[:,i]

    #     y_pred goes into the database

    # idx 0  2  3  4  5  6  7  8  9  10        0  1  2  3  4
    # r   1, 2, 0, 3, 4, 2, 0, 3, 4, 0         7, 0, 0, 3, 2
    # loc h, o, s, b, ...                      h, o, s, b, ...
    #     [] [] [] []                          [] [] [] []

    #     X: years 2015, 2016
    #               (24*7*12)*2years = 4000
    #               feature0  feature1 ... feature4000
    #     blockid0   risk       risk         0
    #     blockid1   risk       0            risk

    #     y: year 2017
    #               (24*7*12)*1year = 2000
    #               feature0  feature1 ... feature2000
    #     blockid0   risk       0            0
    #     blockid1   risk       0            risk

    #     y_pred: year 2017
    #               (24*7*12)*1year = 2000
    #               feature0  feature1 ... feature2000
    #     blockid0   0          0            0
    #     blockid1   1          0            1
        
    


In [151]:
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
X_train = X_train.transpose()
X_test  = X_test.transpose()
y_train = y_train.transpose()
y_test  = y_test.transpose()
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(801, 4835) (801, 4835) (801, 2819) (801, 2819)
(4835, 801) (4835, 801) (2819, 801) (2819, 801)


In [152]:
X_train = nonzeros(X_train)
y_train = nonzeros(y_train)
X_test  = nonzeros(X_test)
y_test  = nonzeros(y_test)
X_test.shape, y_test.shape

len(values): 4835 len(values[0]): 801
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzer

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of nonzeros: 1
number of 

AttributeError: 'list' object has no attribute 'shape'

In [133]:
def count_empty_lists(data, data_name):
    num_empty_lists = 0
    for i in range(len(data)):
        if len(data[i]) == 0:
            num_empty_lists += 1
    print(data_name + ': ', 'Num empty lists:', num_empty_lists, 'out of total:', len(data))

count_empty_lists(X_train, 'X_train')
count_empty_lists(X_train, 'y_train')
count_empty_lists(X_train, 'X_test')
count_empty_lists(X_train, 'y_test')

X_train:  Num empty lists: 4033 out of total: 4835
y_train:  Num empty lists: 4033 out of total: 4835
X_test:  Num empty lists: 4033 out of total: 4835
y_test:  Num empty lists: 4033 out of total: 4835


In [121]:
# def check_arrays(X, y):
#     len_X = []
#     len_y = []
#     for i in range(len(X)):
#         len_X.append(len(X[i]))
#         len_y.append(len(y[i]))
#     plt.hist(len_X)
#     plt.show()
#     plt.hist(len_y)
#     plt.show()

# check_arrays(X_train, y_train)
# check_arrays(X_test, y_test)

In [122]:
# def remove_outliers_from_risk(df):
    
#     if 'risk_past' in df.columns:
#         risk = 'risk_past'
#     else:
#         risk = 'risk_future'
        
#     std = np.std(df[risk])
#     df[risk] = np.where(df[risk] < 20*std, 
#                          df[risk], 
#                          [0.]*len(df[risk]))

#     return df

# X_train, y_train, X_test, y_test = \
#     remove_outliers_from_risk(X_train), \
#     remove_outliers_from_risk(y_train), \
#     remove_outliers_from_risk(X_test), \
#     remove_outliers_from_risk(y_test)

In [185]:
def gridsearch_fit_predict(X_train, y_train, X_test, y_test,
                           model, scoring, model_type):
    param_grid = {
        'learning_rate': [0.01, 0.03, 0.05, 0.08, 0.1, 0.2, 0.5, 0.7],
        'n_estimators':  [80, 100, 120, 1000, 3000, 5000, 10000],
        'max_depth': [2, 3, 4, 5, 6, 7, 8],
    }

    gridsearch = GridSearchCV(model,
                              param_grid=param_grid, 
                              scoring=scoring, 
                              cv=3, n_jobs=-1,
                              return_train_score=True, verbose=10)

    gridsearch.fit(X_train, y_train)

    if scoring == 'neg_mean_squared_error':
        best_training_score = -gridsearch.score(X_train, y_train)
        best_testing_score  = -gridsearch.score(X_test, y_test)
        score_type = 'mse'
    else:
        pass
        
    best_model_params = gridsearch.cv_results_['params'][gridsearch.best_index_]
    
    print(f'  Best training {mse} from grid search:', best_training_score)
    print(f'  Best testing {mse} from grid search: ', best_testing_score)
    print('Best Grid Search model:', best_model_params)
    

    y_pred = gridsearch.predict(X_test)
    if score == 'neg_mean_squared_error':
        score_value = mean_squared_error(y_test, y_pred)
        print('MSE:', score_value)
    else:
        pass
        
    return y_pred, score_value, best_training_score, best_testing_score, best_model_params

In [130]:
y_pred, mse, best_training_mse, best_testing_mse, best_model_params = [], [], [], [], []
for i in range(0, 1):  # len(X_train)
    if len(X_train[i]) == 0:
        continue

    x1, x2, x3, x4, x5 = gridsearch_fit_predict(pd.DataFrame(data={'risk': X_train[i]}), 
                                                y_train[i],
                                                pd.DataFrame(data={'risk': X_test[i]}), 
                                                y_test[i],
                                                XGBRegressor(),
                                                'neg_mean_squared_error',
                                                'regressor')
    y_pred.append(x1)
    mse.append(x2)
    best_training_mse.append(x3)
    best_testing_mse.append(x4)
    best_model_params.append(x5)
    break;

Fitting 3 folds for each of 392 candidates, totalling 1176 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Batch computation too fast (0.0221s.) Setting batch_size=18.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Batch computation too slow (9.9066s.) Setting batch_size=9.


KeyboardInterrupt: 

In [10]:
def plot_output(y, y_pred, dataset_type):

    fig = plt.figure(figsize=(10, 8))
    plt.scatter(np.arange(len(y.flatten())),
                 y.flatten(), color='blue');
    plt.scatter(np.arange(len(y_pred.flatten())),
                 y_pred.flatten(), color='red',
                s=1);
    plt.xlabel('Sample number', fontsize=16)
    plt.ylabel('Risk', fontsize=18)
    plt.title(dataset_type + ' dataset', fontsize=18)
    plt.legend(labels=['risk', 'predicted risk'], prop={'size': 20})
    plt.show()

In [11]:
plot_output(y_test.values, y_pred, 'testing')

NameError: name 'y_test' is not defined

## Update database with model predictions

In [None]:
# # MODEL PREDICTION HERE

# # Put predictions into pandas DataFrame with corresponding block id
# predictions = pd.DataFrame([[x] for x in list(block_ids)], columns=["id"])

# block_ids = {}
# for i in range(X.shape[0]):
#     block_ids[X.loc[i, 'blockid']] = i

# # Put predictions into pandas DataFrame with corresponding block id
# predictions = pd.DataFrame([[x] for x in list(block_ids)], columns=["id"])
# predictions.loc[:, "prediction"] = predictions["id"].apply(lambda x: y_pred[block_ids[x],:].astype(np.float64).tobytes().hex())
# predictions.loc[:, "month"] = end_month
# predictions.loc[:, "year"] = end_year
# predictions.to_csv("predictions.csv", index=False)

# # Query SQL
# query_commit_predictions = """
# CREATE TEMPORARY TABLE temp_predictions (
#     id SERIAL PRIMARY KEY,
#     prediction TEXT,
#     month INTEGER,
#     year INTEGER
# );

# COPY temp_predictions (id, prediction, month, year) FROM STDIN DELIMITER ',' CSV HEADER;

# UPDATE block
# SET 
#     prediction = DECODE(temp_predictions.prediction, 'hex'),
#     month = temp_predictions.month,
#     year = temp_predictions.year 
# FROM temp_predictions
# WHERE block.id = temp_predictions.id;

# DROP TABLE temp_predictions;
# """

# # Open saved predictions and send to database using above query
# with open("predictions.csv", "r") as f:
#     print("SENDING TO DB")
#     RAW_CONN = create_engine(DB_URI).raw_connection()
#     cursor = RAW_CONN.cursor()
#     cursor.copy_expert(query_commit_predictions, f)
#     RAW_CONN.commit()
#     RAW_CONN.close()
# os.remove("predictions.csv")

# for r in SESSION.execute("SELECT ENCODE(prediction::BYTEA, 'hex'), id FROM block WHERE prediction IS NOT NULL LIMIT 5;").fetchall():
#     print(np.frombuffer(bytes.fromhex(r[0]), dtype=np.float64).reshape((12,7,24)))
#     print(X[block_ids[int(r[1])], :].reshape((12,7,24)))