<a href="https://colab.research.google.com/github/jahudlow/Data_Fellowship_Project/blob/master/Update_Case_Dispatcher3_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title

!pip install -U scikit-learn==0.21.2

import gspread
import json
import matplotlib
import numpy as np
import os
import pandas as pd
import pickle
import psycopg2
import random
import re
import sys

from copy import deepcopy
from datetime import datetime, date
from google.colab import auth, drive
from io import StringIO
from oauth2client.service_account import ServiceAccountCredentials
from time import time

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole
# notebook; several cells below assign into filtered DataFrames.
pd.set_option('chained_assignment', None)

# Mount Google Drive so the shared-drive project folder is reachable.
drive.mount('/content/drive')
%cd /content/drive/Shared\ drives

# Work from the shared notebooks folder so that the SL_Access package and the
# relative credential path below resolve.
os.chdir('/content/drive/Shared drives/Colab_Notebooks/')

import SL_Access.db_connect as dc
import SL_Access.gsheets as gs
# Database credentials file, relative to the working directory set above;
# passed to dc.DB_Conn in the Searchlight import cell.
db_cred = 'SL_Access/database.ini'
#gs_cred='SL_Access/creds.json'

## Parameters
All of the parameters below can be modified for the current run of the Case Dispatcher. 

After adjusting parameters as needed, select 'Runtime' -> 'Run All' from the menu above.

In [None]:
# 'country' and 'version' are used to access the Case Dispatcher Google Sheet,
# and should match the text in the Sheet title after 'Case Dispatcher 3.0 - '.
# The title is built as 'Case Dispatcher 3.0 - ' + country + version, so
# 'version' must carry its own leading separator (e.g. ' - test').
# 'country' is also used to filter the Searchlight queries by country name.
country = 'Uganda'

version = ' - test'


# Weights of 'Priority Rating' components:
Priority_Weights = {
    'Eminence':	0.2,
    'Solvability':	0.6,
    'Strength_of_Case':	0.2,
}

# Weights of 'Solvability' sub-components:
Solvability_Weights = {
    'Victim_Willing_to_Testify': 2,
    'Bio_and_Location_of_Suspect': 2,
    'Other_Suspect(s)_Arrested': 1,
    'Police_Willing_to_Arrest': 1,
    'Recency_of_Case': 4,
    'Exploitation_Reported': 2,
    'PV_Believes': 2,
}

# The Case Dispatcher gives higher priority to more recent cases. The weight
# placed on recency can be adjusted below (e.g. when 'Discount_Coef' is set to
# 0.01, there is a 1% reduction in priority for each day that passes since the
# case was initiated when the 'Discount_Exp' is set to 1, and a faster
# reduction when it is > 1):
Recency_Vars = {
    'Discount_Coef': 0.01,
    'Discount_Exp': 1,
}

# Weights for different beliefs the PV holds about the suspect:
PV_Believes = {
    'pv_believes_definitely_trafficked_many': 1,
    'pv_believes_trafficked_some': 0.9,
    'pv_believes_suspect_trafficker': 0.5,
    'pv_believes_not_a_trafficker': 0
}


# Weights for different types of exploitation. Keys mirror the CIF columns
# queried from Searchlight; in every pair the '_exp' weight is half the
# matching '_occ' weight.
Exploitation_Type = {
  'exploitation_forced_prostitution_exp': 0.5, 
  'exploitation_sexual_abuse_exp': 0.45, 
  'exploitation_physical_abuse_exp': 0.3, 
  'exploitation_debt_bondage_exp': 0.4, 
  'exploitation_forced_labor_exp': 0.5, 
  'exploitation_organ_removal_exp': 0.5, 
  'exploitation_forced_prostitution_occ':	1, 
  'exploitation_sexual_abuse_occ':	0.9, 
  'exploitation_physical_abuse_occ':	0.6, 
  'exploitation_debt_bondage_occ':	0.8, 
  'exploitation_forced_labor_occ':	1, 
  'exploitation_organ_removal_occ':	1, 
}

# The 'v_mult' dictionary increases the 'solvability' of a case according to
# the number of victims who are willing to testify (key = number of victims,
# value = multiplier):
v_mult = {
    0:0, 1:.5, 7:1
    }

# Fill in keys 2..6: each step closes half of the remaining gap to 1
# (0.75, 0.875, 0.9375, 0.96875, 0.984375). Key 7 keeps the literal value 1.
for i in range(1, 6):
  v_mult[i + 1] = v_mult[i] + (1 - v_mult[i]) * .5


# Grid Search Cross Validation (searches for the best model) -
# Options: 'On', 'Off'
gscv = 'On'
# Number of days used as the recency cutoff; presumably the value handed to
# check_grid_search_cv / remove_recent further down - confirm against the
# calling cell.
cutoff_days = 90 

## Google Sheets Data Import

In [None]:
#@title
import gspread_dataframe as gd

# Interactive Colab sign-in; the default credentials obtained afterwards are
# used to authorize the gspread client.
auth.authenticate_user()

from google.auth import default
creds, _ = default()

# Module-level gspread client; get_gsheets and EntityGroup.update_gsheets
# read this global.
gc = gspread.authorize(creds)

# Title of the workbook to open, e.g. 'Case Dispatcher 3.0 - Uganda - test'.
gs_name = 'Case Dispatcher 3.0 - ' + country + version

In [None]:
#@title
# Worksheet tabs to load from the Case Dispatcher workbook: three entity
# tabs, three 'Closed_*' tabs (presumably their closed-case counterparts),
# and the 'Cases' tab.
sheet_names = ['Suspects',
               'Victims',
               'Police',
               'Closed_Sus',
               'Closed_Vic',
               'Closed_Pol',
               'Cases']

def get_gsheets(workbook_name, sheet_names):
    """Return the worksheets named in `sheet_names` from one Google Sheet.

    Args:
        workbook_name: Title of the Google Sheets workbook to open.
        sheet_names: Iterable of worksheet (tab) names to fetch.

    Returns:
        A list of gspread worksheet objects, in the order of `sheet_names`.

    Uses the module-level gspread client `gc`.
    """
    # Open the workbook once instead of once per tab; every `open` call is a
    # separate round trip to the Sheets/Drive API.
    workbook = gc.open(workbook_name)
    return [workbook.worksheet(name) for name in sheet_names]

# Fetch the worksheet objects for every tab listed in sheet_names ...
cdws = get_gsheets(gs_name, sheet_names)
# ... and convert them with the project helper gs.get_dfs.
# NOTE(review): presumably returns DataFrames keyed by tab name (EntityGroup
# indexes its `gsdfs` argument by sheet name) - confirm in SL_Access.gsheets.
dfs = gs.get_dfs(cdws)

## Searchlight Data Import

In [None]:
#@title
# Create the database connection helper from the credentials file configured
# in the setup cell (dc.DB_Conn is a project class; presumably it opens a
# PostgreSQL connection - the cell output prints a PostgreSQL message on close).
dbc = dc.DB_Conn(db_cred)

# CIF (case) fields joined to their person boxes and the linked person, for
# border stations whose operating country matches `country`. The inner joins
# yield one row per CIF / person-box pair.
# NOTE(review): `country` is interpolated into the SQL text with str.format.
# It is a notebook parameter rather than external input, but a bound query
# parameter would be safer if ex_query supports one - confirm its signature.
db_cif = dbc.ex_query("SELECT cif.cif_number, \
                      cif.interview_date, \
                      cif.number_of_victims, \
                      cif.case_notes, \
                      cif.number_of_traffickers, \
                      cif.recruited_agency, \
                      cif.recruited_broker, \
                      cif.recruited_no, \
                      cif.how_recruited_promised_job, \
                      cif.how_recruited_married, \
                      cif.how_recruited_promised_marriage, \
                      cif.how_recruited_at_work, \
                      cif.how_recruited_at_school, \
                      cif.how_recruited_job_ad, \
                      cif.how_recruited_broker_online, \
                      cif.how_recruited_broker_approached, \
                      cif.how_recruited_broker_through_friends, \
                      cif.how_recruited_broker_through_family, \
                      cif.how_recruited_broker_called_pv, \
                      cif.travel_expenses_paid_themselves, \
                      cif.travel_expenses_paid_by_broker, \
                      cif.expected_earning, \
                      cif.purpose_for_leaving_education, \
                      cif.purpose_for_leaving_travel_tour, \
                      cif.purpose_for_leaving_marriage, \
                      cif.purpose_for_leaving_family, \
                      cif.purpose_for_leaving_medical, \
                      cif.purpose_for_leaving_job_hotel, \
                      cif.purpose_for_leaving_job_household, \
                      cif.planned_destination, \
                      cif.id_made_no, \
                      cif.id_made_real, \
                      cif.id_made_fake, \
                      cif.id_made_false_name, \
                      cif.id_made_other_false, \
                      cif.exploitation_forced_prostitution_occ, \
                      cif.exploitation_forced_labor_occ, \
                      cif.exploitation_physical_abuse_occ, \
                      cif.exploitation_sexual_abuse_occ, \
                      cif.exploitation_debt_bondage_occ, \
                      cif.exploitation_organ_removal_occ, \
                      cif.exploitation_forced_prostitution_exp, \
                      cif.exploitation_forced_labor_exp, \
                      cif.exploitation_physical_abuse_exp, \
                      cif.exploitation_sexual_abuse_exp, \
                      cif.exploitation_debt_bondage_exp, \
                      cif.exploitation_organ_removal_exp, \
                      cif.legal_action_taken, \
                      cif.total_blue_flags, \
                      cif.station_id, \
                      pb.cif_id, \
                      pb.person_id, \
                      pb.pb_number, \
                      dp.arrested, \
                      dp.pv_believes, \
                      dp.role, \
                      bs.station_name \
                      FROM public.dataentry_cifcommon cif \
                      INNER JOIN public.dataentry_personboxcommon pb \
                      ON cif.id = pb.cif_id \
                      INNER JOIN public.dataentry_person dp \
                      ON pb.person_id = dp.id \
                      INNER JOIN public.dataentry_borderstation bs \
                      ON bs.id = cif.station_id \
                      INNER JOIN public.dataentry_country c \
                      ON c.id = bs.operating_country_id \
                      WHERE c.name = '{}'".format(country))

# Contact details of the people attached to each CIF through a person box
# (same joins and country filter as the db_cif query).
# NOTE(review): the country name is interpolated with str.format; prefer a
# bound parameter if ex_query supports one.
db_sus = dbc.ex_query("SELECT cif.cif_number, \
                      dp.full_name, \
                      dp.phone_contact, \
                      dp.address_notes, \
                      dp.case_filed_against, \
                      dp.social_media, \
                      dp.arrested, \
                      pb.person_id \
                      FROM public.dataentry_cifcommon cif \
                      INNER JOIN public.dataentry_personboxcommon pb \
                      ON cif.id = pb.cif_id \
                      INNER JOIN public.dataentry_person dp \
                      ON pb.person_id = dp.id \
                      INNER JOIN public.dataentry_borderstation bs \
                      ON bs.id = cif.station_id \
                      INNER JOIN public.dataentry_country c \
                      ON c.id = bs.operating_country_id \
                      WHERE c.name = '{}'".format(country))

# Contact details of the person referenced by each CIF's main_pv_id (joined
# directly to the person table, not through person boxes), with the same
# country filter.
# NOTE(review): the country name is interpolated with str.format; prefer a
# bound parameter if ex_query supports one.
db_vics = dbc.ex_query("SELECT cif.cif_number, \
                      dp.full_name, \
                      dp.phone_contact, \
                      dp.address_notes, \
                      dp.social_media \
                      FROM public.dataentry_cifcommon cif \
                      INNER JOIN public.dataentry_person dp \
                      ON cif.main_pv_id = dp.id \
                      INNER JOIN public.dataentry_borderstation bs \
                      ON bs.id = cif.station_id \
                      INNER JOIN public.dataentry_country c \
                      ON c.id = bs.operating_country_id \
                      WHERE c.name = '{}'".format(country))

# IRF numbers with their case notes. Unlike the queries above this one has
# no country filter, so it returns every IRF in the table.
irf_case_notes = dbc.ex_query("SELECT irf_number, \
                            case_notes \
                            FROM public.dataentry_irfcommon;")

# All queries are done; release the database connection.
dbc.close_conn()

PostgreSQL connection is closed


In [None]:
#@title
def generate_narrative(db_cif):
    """Fill blank case notes with a narrative built from structured fields.

    A sentence-based narrative is composed for every row from the
    recruitment, travel-expense, earnings, purpose, destination, ID, legal
    and PV-belief columns. Rows whose 'case_notes' is the empty string
    receive that narrative; other rows keep their notes. Finally the literal
    string 'nan' and missing notes are normalised to ''.

    Differences from the previous implementation:
      * the 'how_recruited_broker_approached' sentence was added twice;
      * missing 'legal_action_taken' / 'pv_believes' values made
        `str.contains` return NaN, which `np.where` treats as true, so rows
        with no data were described as "Legal Case Filed" and "PV does not
        believe the suspect is a trafficker";
      * the input frame is no longer modified (it used to gain scratch
        'narrative_*' columns and an overwritten 'case_notes').

    Args:
        db_cif: DataFrame holding the CIF columns referenced below. The
            text columns ('expected_earning', 'planned_destination',
            'legal_action_taken', 'pv_believes', 'case_notes') are expected
            to hold strings.

    Returns:
        A new DataFrame with updated 'case_notes' and without any column
        whose name contains "narrative".
    """
    cases = db_cif.copy()

    def flag(column):
        # Explicit `== True` so that NULL/NaN flags count as "not set".
        return cases[column] == True

    def believes(keyword):
        # na=False: a missing belief matches nothing.
        return cases['pv_believes'].str.contains(
            keyword, regex=False, na=False)

    def compose(rules, accumulate=False):
        """Apply (mask, sentence) rules in order.

        With accumulate=True each matching sentence is prepended to the text
        built so far; otherwise a later match replaces an earlier one.
        """
        text = pd.Series('', index=cases.index, dtype=object)
        for applies, sentence in rules:
            replacement = sentence + text if accumulate else sentence
            text = text.mask(applies, replacement)
        return text

    broker = compose([
        (flag('recruited_broker'), 'They were recruited by a broker. '),
    ])

    # Applied in this order; because each sentence is prepended, the last
    # matching rule ends up first in the text.
    recruited = compose([
        (flag('how_recruited_promised_job'), 'Recruited by job promise. '),
        (flag('how_recruited_married'), 'Recruited by marriage. '),
        (flag('how_recruited_promised_marriage'),
         'Recruited by marriage promise. '),
        (flag('how_recruited_at_work'), 'Recruited at work. '),
        (flag('how_recruited_at_school'), 'Recruited at school. '),
        (flag('how_recruited_job_ad'), 'Recruited through job ad. '),
        (flag('how_recruited_broker_online'), 'Recruited online. '),
        (flag('how_recruited_broker_approached'),
         'Recruited by broker approaching them. '),
        (flag('how_recruited_broker_through_friends'),
         'Recruited through friends. '),
        (flag('how_recruited_broker_through_family'),
         'Recruited through family. '),
    ], accumulate=True)

    paid_self = compose([
        (flag('travel_expenses_paid_themselves'),
         'They paid the travel expenses themselves. '),
    ])
    paid_by_broker = compose([
        (flag('travel_expenses_paid_by_broker'),
         'The broker paid the travel expenses. '),
    ])

    # Missing earnings propagate as NaN through the concatenation and are
    # blanked by fillna, so they contribute nothing to the narrative.
    earning = cases['expected_earning']
    earnings = pd.Series(
        np.where(earning != '',
                 'Broker said they would be earning ' + earning
                 + ' per month. ',
                 ''),
        index=cases.index).fillna('')

    # For purpose, ID and PV belief only one sentence is kept: the last
    # matching rule wins.
    purpose = compose([
        (flag('purpose_for_leaving_education'), 'Left home for education. '),
        (flag('purpose_for_leaving_travel_tour'),
         'Left home for travel or tour. '),
        (flag('purpose_for_leaving_marriage'), 'Left home for marriage. '),
        (flag('purpose_for_leaving_family'), 'Left home for family. '),
        (flag('purpose_for_leaving_medical'),
         'Left home for medical reasons. '),
        (flag('purpose_for_leaving_job_hotel'),
         'Left home for job at hotel. '),
        (flag('purpose_for_leaving_job_household'),
         'Left home for household job. '),
    ])

    planned = cases['planned_destination']
    destination = pd.Series(
        np.where(planned != '',
                 'Planned destination: ' + planned + ' ',
                 ''),
        index=cases.index).fillna('')

    id_made = compose([
        (flag('id_made_no'), 'No ID made. '),
        (flag('id_made_real'), 'Real ID made. '),
        (flag('id_made_fake'), 'Fake ID made. '),
        (flag('id_made_false_name'), 'ID with false name made. '),
        (flag('id_made_other_false'), 'ID made with other false info. '),
    ])

    legal = compose([
        (cases['legal_action_taken'].str.contains('yes', na=False),
         'Legal Case Filed. '),
    ])

    pv_belief = compose([
        (believes('Definitely'),
         'PV believes the suspect has definitely trafficked many. '),
        (believes('some'), 'PV believes the suspect has trafficked some. '),
        (believes('Suspect'), 'PV suspects they are a trafficker. '),
        (believes('Don'),
         'PV does not believe the suspect is a trafficker. '),
    ])

    narrative = pd.Series('', index=cases.index, dtype=object)
    for piece in (broker, recruited, paid_self, paid_by_broker, earnings,
                  purpose, destination, id_made, legal, pv_belief):
        narrative = narrative + piece

    # Only rows with blank notes receive the generated narrative.
    notes = cases['case_notes']
    filled = pd.Series(np.where(notes == '', narrative, notes),
                       index=cases.index)
    cases['case_notes'] = filled.replace('nan', np.nan).fillna('')

    # Drop any pre-existing column whose name contains "narrative".
    return cases.loc[:, ~cases.columns.str.contains("narrative")]

def pre_proc(db_cif):
    """Build suspect IDs and narratives for the suspect rows of `db_cif`.

    Rows whose role is 'Complainant' or 'Witness', and rows without a person
    box number, are dropped. Each remaining row gets a 'suspect_id' of the
    form '<cif_number without dots, minus its last character>.PB<pb_number>',
    and blank case notes are filled by `generate_narrative`.

    Args:
        db_cif: DataFrame with at least 'role', 'pb_number' and 'cif_number'
            plus the columns `generate_narrative` reads.

    Returns:
        The DataFrame returned by `generate_narrative` for the kept rows.
    """
    is_suspect = (db_cif.role != 'Complainant') & (db_cif.role != 'Witness')
    # Explicit copy: the assignments below must not write into a view of the
    # caller's frame.
    soc_df = db_cif[is_suspect & db_cif['pb_number'].notna()].copy()

    # regex=False makes the dot a literal on every pandas version (the
    # default for `regex` changed between pandas 1.x and 2.x).
    case_id = soc_df['cif_number'].str.replace('.', '', regex=False)
    soc_df['suspect_id'] = case_id
    soc_df['pb_number'] = soc_df['pb_number'].astype(int)
    soc_df['suspect_id'] = soc_df['suspect_id'].str[:-1] \
        + ".PB" + soc_df['pb_number'].map(str)

    return generate_narrative(soc_df)


def organize_uganda_dest(soc_df):
    """Clean 'planned_destination' and derive boolean destination flags.

    Punctuation is stripped from 'planned_destination', then one boolean
    column is added per destination of interest: 'destination_gulf' (any of
    several Gulf-region names) and 'destination_<place>' for Kampala,
    Kyegegwa, Nairobi and Kenya. Rows with a missing destination get False
    for every flag.

    The frame is modified in place and also returned.

    Args:
        soc_df: DataFrame with a string column 'planned_destination'.

    Returns:
        The same DataFrame with the cleaned column and the added flags.
    """
    # regex=True must be explicit: since pandas 2.0 str.replace treats the
    # pattern as a literal by default, which silently disabled this cleanup.
    destination = soc_df['planned_destination'].str.replace(
        r'[^\w\s]+', '', regex=True)
    soc_df['planned_destination'] = destination

    # na=False: without it a missing destination yields NaN, which np.where
    # treats as true, flagging unknown destinations as matches.
    soc_df['destination_gulf'] = np.where(
        destination.str.contains(
            'Gulf|Kuwait|Dubai|UAE|Oman|Saudi|Iraq|Qatar|Bahrain',
            na=False),
        True, False)

    for place in ('Kampala', 'Kyegegwa', 'Nairobi', 'Kenya'):
        soc_df['destination_' + place] = np.where(
            destination.str.contains(place, na=False),
            True, False)

    return soc_df

def organize_dtypes(soc_df):
    """Cast feature columns to the dtypes the model pipeline selects on.

    The four count/amount columns become floats (missing values filled with
    0). Every other column, except the identifier/date/text columns listed
    below, is cast to bool. The frame is modified in place and returned.
    """
    numeric_cols = [
        'total_blue_flags',
        'number_of_traffickers',
        'number_of_victims',
        'expected_earning']
    # Identifier, date and free-text columns keep their original dtype.
    untouched = {'suspect_id', 'interview_date', 'case_notes', 'cif_number'}

    flag_cols = list(
        set(soc_df.columns).difference(numeric_cols, untouched))

    soc_df[flag_cols] = soc_df[flag_cols].astype(bool)
    soc_df[numeric_cols] = soc_df[numeric_cols].fillna(0).astype(float)
    return soc_df


def remove_non_numeric(x):
    """Return the digits contained in `x` as an int, or 0 if there are none.

    Every character other than 0-9 is removed before conversion, so
    'UGX 5,000' becomes 5000. Non-string input (None, NaN, numbers) yields
    0, as it did before.

    The previous version passed the integer 0 as the replacement argument of
    `re.sub`, which raises TypeError for every input; the bare `except` then
    returned 0, so the result was always 0.
    """
    if not isinstance(x, str):
        return 0
    digits = re.sub(r"[^0-9]", "", x)
    return int(digits) if digits else 0


def en_features(soc_df):
    """Engineer model features from destination and Person Box variables.

    Steps, in order:
      1. add destination flags (`organize_uganda_dest`);
      2. reduce 'expected_earning' to a number (`remove_non_numeric`);
      3. default a missing 'number_of_victims' to 1;
      4. expand 'pv_believes' into four boolean indicator columns;
      5. drop columns that are not model inputs;
      6. cast dtypes (`organize_dtypes`);
      7. drop columns whose values are all False.

    Args:
        soc_df: Suspect-level DataFrame (see `pre_proc`).

    Returns:
        The engineered DataFrame.
    """
    soc_df = organize_uganda_dest(soc_df)

    soc_df['expected_earning'] = soc_df['expected_earning'].apply(
        remove_non_numeric)

    soc_df['number_of_victims'] = np.where(
        soc_df['number_of_victims'].isna(), 1, soc_df['number_of_victims'])

    # Indicator column -> substring of 'pv_believes' that switches it on.
    belief_keywords = {
        'pv_believes_definitely_trafficked_many': 'Definitely',
        'pv_believes_trafficked_some': 'some',
        'pv_believes_suspect_trafficker': 'Suspect',
        'pv_believes_not_a_trafficker': 'Don',
    }
    for column, keyword in belief_keywords.items():
        # na=False: a missing belief used to produce NaN, which np.where
        # treats as true and therefore set all four indicators to True.
        soc_df[column] = np.where(
            soc_df['pv_believes'].str.contains(
                keyword, regex=False, na=False),
            True, False)

    soc_df = soc_df.drop(columns=[
        'arrested',
        'station_id',
        'cif_id',
        'pb_number',
        'role',
        'planned_destination',
        'pv_believes',
        'legal_action_taken',
        'station_name',
        ])

    soc_df = organize_dtypes(soc_df)

    # Keep only columns with at least one value different from False.
    soc_df = soc_df.loc[:, (soc_df != False).any(axis=0)]

    return soc_df


class TypeSelector(BaseEstimator, TransformerMixin):
    """Transformer step that keeps only the DataFrame columns of one dtype."""

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the columns of DataFrame `X` whose dtype is `self.dtype`."""
        assert isinstance(X, pd.DataFrame)
        wanted = [self.dtype]
        return X.select_dtypes(include=wanted)


def build_transformer():
    """Build the feature-preparation pipeline.

    Boolean columns are passed through unchanged; numeric columns are
    standard-scaled. Both branches are combined by a FeatureUnion.
    """
    boolean_branch = Pipeline([
        ('selector', TypeSelector('bool')),
    ])
    numeric_branch = Pipeline([
        ('selector', TypeSelector(np.number)),
        ('scaler', StandardScaler()),
    ])
    features = FeatureUnion(
        transformer_list=[
            ('boolean', boolean_branch),
            ('numericals', numeric_branch),
        ],
        n_jobs=1)
    return Pipeline([('features', features)])


def remove_recent(df, cutoff_days):
    """Drop cases that are too recent to have a meaningful outcome.

    Adds a 'Days' column to `df` (days elapsed between 'interview_date' and
    today) and returns the rows that are older than `cutoff_days` or that
    already have `Arrest == True`.

    Args:
        df: DataFrame with 'interview_date' (dates, timestamps or parseable
            date strings) and 'Arrest'. The 'Days' column is added in place.
        cutoff_days: Minimum age in days for a case without an arrest.

    Returns:
        The filtered rows of `df`, including the 'Days' column.
    """
    # Normalise both operands to pandas timestamps. Subtracting a Series
    # from a plain `datetime.date` only works for object columns of `date`
    # values and fails for datetime64 columns.
    today = pd.Timestamp(date.today())
    interview_dates = pd.to_datetime(df['interview_date'])
    df['Days'] = (today - interview_dates) / np.timedelta64(1, 'D')

    keep = (df['Days'] > cutoff_days) | (df['Arrest'] == True)
    return df[keep]


def train_test_val_split(sub_df, te_size=.2, val_size=.1):
    """Split the labelled cases into training and validation sets.

    A test partition of size `te_size` is carved off first and then
    discarded; only the training and validation partitions are returned.
    `val_size` is given as a share of the full data set.

    Returns:
        (X_train, X_validation, y_train, y_validation)
    """
    # Label, identifiers, dates and free text are not model inputs.
    non_feature_cols = ['Arrest',
                        'cif_number',
                        'Days',
                        'interview_date',
                        'suspect_id',
                        'person_id',
                        'case_notes']
    features = sub_df.drop(columns=non_feature_cols)
    target = sub_df.Arrest

    # Rescale so the validation share is relative to what remains after the
    # test partition has been removed.
    val_share = val_size / (1 - te_size)

    X_rest, _X_test, y_rest, _y_test = train_test_split(
        features, target, test_size=te_size)
    X_train, X_validation, y_train, y_validation = train_test_split(
        X_rest, y_rest, test_size=val_share)

    return X_train, X_validation, y_train, y_validation


def get_cls_pipe(clf=None):
    """Build a pipeline of the feature transformer followed by a classifier.

    Args:
        clf: Classifier to use as the final step. When omitted, a fresh
            `RandomForestClassifier()` is created for this call.

    Returns:
        A Pipeline with steps 'transformer' and 'clf'.
    """
    # The default used to be `clf=RandomForestClassifier()`, which is
    # evaluated once at definition time: every pipeline built without an
    # explicit classifier shared, and refitted, the same estimator object.
    if clf is None:
        clf = RandomForestClassifier()
    transformer = build_transformer()
    return Pipeline([
        ('transformer', transformer),
        ('clf', clf)
    ])


def pipe_predict(cls_pipeline, X_train, y_train, X_validation):
    """Fit the pipeline on the training data and score the validation set.

    Returns whatever `cls_pipeline.predict_proba(X_validation)` produces.
    """
    cls_pipeline.fit(X_train, y_train)
    return cls_pipeline.predict_proba(X_validation)


def do_gridsearch(cls_pipeline, X_train, y_train):
    """Run a grid-search cross validation over random-forest settings.

    Progress, timing, the best score and the best classifier are printed.

    Returns:
        The fitted GridSearchCV object.
    """
    # Deliberately small grid; wider ranges that were tried before
    # (max_depth 5-50/None, max_features up to 1, "balanced_subsample")
    # have been left out.
    forest_grid = {
        'clf': [RandomForestClassifier()],
        'clf__bootstrap': [False, True],
        'clf__n_estimators': [10, 100],
        'clf__max_depth': [20, 30],
        'clf__max_features': [0.5, 0.6],
        'clf__class_weight': ["balanced", None],
    }
    search_space = [forest_grid]
    grid_search = GridSearchCV(cls_pipeline, search_space,
                               cv=4, n_jobs=-1, verbose=1)

    print("Performing grid search...")
    print("parameters:")
    print(search_space)

    started = time()
    best_model = grid_search.fit(X_train, y_train)
    elapsed = time() - started
    print("done in %0.3fs" % elapsed)
    print()

    best_parameters = best_model.best_estimator_.get_params()['clf']
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    print(best_parameters)
    return best_model


def full_gridsearch_pipe(soc_df, cutoff_days=90):
    """Run the whole model-selection flow on the labelled cases.

    Recent cases are removed, the remainder is split, and a grid search is
    fitted on the training partition.

    Returns:
        (best_model, x_cols, X_validation), where `x_cols` is the list of
        feature column names of the validation set.
    """
    eligible = remove_recent(soc_df, cutoff_days)
    X_train, X_validation, y_train, _y_validation = train_test_val_split(
        eligible)
    pipeline = get_cls_pipe()
    best_model = do_gridsearch(pipeline, X_train, y_train)
    feature_names = list(X_validation.columns)
    return best_model, feature_names, X_validation


def check_grid_search_cv(soc_df, gscv, cutoff_days):
    """Run Grid Search CV when it is switched on.

    Args:
        soc_df: Labelled cases passed on to `full_gridsearch_pipe`.
        gscv: 'On' to run the grid search; any other value skips it.
        cutoff_days: Recency cutoff passed on to `full_gridsearch_pipe`.

    Returns:
        (best_model, x_cols, X_validation) from the grid search when
        `gscv == 'On'`; otherwise (None, None, None), so the caller can tell
        that no model was trained. The previous version raised
        UnboundLocalError in that case.
    """
    if gscv != 'On':
        return None, None, None
    best_model, x_cols, X_Validation = full_gridsearch_pipe(soc_df,
                                                            cutoff_days)
    return best_model, x_cols, X_Validation


def make_new_predictions(df, soc_model, x_cols):
    """Score cases with an already fitted classifier (no refitting).

    Args:
        df: DataFrame of cases; gains a 'soc' column in place.
        soc_model: Fitted model exposing `predict_proba`.
        x_cols: Feature column names the model was trained on.

    Returns:
        `df`, with 'soc' holding the second column of `predict_proba` (the
        positive class for a binary classifier).
    """
    # `df.columns & x_cols` relied on `&` meaning set intersection for an
    # Index. That usage was deprecated and later removed from pandas, so use
    # the explicit method. sort=False keeps the order of df.columns.
    feature_cols = df.columns.intersection(x_cols, sort=False)
    X = df[feature_cols]
    df['soc'] = soc_model.predict_proba(X)[:, 1]
    return df




class GetAttr:
    """Mixin that lets instances of its subclasses be indexed by attribute
    name, so that `obj['name']` returns `obj.name`."""

    def __getitem__(self, attr_name):
        return getattr(self, attr_name)


class EntityGroup(GetAttr):
    """An entity group (Victims, Suspects or Police) and its two sheets.

    Each instance holds the new cases for one group together with the
    DataFrames of its active and closed worksheets. Every instance registers
    itself in the class-level `sheets` list; the classmethods below operate
    on all registered instances at once.

    Attributes set by `__init__`:
        uid: Name of the column that uniquely identifies a row of the group.
        new: DataFrame of new cases.
        gsheet: DataFrame of the active worksheet (`gsdfs[active_gsheet]`).
        closed: DataFrame of the closed worksheet (`gsdfs[closed_gsheet]`).
        active_name / closed_name: Worksheet names, used when writing back.

    Attributes added later:
        newcopy, active: created by `combine_sheets`.
    """
    # Registry of all instances, shared across the class.
    sheets = []

    def __init__(self, uid, new_cases, active_gsheet, closed_gsheet, gsdfs):
        EntityGroup.sheets.append(self)
        self.uid = uid
        self.new = new_cases
        self.gsheet = gsdfs[active_gsheet]
        self.closed = gsdfs[closed_gsheet]
        self.active_name = active_gsheet
        self.closed_name = closed_gsheet

    @staticmethod
    def _stack_by_position(frames, final_cols):
        """Concatenate frames column-by-column, ignoring their header names.

        All frames must have `len(final_cols)` columns. They are temporarily
        relabelled with position-suffixed names so that `pd.concat` lines
        them up by position (and so that repeated header names cannot
        clash); the result is labelled with `final_cols`. The input frames
        are left untouched.
        """
        positional = [name + str(pos) for pos, name in enumerate(final_cols)]
        relabelled = []
        for frame in frames:
            frame = frame.copy()
            frame.columns = positional
            relabelled.append(frame)
        stacked = pd.concat(relabelled)
        stacked.columns = final_cols
        return stacked

    @classmethod
    def merge_addresses(cls, addr):
        """Adds relevant address data to new entity groups.

        Only groups whose new cases have an 'address1_id' column are
        touched. `addr` is left-merged on 'address2_id' and an 'Address'
        column is built as '<address_2>, <address_1>'.
        """
        for sheet in cls.sheets:
            if 'address1_id' in sheet.new:
                for id_col in ('address1_id', 'address2_id'):
                    sheet.new[id_col] = sheet.new[id_col].fillna(
                        0).astype(int)
                sheet.new = pd.merge(sheet.new, addr, how='left',
                                     on='address2_id')
                sheet.new['Address'] = sheet.new['address_2'].map(str) \
                    + ", " + sheet.new['address_1']

    @classmethod
    def subset_country(cls, scc_key, country):
        """Subset the new cases of every group to a specific country.

        The first three characters of the group's uid are used as the
        station code, which is looked up in `scc_key` (expected to provide
        'station_code' and 'country_name').
        """
        for sheet in cls.sheets:
            sheet.new['station_code'] = sheet.new[sheet.uid].str[:3]
            sheet.new = pd.merge(sheet.new, scc_key, how='left',
                                 on='station_code')
            sheet.new = sheet.new[sheet.new.country_name == country]
            # Drop the last two columns again (presumably the helper columns
            # added above - verify against the layout of scc_key).
            sheet.new = sheet.new.iloc[:, 0:len(sheet.new.columns)-2]
            sheet.new = sheet.new.drop_duplicates(subset=sheet.uid)

    @classmethod
    def set_case_id(cls):
        """Creates a Case ID from the form ID stored in the database.

        Dots are removed from 'Case_ID' and its last character is dropped.
        """
        for sheet in cls.sheets:
            # regex=False: the dot is a literal on every pandas version.
            case_id = sheet.new['Case_ID'].str.replace('.', '', regex=False)
            sheet.new['Case_ID'] = case_id.str[:-1]

    @classmethod
    def combine_sheets(cls):
        """Adds new cases to data already in the corresponding Google Sheet.

        Sets `newcopy` (the new cases reshaped to the sheet's columns) and
        `active` (sheet rows followed by the new rows, de-duplicated on the
        uid; the first occurrence, i.e. the existing sheet row, is kept).
        """
        for sheet in cls.sheets:
            sheet.newcopy = deepcopy(sheet.new)
            sheet.newcopy = sheet.newcopy.reindex(
                columns=sheet.new.columns.tolist() + list(sheet.gsheet.columns))
            # Skip the first seven columns.
            # NOTE(review): presumably the raw columns of the new-case frame;
            # the constant 7 is tied to that layout - confirm.
            sheet.newcopy = sheet.newcopy.iloc[:, 7:len(sheet.newcopy.columns)]
            sheet.active = pd.concat([sheet.gsheet, sheet.newcopy], sort=False)
            sheet.active = sheet.active.drop_duplicates(subset=sheet.uid)

    @classmethod
    def move_closed(cls, soc_df):
        """Moves closed cases to closed sheet for each Entity Group instance.

        Closed cases are (a) new cases whose uid matches a suspect_id with
        `Arrest == 1` in `soc_df` and that are not yet on the closed sheet,
        and (b) rows of the active sheet whose 'Date_Closed' is longer than
        one character. They are appended to `closed` and removed from
        `active`.
        """
        arrested_ids = soc_df[soc_df.Arrest == 1].suspect_id
        for sheet in cls.sheets:
            prev_closed = sheet.newcopy[
                sheet.newcopy[sheet.uid].isin(arrested_ids)].copy()
            prev_closed['Case_Status'] = "Closed: Already in Legal Cases Sheet"
            newly_closed = sheet.gsheet[
                sheet.gsheet['Date_Closed'].str.len() > 1]
            prev_closed = prev_closed[
                ~prev_closed[sheet.uid].isin(sheet.closed[sheet.uid])]
            sheet.closed = cls._stack_by_position(
                [sheet.closed, prev_closed, newly_closed],
                list(prev_closed.columns))
            sheet.closed = sheet.closed.drop_duplicates(subset=sheet.uid)
            sheet.active = sheet.active[~sheet.active[sheet.uid].isin(
                sheet.closed[sheet.uid])]

    @classmethod
    def move_other_closed(cls, suspects, police, victims):
        """Moves cases closed in other Entity Groups to closed sheets.

        A suspect/police row is closed when its suspect is closed in the
        other of those two groups or when its case has no active victim. A
        victim row is closed when its case has no active police row or no
        active suspect row.
        """
        closed_suspects = suspects.active[
            (suspects.active['Suspect_ID'].isin(police.closed['Suspect_ID'])) |
            (~suspects.active['Case_ID'].isin(victims.active['Case_ID']))]
        closed_police = police.active[
            (police.active['Suspect_ID'].isin(suspects.closed['Suspect_ID'])) |
            (~police.active['Case_ID'].isin(victims.active['Case_ID']))]
        closed_victims = victims.active[
            (~victims.active['Case_ID'].isin(police.active['Case_ID'])) |
            (~victims.active['Case_ID'].isin(suspects.active['Case_ID']))]

        # The drop_duplicates results for suspects and police used to be
        # discarded (no assignment, no inplace), so duplicates were only
        # ever removed for victims.
        suspects.closed = cls._stack_by_position(
            [suspects.closed, closed_suspects],
            list(suspects.closed.columns))
        suspects.closed = suspects.closed.drop_duplicates(
            subset='Suspect_ID')

        police.closed = cls._stack_by_position(
            [police.closed, closed_police],
            list(police.closed.columns))
        police.closed = police.closed.drop_duplicates(subset='Suspect_ID')

        victims.closed = cls._stack_by_position(
            [victims.closed, closed_victims],
            list(victims.closed.columns))
        victims.closed = victims.closed.drop_duplicates(subset='Victim_ID')

        for sheet in cls.sheets:
            sheet.active = sheet.active[~sheet.active[sheet.uid].isin(
                sheet.closed[sheet.uid])]
            sheet.closed = sheet.closed[sheet.closed['Case_ID'] != '']
            sheet.closed = sheet.closed.loc[:, :'Date_Closed']

    @classmethod
    def update_gsheets(cls, gs_name, active_cases):
        """Update Google Sheets with new data.

        Writes every group's active frame (without its last column) and
        closed frame to their worksheets, then writes `active_cases` to the
        'Cases' worksheet. Uses the module-level gspread client `gc`.
        """
        # Open the workbook once rather than once per worksheet.
        workbook = gc.open(gs_name)
        for sheet in cls.sheets:
            target_sheet = workbook.worksheet(sheet.active_name)
            up_sheet = sheet.active.iloc[:, :len(sheet.active.columns)-1]
            gd.set_with_dataframe(target_sheet, up_sheet)
            target_sheet = workbook.worksheet(sheet.closed_name)
            gd.set_with_dataframe(target_sheet, sheet.closed)
        target_sheet = workbook.worksheet('Cases')
        up_sheet = active_cases.iloc[:, :len(active_cases.columns)]
        gd.set_with_dataframe(target_sheet, up_sheet)

    @classmethod
    def add_irf_notes(cls, irf_notes):
        """Copy IRF case notes into 'IRF_Case_Notes' of every active frame.

        `irf_notes` is left-merged on Case_ID = irf_number; the merge helper
        columns are then cut off by keeping columns up to 'Date_Closed'.
        """
        for sheet in cls.sheets:
            sheet.active = pd.merge(sheet.active, irf_notes, how='left',
                                    left_on='Case_ID', right_on='irf_number')
            sheet.active['IRF_Case_Notes'] = sheet.active['case_notes']
            sheet.active = sheet.active.loc[:, :'Date_Closed']

    @classmethod
    def add_case_name_formula(cls):
        """Fill 'Case_Name' with a lookup formula into the 'Cases' sheet.

        The index of every active frame is reset first, so frame row i maps
        to spreadsheet row i + 2 (presumably because row 1 holds the header
        - confirm against how the sheet is written).
        """
        formula = '=iferror(index(Cases!B:B,match(A{},Cases!A:A,0)),)'
        for sheet in cls.sheets:
            sheet.active = sheet.active.reset_index(drop=True)
            sheet.active['Case_Name'] = [
                formula.format(row + 2) for row in range(len(sheet.active))]

    new_gsheets = []

def set_vic_id(new_victims):
    """Build the victim sheet rows, keyed by a generated ``Victim_ID``.

    ``Victim_ID`` starts as a copy of ``cif_number``; a '.1' or a trailing
    'A' becomes '.V1', and a trailing 'B'..'J' becomes '.V2'..'.V10'. Rows
    are sorted by name, de-duplicated on ``Victim_ID`` (first row wins),
    rows with an empty name are removed, the columns are renamed to the
    sheet headers and an empty ``Narrative`` column is appended.
    """
    source_cols = ['cif_number', 'full_name', 'phone_contact',
                   'address_notes', 'social_media']
    suffix_patterns = {
        r'(\.1|A$)': '.V1', r'B$': '.V2', r'C$': '.V3', r'D$': '.V4',
        r'E$': '.V5', r'F$': '.V6', r'G$': '.V7', r'H$': '.V8',
        r'I$': '.V9', r'J$': '.V10'}
    sheet_headers = {
        'cif_number': 'Case_ID',
        'full_name': 'Name',
        'phone_contact': 'Phone Number(s)',
        'address_notes': 'Address',
        'social_media': 'Socail Media',
    }

    victims = new_victims[source_cols].copy()
    victims['Victim_ID'] = victims['cif_number']
    victims = victims.replace({'Victim_ID': suffix_patterns}, regex=True)
    victims = victims.sort_values('full_name')
    victims = victims.drop_duplicates(subset='Victim_ID')
    # Blank names are filtered only after de-duplication.
    victims = victims[victims['full_name'] != ""]
    victims = victims.rename(columns=sheet_headers)
    return victims.assign(Narrative='')


def set_sus_id(new_suspects, db_cif):
    """Build the suspect sheet rows, keyed by a generated ``Suspect_ID``.

    Suspects are outer-joined to the CIF records on ``person_id``.
    ``Suspect_ID`` is the CIF number with dots removed and its last
    character dropped, followed by '.PB' and the integer ``pb_number``
    (0 when missing). Rows are de-duplicated on ``Suspect_ID`` and the
    columns are renamed to the sheet headers.

    Args:
        new_suspects: DataFrame with ``person_id``, ``full_name``,
            ``phone_contact``, ``address_notes`` and ``social_media``.
        db_cif: DataFrame with ``cif_number``, ``person_id``, ``pb_number``
            and ``case_notes``.

    Returns:
        DataFrame with columns Case_ID, Suspect_ID, Name, Phone Number(s),
        Address, Social Media ID and Narrative.
    """
    person_cols = ['person_id', 'full_name', 'phone_contact',
                   'address_notes', 'social_media']
    cif_ids = db_cif[['cif_number', 'person_id', 'pb_number', 'case_notes']]
    merged = pd.merge(new_suspects[person_cols], cif_ids, how='outer',
                      on='person_id', sort=True, suffixes=('x', 'y'))

    # Replace the column outright: `.loc[:, col] = ...` writes into the
    # existing float column on pandas >= 2.0, which left values such as 1.0
    # and produced IDs ending in '.PB1.0'.
    merged['pb_number'] = merged['pb_number'].fillna(0).astype(int)

    # regex=False: the dot is a literal character, not a wildcard.
    case_stub = merged['cif_number'].str.replace(
        '.', '', regex=False).str[:-1]
    merged['Suspect_ID'] = case_stub + ".PB" + merged['pb_number'].map(str)

    merged = merged.drop_duplicates(subset='Suspect_ID')
    merged = merged[['cif_number', 'Suspect_ID', 'full_name',
                     'phone_contact', 'address_notes',
                     'social_media', 'case_notes']]
    return merged.rename(columns={
        'full_name': 'Name',
        'phone_contact': 'Phone Number(s)',
        'address_notes': 'Address',
        'social_media': 'Social Media ID',
        'cif_number': 'Case_ID',
        'case_notes': 'Narrative',
        })

def save_results(best_model, X_validation):
    """Pickle the model and save its feature column names for later use.

    Args:
        best_model: Object pickled to 'Case Dispatcher 3.0/u21_rf_model.sav'.
        X_validation: DataFrame whose column names are written, one per
            line, to 'Case Dispatcher 3.0/X_cols.txt'.
    """
    # Context managers guarantee both handles are flushed and closed; the
    # model file used to be opened inline and never closed.
    with open('Case Dispatcher 3.0/u21_rf_model.sav', 'wb') as model_file:
        pickle.dump(best_model, model_file)
    with open('Case Dispatcher 3.0/X_cols.txt', 'w') as cols_file:
        cols_file.writelines("%s\n" % col for col in X_validation.columns)



def sum_and_join_vic(x):
    """Collapse one group of willing victims into a single summary row.

    Returns a Series holding the summed ``count`` and the group's
    ``willing_to_testify`` values joined with ', '.
    """
    names = x['willing_to_testify'].astype(str)
    summary = {
        'count': x['count'].sum(),
        'willing_to_testify': ', '.join(names),
    }
    return pd.Series(summary)


def sum_and_join_sus(x):
    """Collapse one group of located suspects into a single summary row.

    Returns a Series holding the summed ``count`` and the group's
    ``located`` values joined with ', '.
    """
    names = x['located'].astype(str)
    summary = {
        'count': x['count'].sum(),
        'located': ', '.join(names),
    }
    return pd.Series(summary)


def get_vics_willing_to_testify(victims):
    """Summarise, per case, the victims who are willing to press charges.

    A victim counts as willing when ``Case_Status`` equals
    "Step Complete: Victim is willing to press charges". When any exist the
    result is indexed by ``Case_ID`` with a ``count`` and a comma-separated
    ``willing_to_testify`` name list; otherwise the (empty) filtered frame
    is returned with an empty ``willing_to_testify`` column added.
    """
    willing_status = "Step Complete: Victim is willing to press charges"
    willing = victims[victims['Case_Status'] == willing_status]
    if len(willing) == 0:
        return willing.assign(willing_to_testify='')

    per_victim = (willing[['Case_ID', 'Name']]
                  .rename(columns={"Name": "willing_to_testify"})
                  .assign(count=1))
    return per_victim.groupby('Case_ID').apply(sum_and_join_vic)


def add_vic_names(target_sheet, vics_willing):
    """Record the willing victims of each case on another sheet.

    When ``vics_willing`` has rows it is left-joined on ``Case_ID`` and its
    name list is stored in ``Victims_Willing_to_Testify`` ('' where a case
    has none); the helper join columns are then removed. With no willing
    victims the sheet is returned untouched.
    """
    if len(vics_willing) == 0:
        return target_sheet

    joined = target_sheet.merge(vics_willing, how='left', on='Case_ID')
    joined['Victims_Willing_to_Testify'] = (
        joined['willing_to_testify'].fillna(''))
    return joined.drop(columns=['willing_to_testify', 'count'])


def get_sus_located(suspects):
    """Summarise, per case, the suspects whose status is a completed step.

    A suspect is included when ``Case_Status`` contains "Step Complete".
    When any exist the result is indexed by ``Case_ID`` with a ``count`` and
    a comma-separated ``located`` name list; otherwise the empty filtered
    frame is returned as-is.
    """
    step_done = suspects.Case_Status.str.contains("Step Complete", na=False)
    located = suspects.loc[step_done]
    if len(located) == 0:
        return located

    per_suspect = (located[['Case_ID', 'Name']]
                   .rename(columns={"Name": "located"})
                   .assign(count=1))
    return per_suspect.groupby('Case_ID').apply(sum_and_join_sus)


def add_sus_located(target_sheet, sus_located):
    """Record the located suspects of each case on another sheet.

    When ``sus_located`` has rows it is left-joined on ``Case_ID`` and its
    name list is stored in ``Suspects_Identified_and_Located`` ('' where a
    case has none); the helper join columns are then removed. With no
    located suspects the sheet is returned untouched.
    """
    if len(sus_located) == 0:
        return target_sheet

    joined = target_sheet.merge(sus_located, how='left', on='Case_ID')
    joined['Suspects_Identified_and_Located'] = joined['located'].fillna('')
    return joined.drop(columns=['located', 'count'])


def calc_vics_willing_scores(sus, vics_willing):
    """Add a ``V_Mult`` score for the number of victims willing to testify.

    Args:
        sus: Suspect DataFrame with a ``Case_ID`` column.
        vics_willing: Per-case summary with ``count`` and
            ``willing_to_testify``, joinable on ``Case_ID``.

    Returns:
        ``sus`` with a float ``V_Mult`` column: the per-case victim count
        mapped through the module-level ``v_mult`` lookup (0 where the
        lookup has no entry), or 0 for every row when nobody is willing.
    """
    if len(vics_willing) > 0:
        # Join onto the frame that was passed in; this previously merged the
        # global `suspects.active` and silently ignored the argument.
        sus = pd.merge(sus, vics_willing, how='left', on='Case_ID')
        counts = sus['count'].fillna(0).astype(int)
        sus['V_Mult'] = counts.map(v_mult).fillna(0).astype('float')
        sus = sus.drop(columns=['willing_to_testify', 'count'])
    else:
        sus['V_Mult'] = 0.0
    return sus


def calc_arrest_scores(sus, soc_df, pol):
    """Add the bio-known, others-arrested and police-willing indicators.

    Args:
        sus: Suspect DataFrame with ``Case_ID`` and ``Case_Status``; gains a
            ``Bio_Known`` column in place.
        soc_df: Frame passed to ``get_total_arrests`` for per-case arrests.
        pol: Police DataFrame with ``Case_ID`` and ``Case_Status``; gains a
            ``Willing_to_Arrest`` column in place.

    Returns:
        The suspect frame joined with ``Others_Arrested`` (per-case arrest
        total, 0 when none) and ``Willing_to_Arrest``.
    """
    # 1 when the suspect's identity and location are confirmed. The flag was
    # previously inverted (0 when confirmed), which lowered the solvability
    # of exactly the suspects it was meant to boost.
    confirmed = sus['Case_Status'].eq(
        'Step Complete: Identity and Location Confirmed')
    sus['Bio_Known'] = np.where(confirmed, 1, 0)

    arrests = get_total_arrests(soc_df)
    sus = pd.merge(sus, arrests, how='left', on='Case_ID')
    sus['Total_Arrests'] = sus['Total_Arrests'].fillna(0).astype(int)
    sus = sus.rename(columns={'Total_Arrests': 'Others_Arrested'})

    # Police are treated as willing once their status shows a completed step.
    pol['Willing_to_Arrest'] = np.where(
        pol.Case_Status.str.contains("Step Complete", na=False), 1, 0)
    sus = pd.merge(sus, pol[['Case_ID', 'Willing_to_Arrest']], how='left',
                   on='Case_ID')
    return sus


def weight_pv_believes(sus, soc_df, PV_Believes):
    """Add a ``pv_believes`` score from the potential victim's stated belief.

    The first belief flag that is True, checked in the order 'definitely
    trafficked many', 'trafficked some', 'suspect trafficker', selects its
    weight from ``PV_Believes``; rows with no flag set score 0.

    Args:
        sus: Suspect DataFrame with a ``Case_ID`` column.
        soc_df: Frame with ``cif_number`` and the three ``pv_believes_*``
            boolean columns.
        PV_Believes: Mapping from belief column name to weight.

    Returns:
        ``sus`` left-joined with a float ``pv_believes`` column (NaN for
        cases absent from ``soc_df``).
    """
    belief_cols = ['pv_believes_definitely_trafficked_many',
                   'pv_believes_trafficked_some',
                   'pv_believes_suspect_trafficker']
    pvb = soc_df[['cif_number'] + belief_cols].copy()

    # Apply the weakest belief first so stronger ones overwrite it.
    score = 0
    for col in reversed(belief_cols):
        score = np.where(pvb[col].eq(True), PV_Believes[col], score)
    pvb['pv_believes'] = score

    # Drop the trailing victim marker, then strip dots. `.str.replace` is
    # required here: `Series.replace('.', '')` only swaps values that are
    # exactly '.', so dots were never removed and IDs failed to match.
    pvb['Case_ID'] = pvb['cif_number'].str[:-1].str.replace(
        '.', '', regex=False)
    pvb = pvb[['Case_ID', 'pv_believes']]
    pvb = pvb.assign(pv_believes=pvb['pv_believes'].astype(float))
    pvb = pvb.drop_duplicates(subset='Case_ID')
    return pd.merge(sus, pvb, how='left', on='Case_ID')


def get_exp_score(sus, soc_df, Exploitation_Type):
    """Add an ``exp`` score summing the weights of reported exploitation.

    Args:
        sus: Suspect DataFrame with a ``Case_ID`` column.
        soc_df: Frame with ``cif_number`` and boolean columns whose names
            contain 'exploitation'.
        Exploitation_Type: Mapping from exploitation column name to weight.
            Columns without an entry contribute nothing.

    Returns:
        ``sus`` left-joined with a float ``exp`` column (NaN for cases
        absent from ``soc_df``).
    """
    exp_cols = [col for col in soc_df.columns if 'exploitation' in col]
    exp_df = soc_df[exp_cols + ['cif_number']].copy()
    exp_df['exp'] = 0
    for col in exp_cols:
        # Skip unweighted columns explicitly instead of hiding every error
        # behind a bare `except: pass`.
        if col not in Exploitation_Type:
            continue
        exp_df['exp'] = np.where(exp_df[col].eq(True),
                                 Exploitation_Type[col] + exp_df['exp'],
                                 exp_df['exp'])

    # `.str.replace` strips dots inside the ID; `Series.replace('.', '')`
    # only swaps values that are exactly '.', so it never did.
    exp_df['Case_ID'] = exp_df['cif_number'].str[:-1].str.replace(
        '.', '', regex=False)
    exp_df = exp_df[['Case_ID', 'exp']]
    exp_df = exp_df.assign(exp=exp_df['exp'].astype(float))
    exp_df = exp_df.drop_duplicates(subset='Case_ID')
    return pd.merge(sus, exp_df, how='left', on='Case_ID')


def calc_recency_scores(sus, soc_df, weights):
    """Add a ``Recency_Score`` that is higher the more recent a case is.

    The score is ``1 - Discount_Coef * Days_Old ** Discount_Exp``, floored
    at 0 (and 0 when the age cannot be computed).

    Args:
        sus: Suspect DataFrame with ``Case_ID`` and ``Suspect_ID``.
        soc_df: Frame with ``cif_number`` and ``interview_date``.
        weights: Mapping with ``Discount_Coef`` and ``Discount_Exp``.

    Returns:
        ``sus`` joined with ``Days_Old`` and ``Recency_Score``, reduced to
        one row per ``Suspect_ID``.
    """
    today = pd.Timestamp(date.today())
    cif_dates = soc_df[['cif_number', 'interview_date']].copy()
    # Normalise to datetime64 so the subtraction behaves the same whether
    # the column holds `date` objects, timestamps or date strings.
    interviewed = pd.to_datetime(cif_dates['interview_date'])
    cif_dates['Days_Old'] = (today - interviewed) / np.timedelta64(1, 'D')

    # `.str.replace` strips dots inside the ID; `Series.replace('.', '')`
    # only swaps values that are exactly '.', so it never did.
    cif_dates['Case_ID'] = cif_dates['cif_number'].str[:-1].str.replace(
        '.', '', regex=False)
    sus = pd.merge(
        sus, cif_dates[['Case_ID', 'Days_Old']], how='left', on='Case_ID')

    coef = weights['Discount_Coef']
    exp = weights['Discount_Exp']
    decay = 1 - coef * sus['Days_Old'] ** exp
    sus['Recency_Score'] = np.where(decay > 0, decay, 0)
    return sus.drop_duplicates(subset='Suspect_ID')


def calc_network_scores(sus_with_links, sus):
    """
    Score each suspect's links to suspects in other cases and join the scores
    onto 'sus'.

        1st degree case link = two suspects have a direct connection
        2nd degree case link = two suspects are connected by one or more
                               mutual contacts

    Both link counts are divided by log10(first_degree_links + 9). Adding
    nine keeps the divisor at 1 or above whenever a suspect has at least one
    connection. The two score columns are also written onto
    'sus_with_links' itself.
    """
    damping = np.log10(sus_with_links['first_degree_links'] + 9)
    link_columns = (('1d_case_score', 'first_degree_case_links'),
                    ('2d_case_score', 'second_degree_case_links'))
    for score_col, count_col in link_columns:
        sus_with_links[score_col] = sus_with_links[count_col] / damping

    scores = sus_with_links[
        ['suspect_case_id', '1d_case_score', '2d_case_score']]
    joined = sus.merge(scores, how='left',
                       left_on='Suspect_ID', right_on='suspect_case_id')
    return joined.drop(columns=['suspect_case_id'])


def get_network_weights(Parameters):
    """Extract the network-analysis settings from the Parameters sheet.

    Takes rows 5-8 of columns 4-5 (zero-based positions) and labels the two
    columns 'key' and 'value'.
    """
    net_weights = Parameters.iloc[5:9, 4:6].copy()
    net_weights.columns = ['key', 'value']
    return net_weights


def weight_network_scores(sus, net_weights):
    """Convert the raw link scores into a capped ``net_weight`` bonus.

    ``net_weight`` is ``1d_case_score * one_link`` plus
    ``2d_case_score * one_link * second_degree_weight``, rounded to two
    decimals and capped at the configured maximum.

    Args:
        sus: Suspect DataFrame with ``1d_case_score`` and ``2d_case_score``;
            both columns are dropped from it in place.
        net_weights: Frame with ``key``/``value`` columns providing
            '1 Link Em Added', 'Max Em Added' and '2nd Degree Weight'.

    Returns:
        ``sus`` with the ``net_weight`` column added.
    """
    settings = net_weights.set_index('key')['value']
    one_link_add = float(settings['1 Link Em Added'])
    max_add = float(settings['Max Em Added'])
    second_d_weight = float(settings['2nd Degree Weight'])

    raw = sus['1d_case_score'] * one_link_add + (
        sus['2d_case_score'] * one_link_add) * second_d_weight
    # Keep the rounded values; the rounding result used to be discarded.
    raw = raw.round(2)
    sus['net_weight'] = np.where(raw > max_add, max_add, raw)
    sus.drop(columns=['1d_case_score', '2d_case_score'], inplace=True)
    return sus


def check_update_links(sus_with_links, sus, Parameters):
    """Apply network scoring to 'sus' when the Parameters sheet enables it.

    Network analysis counts as enabled when the first row of the network
    settings has the value 'On'; otherwise 'sus' is returned unchanged.
    """
    net_weights = get_network_weights(Parameters)
    analysis_enabled = net_weights.iloc[0, 1] == 'On'
    if not analysis_enabled:
        return sus
    scored = calc_network_scores(sus_with_links, sus)
    return weight_network_scores(scored, net_weights)


def get_eminence_score(sus):
    """Add ``Em2``: the sheet's eminence score plus any network bonus.

    A missing or empty ``Eminence`` value counts as 1. When a ``net_weight``
    column is present it is added (a missing bonus counts as 0), the total
    is capped at 9 and the ``net_weight`` column is dropped in place.

    Args:
        sus: Suspect DataFrame with an ``Eminence`` column and optionally a
            ``net_weight`` column.

    Returns:
        ``sus`` with a float ``Em2`` column.
    """
    eminence = sus['Eminence']
    # Compare string lengths on a string copy so a purely numeric column
    # does not break the `.str` accessor.
    blank = eminence.isna() | (eminence.astype(str).str.len() < 1)
    sus['Em2'] = eminence.mask(blank, 1).astype(float)

    if 'net_weight' in sus:
        # Treat a missing bonus as 0. Adding NaN used to wipe the base score,
        # which the trailing fillna then turned into 0.
        boosted = sus['Em2'] + sus['net_weight'].fillna(0)
        sus['Em2'] = np.where(boosted > 9, 9, boosted)
        sus['Em2'] = sus['Em2'].fillna(0)
        sus.drop(columns=['net_weight'], inplace=True)
    return sus


def get_sus_located_in(sus, location):
    """Flag suspects whose address mentions a particular location.

    Args:
        sus: Suspect DataFrame with an ``Address`` column.
        location: Pattern passed to ``str.contains`` against ``Address``.

    Returns:
        ``sus`` with a ``loc`` column: 1 where the address matches, else 0.
    """
    # na=False: a missing address is not a match. Without it the NaN result
    # is truthy in np.where and the suspect was flagged as located there.
    matches = sus['Address'].str.contains(location, na=False)
    sus['loc'] = np.where(matches, 1, 0)
    return sus


def get_new_soc_score(sus, soc_df):
    """Attach the newly predicted Strength of Case score to each suspect.

    'sus' is left-joined to 'soc_df' (``Suspect_ID`` against ``suspect_id``)
    and ``Strength_of_Case`` is set to the joined ``soc`` value rounded to
    three decimals. The joined ``suspect_id`` and ``soc`` columns are kept.
    """
    predictions = soc_df[['suspect_id', 'soc']]
    joined = sus.merge(predictions, how='left',
                       left_on='Suspect_ID', right_on='suspect_id')
    joined['Strength_of_Case'] = joined['soc'].round(decimals=3)
    return joined


def calculate_weights(Parameters):
    """Read the current weights from the Parameters sheet into a dict.

    Keys come from column 0 (rows 0-15) followed by column 4 (rows 0-2);
    values come from column 1 (rows 0-15, empty strings counted as 0)
    followed by column 5 (rows 0-2). All values are converted to float.
    """
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    values = pd.concat([
        Parameters.iloc[0:16, 1].replace('', 0),
        Parameters.iloc[0:3, 5],
    ]).astype(float)
    keys = pd.concat([
        Parameters.iloc[0:16, 0],
        Parameters.iloc[0:3, 4],
    ])
    return dict(zip(keys, values))


def calc_solvability(sus, weights):
    """Add the weighted ``Solvability`` score to the active suspects.

    Each component column is multiplied by its weight (a missing component
    contributes 0) and the total is divided by the sum of all values in
    ``weights``.

    Args:
        sus: Suspect DataFrame with ``V_Mult``, ``Bio_Known``,
            ``Others_Arrested``, ``Willing_to_Arrest``, ``Recency_Score``,
            ``pv_believes`` and ``exp`` columns.
        weights: Mapping containing the solvability weight keys used below.

    Returns:
        ``sus`` with the ``Solvability`` column added.
    """
    # (component column, weight key). Every weight is read from `weights`;
    # the Bio_Known term used to read the global Solvability_Weights and so
    # ignored the mapping the caller passed in.
    components = (
        ('V_Mult', 'Victim_Willing_to_Testify'),
        ('Bio_Known', 'Bio_and_Location_of_Suspect'),
        ('Others_Arrested', 'Other_Suspect(s)_Arrested'),
        ('Willing_to_Arrest', 'Police_Willing_to_Arrest'),
        ('Recency_Score', 'Recency_of_Case'),
        ('pv_believes', 'PV_Believes'),
        ('exp', 'Exploitation_Reported'),
    )
    weighted_total = sum(
        (sus[column] * weights[key]).fillna(0) for column, key in components)
    sus['Solvability'] = weighted_total / sum(weights.values())
    return sus


def calc_priority(sus, weights, Suspects):
    """Add the weighted ``Priority`` score and order suspects by it.

    Priority is ``Solvability``, ``Strength_of_Case`` and one tenth of
    ``Em2``, each multiplied by its weight, rounded to three decimals;
    a score that cannot be computed becomes 0.

    Args:
        sus: Suspect DataFrame with ``Suspect_ID``, ``Solvability``,
            ``Strength_of_Case`` and ``Em2``; gains ``Priority`` in place.
        weights: Mapping with 'Solvability', 'Strength_of_Case' and
            'Eminence' weights.
        Suspects: Frame whose column count decides how many leading columns
            of ``sus`` are kept.

    Returns:
        A new frame sorted by descending priority, cut to the first
        ``len(Suspects.columns)`` columns, with missing values replaced by
        '' and one row per ``Suspect_ID``.
    """
    weighted = (sus['Solvability'] * weights['Solvability']
                + sus['Strength_of_Case'] * weights['Strength_of_Case']
                + sus['Em2'] * 0.1 * weights['Eminence'])
    # Store the float cast; its result used to be discarded.
    sus['Priority'] = weighted.round(decimals=3).fillna(0).astype(float)

    ranked = sus.sort_values('Priority', ascending=False)
    ranked = ranked.iloc[:, 0:len(Suspects.columns)].fillna('')
    return ranked.drop_duplicates(subset='Suspect_ID')


def truncate_rows(df, nrow=200):
    """Return at most the first ``nrow`` rows of ``df`` (all columns)."""
    return df.iloc[:nrow]


def calc_all_sus_scores(sus, vics_willing, pol, db_cif, soc_df, 
                        Suspects):
    """Run every suspect-sheet scoring step and return the ranked sheet.

    Steps, in order: victim-willing multiplier, arrest indicators, recency,
    PV belief, exploitation, strength of case, eminence, solvability,
    priority, then truncation to the default row limit. Weights come from
    the module-level parameter dicts. ``db_cif`` is accepted but not used by
    this function.
    """
    scored = calc_vics_willing_scores(sus, vics_willing)
    scored = calc_arrest_scores(scored, soc_df, pol)

    # Combined lookup used by the recency step; later dicts win on a clash.
    weights = {}
    for group in (Solvability_Weights, Recency_Vars, Exploitation_Type,
                  PV_Believes):
        weights.update(group)

    scored = calc_recency_scores(scored, soc_df, weights)
    scored = weight_pv_believes(scored, soc_df, PV_Believes)
    scored = get_exp_score(scored, soc_df, Exploitation_Type)
    scored = get_new_soc_score(scored, soc_df)
    scored = get_eminence_score(scored)
    scored = calc_solvability(scored, Solvability_Weights)
    scored = calc_priority(scored, Priority_Weights, Suspects)
    return truncate_rows(scored)


def add_priority_to_others(sus, other_entity_group, id_type, entity_gsheet, 
                           uid):
    """Carry each suspect's priority and narrative over to another sheet.

    The other sheet is outer-joined to the suspects on ``id_type``, rows
    whose priority is '' are dropped, a missing priority becomes 0, the
    sheet's narrative is replaced by the suspect's, duplicates of ``uid``
    are removed and rows are ordered by descending priority. The result
    keeps only the first ``len(entity_gsheet.columns)`` columns, with
    missing values replaced by ''.
    """
    suspect_scores = sus[[id_type, 'Priority', 'Narrative']]
    merged = other_entity_group.merge(suspect_scores, how='outer', on=id_type)

    merged = merged[merged['Priority'] != ''].copy()
    merged['Priority'] = merged['Priority'].fillna(0).astype(float)

    # Both frames carry 'Narrative', so the join suffixes them _x / _y; the
    # suspect-side text overwrites the sheet-side column, which then takes
    # the plain name back.
    merged['Narrative_x'] = merged['Narrative_y']
    merged = merged.rename(columns={'Narrative_x': 'Narrative'})

    merged = merged.drop_duplicates(subset=uid)
    merged = merged.sort_values('Priority', ascending=False)
    keep = len(entity_gsheet.columns)
    return merged.iloc[:, 0:keep].fillna('')


def get_total_arrests(soc_df):
    """Total the ``Arrest`` flags per case.

    ``Case_ID`` is derived from ``suspect_id`` by removing its trailing
    '.PB<number>' part and any remaining dots.

    Args:
        soc_df: Frame with ``suspect_id`` and ``Arrest`` columns; it is not
            modified.

    Returns:
        DataFrame with ``Case_ID`` and ``Total_Arrests`` columns.
    """
    # Only the two columns used below are copied, not the whole frame.
    arrests = soc_df[['suspect_id', 'Arrest']].copy()
    # Strip the whole '.PB<n>' suffix. Slicing off a fixed three characters
    # left a stray 'P' on the Case_ID once <n> reached two digits.
    arrests['Case_ID'] = (
        arrests['suspect_id']
        .str.replace(r'\.PB\d+$', '', regex=True)
        .str.replace('.', '', regex=False))
    arrests = pd.pivot_table(arrests, values='Arrest', index='Case_ID',
                             aggfunc='sum').reset_index()
    arrests.columns = ['Case_ID', 'Total_Arrests']
    return arrests


def update_active_cases(active_suspects, active_police):
    """Build the per-case summary with its status and next action.

    The furthest step reached decides ``Case_Status``, checked in this
    order: police status contains 'Complete'; suspect status contains
    'Complete'; the willing-victims list contains a comma (two or more);
    the willing-victims list is non-empty. ``Next_Action_Priority`` follows
    the same ladder without the two-victim rung. One row is kept per
    ``Case_ID``.
    """
    active_cases = active_suspects[['Case_ID', 'Case_Name', 'Priority', 
                                    'IRF_Case_Notes', 'Narrative', 
                                    'Case_Status']].copy()
    case_ids = active_cases.Case_ID
    willing = active_suspects.Victims_Willing_to_Testify

    # Membership of each case in the progressively weaker milestones.
    police_done = case_ids.isin(active_police.loc[
        active_police.Case_Status.str.contains('Complete'), 'Case_ID'])
    suspect_done = case_ids.isin(active_suspects.loc[
        active_suspects.Case_Status.str.contains('Complete'), 'Case_ID'])
    several_willing = case_ids.isin(
        active_suspects.loc[willing.str.contains(","), 'Case_ID'])
    any_willing = case_ids.isin(
        active_suspects.loc[willing != '', 'Case_ID'])

    # Layer from the weakest milestone up so stronger ones overwrite.
    status = np.where(
        any_willing, 'First Step Complete: One PV willing to testify', '')
    status = np.where(
        several_willing,
        'First Step Complete: Two or more PVs willing to testify', status)
    status = np.where(
        suspect_done, "Second Step Complete: Suspect Located", status)
    status = np.where(
        police_done,
        "Third Step Complete - Police are willing to arrest suspect.",
        status)
    active_cases['Case_Status'] = status

    action = np.where(any_willing, 'Locate Suspect', 'Contact Victim')
    action = np.where(suspect_done, "Ask Police to Arrest", action)
    action = np.where(police_done, "Ensure Arrest is Made", action)
    active_cases['Next_Action_Priority'] = action

    return active_cases.drop_duplicates('Case_ID')

## Run Case Dispatcher

In [None]:
#@title
# Preprocess the CIF data (pre_proc is defined elsewhere in the notebook).
db_cif = pre_proc(db_cif)

# 1/0 flag: 1 where the `arrested` field is 'Yes'.
db_cif['Arrest'] = np.where(db_cif.arrested=='Yes', 1,0)

# Build the frame used for modelling and scoring (en_features is defined
# elsewhere; presumably feature engineering).
soc_df = en_features(db_cif)

# Load the previously saved model and its column list.
# NOTE(review): pickle.load can execute arbitrary code; only load files from
# a trusted location.
with open('Case Dispatcher 3.0/u21_rf_model.sav', 'rb') as f:
  best_model = pickle.load(f)

x_cols = pd.read_table('Case Dispatcher 3.0/X_cols.txt', header=None)

# NOTE(review): best_model and x_cols loaded above are rebound by the call
# below; confirm whether the load is still needed.
best_model, x_cols, X_Validation = check_grid_search_cv(soc_df, gscv, 
                                                        cutoff_days)
# Persist the model and its feature columns for later runs.
save_results(best_model, X_Validation)

In [None]:
#@title
# Score the cases with the current model (make_new_predictions is defined
# elsewhere in the notebook).
soc_df = make_new_predictions(soc_df, best_model, x_cols)

# One EntityGroup per sheet pair (active / closed); the first argument is
# the unique-ID column for that entity.
new_victims = db_vics
victims = EntityGroup('Victim_ID',
                              new_victims,
                              'Victims',
                              'Closed_Vic',
                              dfs)
new_suspects = db_sus
suspects = EntityGroup('Suspect_ID',
                               new_suspects,
                               'Suspects',
                               'Closed_Sus',
                               dfs)

# Generate the unique IDs and sheet column names for the new records.
victims.new = set_vic_id(victims.new)

suspects.new = set_sus_id(suspects.new, db_cif)

EntityGroup.set_case_id()

# The police sheet starts from a copy of the new suspect rows, with the
# suspect's name moved to 'Suspect_Name'.
new_police = deepcopy(x=suspects.new)
new_police.rename(columns={'Name': 'Suspect_Name'}, inplace=True)
police = EntityGroup('Suspect_ID',
                     new_police,
                     'Police',
                     'Closed_Pol',
                     dfs)

# Class-level steps applied across all entity groups; the ones not shown in
# this part of the file presumably merge new rows into the existing sheets
# and move closed cases out of the active sheets.
EntityGroup.combine_sheets()

EntityGroup.add_irf_notes(irf_case_notes)

EntityGroup.move_closed(soc_df)

EntityGroup.move_other_closed(suspects, police, victims)

# Cross-reference the sheets: willing victims onto the police and suspect
# sheets, located suspects onto the victim sheet.
vics_willing = get_vics_willing_to_testify(victims.active)
police.active = add_vic_names(police.active, vics_willing)
suspects.active = add_vic_names(suspects.active, vics_willing)
sus_located = get_sus_located(suspects.active)
victims.active = add_sus_located(victims.active, sus_located)

# Compute all suspect scores and rank the suspect sheet by priority.
suspects.active = calc_all_sus_scores(suspects.active,
                          vics_willing,
                          police.active,
                          db_cif,
                          soc_df,
                          dfs['Suspects'])

# Copy the suspect priorities to the victim sheet (matched on Case_ID) ...
victims.active = add_priority_to_others(suspects.active,
                                        victims.active,
                                        'Case_ID',
                                        dfs['Victims'],
                                        'Victim_ID')

# ... and to the police sheet (matched on Suspect_ID).
police.active = add_priority_to_others(suspects.active,
                                              police.active,
                                              'Suspect_ID',
                                              dfs['Police'],
                                              'Suspect_ID')

In [None]:
# Build the per-case summary first, so it picks up the Case_Name values
# before they are replaced with spreadsheet lookup formulas.
active_cases = update_active_cases(suspects.active, police.active)
EntityGroup.add_case_name_formula()

In [None]:
# Update the Case Dispatcher Google Sheet: every entity sheet's active and
# closed worksheets, plus the 'Cases' worksheet from active_cases.
EntityGroup.update_gsheets(gs_name, active_cases)