In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import numpy as np
from sklearn import metrics

In [2]:
def prepare_data(csv_link):
    """Prepare the Stanford Open Policing Data to be used in the classification model
    Parameters
    ----------
    csv_link : str
        The link of the data you want to do analysis on.
    race_included : int
        If you want the model to include race or not, 1 
        indicating race should be included, else race will not be included."""
    
    def convert_search(x):
        if x:
            return 1
        else:
            return 0
        
    def get_month(x):
        return int(x[5:7])
    
    def time_gaps(x):
        if x >= '05:00:00' and x < '12:00:00':
            return 'Morning'
        elif x >= '12:00:00' and x < '17:00:00':
            return 'Afternoon'
        elif x >= '17:00:00' and x < '21:00:00':
            return 'Evening'
        elif x >= '21:00:00' and x < '05:00:00':
            return 'Night'
        
    def change_contraband(row):
        if row['is_searched'] == 1:
            return row['contraband_found']
        else:
            return False
        
    def failed_search(row):
        if row['is_searched'] == 1 and row['contraband_found'] == False:
            return 1
        else:
            return 0
        
    def improved_target(row):
        if row['is_searched'] == 1 and row['failed_search'] == 1:
            return 0
        elif row['is_searched'] == 0:
            return 0
        else:
            return 1
        
    stanford_df = pd.read_csv(csv_link)
    stanford_df = stanford_df.iloc[: , 1:]
    stanford_df = stanford_df[stanford_df['date'].notna()]
    stanford_df = stanford_df[stanford_df['time'].notna()]
    
    use_columns = ['date', 'time', 'service_area', 'subject_age', 'subject_race', 'subject_sex',
                           'type', 'reason_for_stop', 'month', 'search_conducted', 'contraband_found']
    stanford_df = stanford_df[stanford_df['subject_race'].notna()]
    race_options = ['white', 'black']
    stanford_df = stanford_df[stanford_df['subject_race'].isin(race_options)]
        
    # get the month in a column
    stanford_df['month'] = stanford_df['date'].apply(get_month)
    
    # keep all columns that will help in prediction model
    stanford_df = stanford_df[use_columns]
    # get day of week name in column
    stanford_df['date'] = pd.to_datetime(stanford_df['date'])
    stanford_df['day of week'] = stanford_df['date'].dt.day_name()
    
    stanford_df = stanford_df[stanford_df['service_area'] != 'Unknown']
    stanford_df['time of day'] = stanford_df['time'].apply(time_gaps)
    stanford_df['is_searched'] = stanford_df['search_conducted'].apply(convert_search)
    stanford_df['contraband_found'] = stanford_df.apply(change_contraband, axis=1)
    stanford_df['failed_search'] = stanford_df.apply(failed_search, axis=1)
    stanford_df['improved_target'] = stanford_df.apply(improved_target, axis=1)
    
    stanford_df = stanford_df.dropna().reset_index()
    stanford_df = stanford_df.drop(['index','time', 'search_conducted', 'date', 'contraband_found', 'failed_search', 'is_searched'], axis=1)
    return stanford_df

In [3]:
stanford_df = prepare_data('https://stacks.stanford.edu/file/druid:yg821jf8611/yg821jf8611_ca_san_diego_2020_04_01.csv.zip')

In [4]:
stanford_df.head()

Unnamed: 0,service_area,subject_age,subject_race,subject_sex,type,reason_for_stop,month,day of week,time of day,improved_target
0,320,42.0,white,male,vehicular,Moving Violation,1,Wednesday,Morning,0
1,610,23.0,white,male,vehicular,Moving Violation,1,Wednesday,Morning,0
2,120,32.0,white,male,vehicular,Moving Violation,1,Wednesday,Morning,0
3,120,36.0,white,male,vehicular,Moving Violation,1,Wednesday,Morning,0
4,120,16.0,white,male,vehicular,Moving Violation,1,Wednesday,Afternoon,0


In [5]:
def eval_model(second_run):
    # analysis between runs
    black_drivers = second_run[second_run['subject_race_black'] == 1.0]
    white_drivers = second_run[second_run['subject_race_white'] == 1.0]

    black_precision = metrics.precision_score(black_drivers['improved_target'].values, black_drivers['preds'].values)
    black_recall = metrics.recall_score(black_drivers['improved_target'].values, black_drivers['preds'].values)

    white_precision = metrics.precision_score(white_drivers['improved_target'].values, white_drivers['preds'].values)
    white_recall = metrics.recall_score(white_drivers['improved_target'].values, white_drivers['preds'].values)

    print('black precision: ' + str(black_precision) + '   black recall: ' + str(black_recall)) 
    print('white precision: ' + str(white_precision) + '   white recall: ' + str(white_recall))
    print(metrics.accuracy_score(second_run['improved_target'], second_run['preds']))

In [6]:
def make_model(data, race = 1, freq = 3):
    """Train and Make the Classification Model
    Parameters
    ----------
    data : Pandas DataFrame or str
        It can either be data that has already been cleaned to be used for 
        the model or the link of the data you want to do analysis on.
    race : int
        If you want the model to include race or not, 1 
        indicating race should be included, else race will not be included.
    freq : int
        How many months you want the model to train at a time. Can either 
        be 1,2,3,4,6."""
    
    if freq not in [1,2,3,4,6]:
        raise ValueError(f'Value "{freq}" is not allowed choose from 1, 2, 3, 4, 6.')
        
    # checks if given data is a link or dataframe already
    if type(data) == str:
        stanford_df = prepare_data(df, race)
    else:
        stanford_df = data
    
    save_race = stanford_df['subject_race']
    
    # checks if race is in dataframe to avoid any errors
    if race == 1:
        cat_cols = ['subject_race','subject_sex', 'type','reason_for_stop', 'day of week', 'time of day']
    else:
        cat_cols = ['subject_sex', 'type','reason_for_stop', 'day of week', 'time of day']
        stanford_df = stanford_df.drop(['subject_race'], axis=1)
    
    months = [1,2,3,4,5,6,7,8,9,10,11,12]
    
    encoder = OneHotEncoder(sparse=False)

    # Encode Categorical Data
    df_encoded = pd.DataFrame(encoder.fit_transform(stanford_df[cat_cols]))
    df_encoded.columns = encoder.get_feature_names(cat_cols)

    # Replace Categotical Data with Encoded Data
    df_stanford_ready = stanford_df.drop(cat_cols ,axis=1)
    df_stanford_ready = pd.concat([df_encoded, df_stanford_ready], axis=1)
    
    first_months = months[:freq]
    second_months = months[freq:freq+freq]
    
#     reg_rf = RandomForestClassifier()
    
    reg_rf = LinearDiscriminantAnalysis()
    
    for i in range(int(12/freq)-1):
        
        # df containing only the first freq months
        first_run = df_stanford_ready[df_stanford_ready['month'].isin(first_months)].reset_index().drop(['index'], axis=1)
        target = first_run['improved_target']
        first_run = first_run.drop(['improved_target', 'month'], axis = 1).reset_index().drop(['index'], axis=1)
        
        reg_rf.fit(first_run, target)
        
        # df containing the next freq months after the ones above
        second_run = df_stanford_ready[df_stanford_ready['month'].isin(second_months)].reset_index().drop(['index'], axis=1)
        second_run_target = second_run['improved_target']
        second_run = second_run.drop(['month', 'improved_target'], axis=1)

        # predicting
        y_pred = reg_rf.predict(second_run)
        second_run['improved_target'] = second_run_target
        second_run['preds'] = y_pred
        
        # evals the model
        if race == 1:
            eval_model(second_run)
            print(first_months, second_months)
        
        first_months = second_months
        second_months = list(np.array(second_months)+freq)
    return df_stanford_ready

In [7]:
df_stanford_ready = make_model(stanford_df, 1, 3)

black precision: 0.09174311926605505   black recall: 0.20833333333333334
white precision: 0.0708955223880597   white recall: 0.2602739726027397
0.9879458659799463
[1, 2, 3] [4, 5, 6]
black precision: 0.05555555555555555   black recall: 0.10204081632653061
white precision: 0.036885245901639344   white recall: 0.18
0.9871363232117901
[4, 5, 6] [7, 8, 9]
black precision: 0.041237113402061855   black recall: 0.11428571428571428
white precision: 0.023952095808383235   white recall: 0.0784313725490196
0.9888888888888889
[7, 8, 9] [10, 11, 12]
