# Calculating Cosine similarity of each use of force policy
- Measures how similar each city's use-of-force policy is to an "ideal" reference policy

In [1]:
import pandas as pd
import numpy as np

In [7]:
# Load the per-city dimension scores, forcing every scoring column to an integer dtype.
# The builtin `int` is used instead of `np.int`: `np.int` was only an alias for `int`,
# was deprecated in NumPy 1.20 and removed in NumPy 1.24.
# NOTE(review): the path is an absolute, machine-specific location — consider a configurable data directory.
df = pd.read_csv('C:\\Users\\dakersey\\Documents\\NLS\\NLS_Final\\Model_Output\\dimenstion_table.csv', 
                 dtype = {'chokehold_ban':int,'descalate':int,'reporting':int,
                          'moving_vehicle':int,'intervention':int,'tech':int,'violence':int})

### Define "dummy" city 
- The values for each dimension for the dummy city are the max values from the entire column

In [8]:
# Build a one-row "Dummy" city whose value in every dimension is that column's maximum.
dimension_cols = ['chokehold_ban', 'descalate', 'reporting', 'intervention',
                  'tech', 'violence', 'moving_vehicle']
dummy = pd.DataFrame({'City': ['Dummy'],
                      **{col: df[col].max() for col in dimension_cols}})

# Append the dummy row to the real cities and key the combined table by city name.
df = pd.concat([df, dummy], ignore_index=True).set_index('City')

### Normalize Values column-wise
- Rescale each column to the range [0, 1] with min-max scaling: (value - column min) / (column max - column min)

In [9]:
import pandas as pd
from sklearn import preprocessing

# Min-max scale each column of the table, then wrap the scaled array back into a
# DataFrame that keeps the original city index and column labels.
min_max_scaler = preprocessing.MinMaxScaler()
x = df.values  # underlying NumPy array of the table
x_scaled = min_max_scaler.fit(x).transform(x)
df_scaled = pd.DataFrame(data=x_scaled, index=df.index, columns=df.columns)


### Define Cosine Similarity function
- Cosine similarity is the cosine of the angle between two n-dimensional vectors in an n-dimensional space.
- It is the dot product of the two vectors divided by the product of the two vectors' lengths (or magnitudes)

In [10]:
def cos_sim(a, b):
    """Return the cosine similarity of two vectors.

    The similarity is the dot product of ``a`` and ``b`` divided by the
    product of their Euclidean norms.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (lists, NumPy arrays, pandas Series, ...).
        Singleton dimensions are squeezed away before the computation.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero magnitude the angle
        is undefined; 0.0 is returned in that case instead of dividing by
        zero and propagating NaN.
    """
    a = np.squeeze(np.asarray(a))
    b = np.squeeze(np.asarray(b))
    magnitude = np.linalg.norm(a) * np.linalg.norm(b)
    # Guard the 0/0 case: a zero-length vector has no direction to compare.
    if magnitude == 0:
        return np.float64(0.0)
    return np.dot(a, b) / magnitude

### Calculate cosine similarity for each city policy against dummy city policy
- Our assumption is that the "dummy" policy represents the "ideal" policy 
- We calculated the cosine similarity of each city policy to the "dummy" policy to determine how similar each policy is to the "ideal" policy

In [11]:
# Score every row (the "Dummy" row included) against the "Dummy" reference row.
ideal = df_scaled.loc['Dummy']
cos = [cos_sim(df_scaled.loc[city], ideal) for city in df_scaled.index]

# Store the similarity scores, then express each one as a percentile rank.
df_scaled['Cos_Sim'] = cos
df_scaled['Pctile'] = df_scaled['Cos_Sim'].rank(pct=True)

### Top 10 cities for cosine similarity to the "ideal" policy (the first row is the "Dummy" reference itself)

In [12]:
# Highest percentile first; show the first 11 rows of the ranking.
df_scaled.sort_values(by='Pctile', ascending=False).iloc[:11]

Unnamed: 0_level_0,chokehold_ban,descalate,reporting,intervention,tech,violence,moving_vehicle,Cos_Sim,Pctile
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Dummy,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Austin,1.0,0.888889,1.0,0.9,1.0,1.0,1.0,0.998789,0.975
Anchorage,0.714286,1.0,1.0,1.0,1.0,1.0,1.0,0.994612,0.95
Arlington,0.428571,0.444444,0.357143,0.3,0.4,0.357143,0.5,0.988296,0.925
Cincinnati,0.571429,0.444444,0.642857,0.4,0.6,0.642857,0.5,0.986705,0.9
Dallas,0.285714,0.555556,0.5,0.4,0.6,0.642857,0.5,0.975186,0.875
Spokane,0.428571,1.0,1.0,0.7,1.0,1.0,1.0,0.972474,0.85
Glendale,0.428571,0.444444,0.428571,0.3,0.4,0.642857,0.25,0.962971,0.825
Philadelphia,0.428571,0.444444,0.642857,0.2,0.4,0.428571,0.5,0.962882,0.8
Plano,0.428571,0.222222,0.5,0.3,0.4,0.642857,0.5,0.957545,0.775


### Bottom 10 cities for cosine similarity to the "ideal" policy

In [13]:
# Same descending ranking; show its last 10 rows (the lowest percentiles).
df_scaled.sort_values(by='Pctile', ascending=False).iloc[-10:]

Unnamed: 0_level_0,chokehold_ban,descalate,reporting,intervention,tech,violence,moving_vehicle,Cos_Sim,Pctile
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Anaheim,0.142857,0.222222,0.5,0.2,0.0,0.428571,0.5,0.845934,0.25
Boston,0.142857,0.111111,0.285714,0.3,0.0,0.071429,0.25,0.841491,0.225
Sacramento,0.0,0.444444,0.571429,0.4,0.0,0.571429,0.75,0.824891,0.2
Atlanta,0.142857,0.222222,0.285714,0.3,0.0,0.214286,0.0,0.821499,0.175
Baltimore,0.857143,0.333333,0.142857,0.2,0.0,0.357143,0.5,0.79743,0.15
Columbus,0.0,0.111111,0.642857,0.2,0.2,0.357143,0.75,0.781571,0.125
LongBeach,0.285714,0.222222,0.357143,0.0,0.0,0.0,0.25,0.743805,0.1
Reno,0.0,0.0,0.142857,0.2,0.2,0.071429,0.5,0.706353,0.075
Chandler,0.285714,0.333333,0.0,0.1,0.0,0.5,0.0,0.684771,0.05
Irvine,0.0,0.333333,0.428571,0.5,0.0,0.071429,0.0,0.6796,0.025


In [14]:
# Output directory for the exported results. The trailing separator matters:
# the file name is appended to this string by plain concatenation.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
final_outpath = 'C:\\Users\\dakersey\\Documents\\NLS\\NLS_Final\\Model_Output\\'

In [15]:
# Export the scaled scores with their similarity and percentile columns.
# The city names live in the index, so the index must be written; with
# index=False the spreadsheet rows would have no city labels.
df_scaled.to_excel(final_outpath + 'Cos_Sim_Grid.xlsx', index=True)