In [None]:
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [None]:
# Load the Terry Stops CSV export into `df`, the raw frame used by the cells below.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell only
# runs where that exact folder exists. Consider a path relative to a configurable
# data directory.
file_path = "C:\\Users\\HP\\Desktop\\terry dtops data\\Terry_Stops_20240301.csv"
df = pd.read_csv(file_path)

# Bare last expression: the frame itself becomes the cell output.
df

In [None]:
# Preview the first rows of the raw frame (DataFrame.head() returns 5 rows by default).
df.head()

In [None]:
# Show the dtype pandas inferred for each column of the raw frame.
df.dtypes

In [None]:
# Boolean mask of null cells, summed per column to get the null count of each column.
null_mask = df.isnull()
missing_values = null_mask.sum()
missing_values

In [None]:
# Same first-rows preview of the raw frame as the earlier df.head() cell.
df.head()


In [None]:
# Print the column labels of the raw frame.
print(df.columns)

In [None]:
# Descriptive statistics from describe(), rounded to 3 decimal places for printing.
summary_stats = df.describe()
numerical_summary = summary_stats.round(3)

print(numerical_summary)



In [None]:
# Pick out the object-dtype columns first, then print the distinct values of each one.
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for column in object_columns:
    unique_values = df[column].unique()
    print(f"Unique values for {column}: {unique_values}")


Getting the relevant data for exploration

In [None]:

# Columns kept for the exploration: officer and subject race, stop outcome and flags,
# location (precinct, beat) and the initial/final call types.
relevant_columns = [
    'Officer Race',
    'Subject Perceived Race',
    'Stop Resolution',
    'Frisk Flag',
    'Arrest Flag',
    'Precinct',
    'Beat',
    'Initial Call Type',
    'Final Call Type',
]

# Store them in a new dataset. The explicit .copy() makes data2 an independent frame:
# a later cell assigns new values into its flag columns, and doing that on a plain
# column selection of `df` is the chained-assignment pattern pandas warns about
# (SettingWithCopyWarning).
data2 = df[relevant_columns].copy()

print(data2.head())

In [None]:
# Preview the first rows of the column subset as rich cell output.
data2.head()

In [None]:
# Show the dtype of each selected column.
data2.dtypes

Race composition graph

In [None]:
# Number of rows recorded for each officer race
race_counts = data2['Officer Race'].value_counts()

# Bar chart of those counts, drawn on an explicitly created axes.
# NOTE(review): recent seaborn releases are believed to warn when `palette` is passed
# without `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=race_counts.index, y=race_counts.values, palette='viridis', ax=ax)
ax.set_title('Race Composition of Officers')
ax.set_xlabel('Officer Race')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')  # slant labels so they do not overlap
fig.tight_layout()
plt.show()


In [None]:
# Count plot of subject perceived race, with vertical tick labels.
# NOTE(review): recent seaborn releases are believed to warn when `palette` is passed
# without `hue` - confirm against the installed version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Subject Perceived Race', data=data2, palette='viridis', ax=ax)
ax.set_title('Distribution of Subject Perceived Race')
ax.set_xlabel('Subject Perceived Race')
ax.set_ylabel('Count')
plt.setp(ax.get_xticklabels(), rotation=90)
fig.tight_layout()
plt.show()


In [None]:
# Display the column subset as the cell output.
data2

Relationship between officer race and subject perceived race

In [None]:

# Count the rows for every (officer race, subject perceived race) pairing;
# pairings that never occur are filled with 0.
pivot_table = data2.pivot_table(
    index='Officer Race',
    columns='Subject Perceived Race',
    aggfunc='size',
    fill_value=0,
)

# Annotated heatmap of the pair counts ('d' formats the annotations as integers)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(pivot_table, annot=True, fmt='d', cmap='viridis', ax=ax)
ax.set_title('Relationship between Officer Race and Subject Perceived Race')
ax.set_xlabel('Subject Perceived Race')
ax.set_ylabel('Officer Race')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0, va='center')
fig.tight_layout()
plt.show()



In [None]:
# Display the column subset again as the cell output.
data2

In [None]:
# Print the dtype of each selected column as plain text.
print(data2.dtypes)


## Hypothesis Testing: Relationship Between Officer's Race and Stop Resolution

### Hypotheses:
- **Null Hypothesis (H0):** There is no association between the officer's race and the stop resolution.
- **Alternative Hypothesis (H1):** There is an association between the officer's race and the stop resolution.

### Finding:
Based on the small p-value obtained from the chi-square test (much smaller than the significance level of 0.05), we reject the null hypothesis in favor of the alternative hypothesis, indicating that there is indeed a significant association between the officer's race and the stop resolution.
This analysis suggests that the type of stop resolution is not independent of the officer's race. The association may reflect potential biases or other underlying factors (for example, differences in assignment by precinct or call type); the chi-square test alone does not identify the cause.

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Requires `data2` (the column subset built earlier) to be defined.

# Contingency table: one row per officer race, one column per stop resolution,
# each cell holding the number of rows with that pairing.
contingency_table = pd.crosstab(data2['Officer Race'], data2['Stop Resolution'])

print("Contingency Table:")
print(contingency_table)

# Chi-square test of independence. The extra outputs get real names instead of `_`:
# assigning to `_` at notebook top level overwrites IPython's last-output variable.
chi2, p, dof, expected = chi2_contingency(contingency_table)
print("\nChi-square Statistic:", chi2)
print("P-value:", p)
print("Degrees of freedom:", dof)

# The chi-square approximation is only reliable when expected counts are not too
# small; a commonly quoted guideline is at least 5 in every cell.
low_expected_cells = int((expected < 5).sum())
if low_expected_cells:
    print(
        f"Caution: {low_expected_cells} of {expected.size} cells have an expected "
        "count below 5; interpret the p-value with care."
    )


The relationship between the officer's race and stop resolution

In [None]:
import matplotlib.pyplot as plt

# Stacked bars: one bar per officer race, segmented by stop resolution.
# DataFrame.plot returns the axes it drew on, so labels are set on that object.
ax = contingency_table.plot.bar(stacked=True, figsize=(10, 6))
ax.set_title('Stop Resolution by Officer Race')
ax.set_xlabel('Officer Race')
ax.set_ylabel('Frequency')
ax.legend(title='Stop Resolution')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.figure.tight_layout()
plt.show()


Arrest flag counts by officer race

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns



# Grouped bars: for every officer race, one bar per 'Arrest Flag' value.
# countplot draws raw row counts, not rates, so the title and y-axis are labelled
# as counts. A true arrest rate would need the share of flagged rows per race.
plt.figure(figsize=(10, 6))
sns.countplot(data=data2, x='Officer Race', hue='Arrest Flag')

plt.title('Arrest Flag Counts by Officer Race')
plt.xlabel('Officer Race')
plt.ylabel('Count')

# Slant the race labels so they do not overlap
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.show()


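Computing each officer-race / subject-race group's share of all frisks and of all arrests, to compare how frisks and arrests are distributed across racial groups. These shares depend on how many stops each group accounts for, so they are not per-stop likelihoods.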

In [None]:
def _flag_to_numeric(flags):
    """Return a 'Y'/'N' flag column as 1/0.

    A column that is already numeric is returned unchanged, so re-running this cell
    does not turn previously converted values into NaN. Any value other than
    'Y' or 'N' becomes NaN, as with a plain Series.map.
    """
    if pd.api.types.is_numeric_dtype(flags):
        return flags
    return flags.map({'Y': 1, 'N': 0})


# Convert 'Frisk Flag' and 'Arrest Flag' to numeric (0 for 'N', 1 for 'Y').
# assign() builds a new frame instead of writing into data2 in place, which avoids
# the chained-assignment warning when data2 is a selection of another frame.
data2 = data2.assign(**{
    'Frisk Flag': _flag_to_numeric(data2['Frisk Flag']),
    'Arrest Flag': _flag_to_numeric(data2['Arrest Flag']),
})

# Total frisks and arrests over the whole dataset (NaN flags are skipped by sum)
total_frisks = data2['Frisk Flag'].sum()
total_arrests = data2['Arrest Flag'].sum()

# Frisk and arrest counts for each (officer race, subject perceived race) pair
frisk_counts = data2.groupby(['Officer Race', 'Subject Perceived Race'])['Frisk Flag'].sum()
arrest_counts = data2.groupby(['Officer Race', 'Subject Perceived Race'])['Arrest Flag'].sum()

# Each pair's share of ALL frisks / ALL arrests. These shares grow with the number
# of stops a pair accounts for; they are not per-stop frisk or arrest rates.
frisk_proportions = frisk_counts / total_frisks
arrest_proportions = arrest_counts / total_arrests

print("Frisk Proportions:")
print(frisk_proportions)
print("\nArrest Proportions:")
print(arrest_proportions)


Frisk and arrest visualizations

In [None]:
import matplotlib.pyplot as plt

def _plot_stacked_proportions(proportions, title, ylabel):
    """Draw one stacked bar chart of proportions per officer race.

    proportions: Series indexed by (Officer Race, Subject Perceived Race); it is
        unstacked so each officer race becomes a bar and each subject perceived
        race a segment.
    title, ylabel: text for the chart title and the y-axis.
    """
    # Create the figure explicitly and hand its axes to pandas. Calling
    # plt.figure(figsize=...) and then DataFrame.plot() without `ax` makes pandas
    # open a second figure, leaving the first one empty and its size unused.
    fig, ax = plt.subplots(figsize=(14, 8))
    proportions.unstack().plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Officer Race')
    ax.set_ylabel(ylabel)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend(title='Subject Perceived Race')
    fig.tight_layout()
    plt.show()


# Plotting frisk proportions
_plot_stacked_proportions(
    frisk_proportions,
    'Proportions of Frisks by Officer Race and Subject Perceived Race',
    'Proportion of Frisks',
)

# Plotting arrest proportions
_plot_stacked_proportions(
    arrest_proportions,
    'Proportions of Arrests by Officer Race and Subject Perceived Race',
    'Proportion of Arrests',
)





### ENCODING DATA

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
# One-hot encode the categorical columns, dropping the first level of each.
data2_encoded = pd.get_dummies(data2, drop_first=True)

# Separate target and features before anything is fitted, so the target is never
# scaled. X holds the encoded, UNSCALED features; the scaled versions are
# X_train / X_test below.
# NOTE(review): the 'Stop Resolution' dummies stay in X. If one of its levels
# records an arrest it would leak the target - confirm before modelling.
# NOTE(review): StandardScaler passes NaN through, so any NaN in the features
# survives into X_train / X_test - confirm how downstream cells handle that.
y = data2['Arrest Flag']
X = data2_encoded.drop(columns=['Arrest Flag'])

# Split first: same 80/20 split and seed as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse its statistics for the test
# rows. Fitting on the full dataset before splitting lets test-set means and
# variances leak into training. The original index is kept so X and y rows stay
# aligned by label.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


### Train/test splits (random_state=42)

In [None]:
# Display the training feature matrix produced by train_test_split.
X_train

In [None]:
# Display the held-out test feature matrix.
X_test

In [None]:
# Display the training target ('Arrest Flag' values for the training rows).
y_train

In [None]:
# Display the test target ('Arrest Flag' values for the held-out rows).
y_test

Baseline Modeling
