# Classifying Shooting Incident Fatality
We have partnered with a leading data analytics firm specializing in urban crime analysis. The firm has initiated a project in New York City to enhance response strategies through data-driven insights and efficient allocation of resources based on severity prioritization.

Their aim is to provide actionable insights to law enforcement agencies for the development of targeted policing strategies. The task is to develop a machine learning model that analyzes historical shooting incident data and classifies each incident as fatal or non-fatal. This model will provide valuable insights into the factors influencing fatality rates and make it possible to focus on high-priority areas. (Predict: STATISTICAL_MURDER_FLAG - TRUE or FALSE)

# About the dataset

**incident_key** - Randomly generated persistent ID for each incident

**occur_date** - Exact date of the shooting incident

**occur_time** - Exact time of the shooting incident

**boro** - Borough where the shooting incident occurred

**loc_of_occur_desc** - (no description provided)

**precinct** - Precinct where the shooting incident occurred

**jurisdiction_code** - Jurisdiction where the shooting incident occurred. Jurisdiction codes 0(Patrol), 1(Transit) and 2(Housing) represent NYPD whilst codes 3 and more represent non NYPD jurisdictions

**loc_classfctn_desc** - (no description provided)

**location_desc** - Location of the shooting incident

**statistical_murder_flag** - Shooting resulted in the victim's death which would be counted as a murder

**perp_age_group** - Perpetrator's age within a category

**perp_sex** - Perpetrator's sex description

**perp_race** - Perpetrator's race description

**vic_age_group** - Victim's age within a category

**vic_sex** - Victim's sex description

**vic_race** - Victim's race description

**x_coord_cd** - Midblock X-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)

**y_coord_cd** - Midblock Y-coordinate for New York State Plane Coordinate System, Long Island Zone, NAD 83, units feet (FIPS 3104)

**latitude** - Latitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)

**longitude** - Longitude coordinate for Global Coordinate System, WGS 1984, decimal degrees (EPSG 4326)

**Lon_Lat (geocoded_column)** - Longitude and Latitude Coordinates for mapping

In [None]:
!pip install seaborn

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the shooting-incident CSV from the working directory and preview the
# first rows.
df=pd.read_csv('NYPD_Shooting_Incident_Data__Historic_.csv')
df.head()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Descriptive statistics produced by pandas' default `describe` settings.
df.describe()

In [None]:
# Element-wise missing-value mask (True where a cell is missing).
df.isnull()

In [None]:
# Column names inspected by the per-feature missing-value check that follows.
features = [
    # identifiers and timing
    'INCIDENT_KEY',
    'OCCUR_DATE',
    'OCCUR_TIME',
    # location descriptors
    'BORO',
    'LOC_OF_OCCUR_DESC',
    'PRECINCT',
    'JURISDICTION_CODE',
    'LOC_CLASSFCTN_DESC',
    'LOCATION_DESC',
    # target
    'STATISTICAL_MURDER_FLAG',
    # perpetrator / victim demographics
    'PERP_AGE_GROUP',
    'PERP_SEX',
    'PERP_RACE',
    'VIC_AGE_GROUP',
    'VIC_SEX',
    'VIC_RACE',
    # coordinates
    'X_COORD_CD',
    'Y_COORD_CD',
    'Latitude',
    'Longitude',
    'Lon_Lat',
]

In [None]:
# Missing-value mask for every listed feature at once: one boolean column per
# feature. Rebinding a single mask inside a loop would keep only the result
# for the last feature in the list.
null_check = df[features].isnull()

# Per-feature tally of missing vs. present values, shown as the cell output.
pd.DataFrame({
    'missing': null_check.sum(),
    'present': (~null_check).sum(),
})

In [None]:
# Number of missing values in each column.
print(df.isnull().sum())

In [None]:
# Remove the date/time, textual location, demographic and geocoded-point
# columns from the working frame.
df = df.drop(columns=[
    'OCCUR_DATE',
    'OCCUR_TIME',
    'BORO',
    'LOC_OF_OCCUR_DESC',
    'LOC_CLASSFCTN_DESC',
    'LOCATION_DESC',
    'PERP_AGE_GROUP',
    'PERP_SEX',
    'PERP_RACE',
    'VIC_AGE_GROUP',
    'VIC_SEX',
    'VIC_RACE',
    'Lon_Lat',
])

In [None]:
# The incident identifier is removed from the working frame as well.
df = df.drop(columns=['INCIDENT_KEY'])

In [None]:
# Preview the frame after the column drops.
df.head()

In [None]:
# Recode the target so that True becomes 1 and False becomes 0.
# NOTE(review): assumes the flag was parsed from the CSV as booleans; any
# other values would pass through `replace` unchanged — confirm.
df['STATISTICAL_MURDER_FLAG'] = df['STATISTICAL_MURDER_FLAG'].replace({True:1,False:0})

In [None]:
# Preview the frame after recoding the target.
df.head()

In [None]:
# Re-check dtypes and non-null counts of the reduced frame.
df.info()

In [None]:
# Histograms (50 bins each) of the precinct and state-plane coordinate columns.
df[['PRECINCT', 'X_COORD_CD', 'Y_COORD_CD']].hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Missing values remaining in the reduced frame, per column.
print(df.isnull().sum())

In [None]:
# Discard every row that has at least one missing value
# ('any' is the default `how` for DataFrame.dropna).
df = df.dropna()

In [None]:
# Confirm the per-column missing-value counts after dropping incomplete rows.
print(df.isnull().sum())

In [None]:
# dtype of each remaining column.
df.dtypes

In [None]:
# Feature frame for the box plots: every remaining column except the target.
X = df.drop('STATISTICAL_MURDER_FLAG', axis=1)

In [None]:
# One box plot per feature column, drawn as subplots on a 5x5 grid, to
# eyeball potential outliers.
X.plot(kind='box', subplots=True, layout=(5, 5), figsize=(20, 20))
plt.show()

In [None]:
# Target vector and feature matrix used for modelling.
y = df['STATISTICAL_MURDER_FLAG']
X = df.drop(columns=['STATISTICAL_MURDER_FLAG'])

In [None]:
!pip install scikit-learn


In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Report the dimensions of each split.
for split_label, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"the shape of {split_label} {split_data.shape}")

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics (mean / standard deviation) from the training
# split only, then apply those same statistics to the test split. Refitting on
# the test split would scale the two sets differently and leak test-set
# information into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"the shape of X_train_scaled {X_train_scaled.shape}")
print(f"the shape of X_test_scaled {X_test_scaled.shape}")
print(f"the shape of y_train {y_train.shape}")
print(f"the shape of y_test {y_test.shape}")


In [None]:
# Inspect the scaled training matrix.
X_train_scaled

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


# Baseline classifiers to compare, keyed by the name shown in the results table.
models = {
    # linear
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # tree-based
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    # kernel / distance-based
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    # boosting
    'Gradient Boosting': GradientBoostingClassifier(),
    # probabilistic
    'Naive Bayes': GaussianNB(),
}

# Function to evaluate a model
def evaluate_model(model, X_test, y_test):
    """Score an already-fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels corresponding to ``X_test``.

    Returns:
        dict mapping "Accuracy", "Precision", "Recall" and "F1 Score" to the
        corresponding scikit-learn metric value.
    """
    y_pred = model.predict(X_test)
    # zero_division=0 yields the same 0.0 the library falls back to when a
    # metric's denominator is empty (e.g. no positive predictions), but
    # without emitting an UndefinedMetricWarning for every such model.
    return {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, zero_division=0),
        "Recall": recall_score(y_test, y_pred, zero_division=0),
        "F1 Score": f1_score(y_test, y_pred, zero_division=0),
    }

# Fit each candidate on the scaled training data and score it on the scaled
# test data (scikit-learn's `fit` returns the fitted estimator itself).
evaluation_metrics = {
    model_name: evaluate_model(
        model.fit(X_train_scaled, y_train), X_test_scaled, y_test
    )
    for model_name, model in models.items()
}

# One row per model, one column per metric.
evaluation_df = pd.DataFrame(evaluation_metrics).T



In [None]:
# Metric table: one row per model.
evaluation_df

In [None]:
# Annotated heatmap of the pairwise correlations between training features.
plt.figure(figsize=(15,8))
sns.heatmap(X_train.corr(),annot=True,cmap="YlGnBu")

In [None]:
# One box plot per training feature, drawn as subplots on a 5x5 grid, to
# eyeball potential outliers.
X_train.plot(kind='box', subplots=True, layout=(5, 5), figsize=(20, 20))
plt.show()

### Outlier Treatment

In [None]:
!pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import joblib

In [None]:
# Display the current working frame.
df

In [None]:
# Preprocessing
# Drop rows missing any coordinate or the target, then pass the target through
# a LabelEncoder so it is guaranteed to be an integer-coded column.
df = df.dropna(subset=['X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude', 'STATISTICAL_MURDER_FLAG'])
label_encoder = LabelEncoder()
df['STATISTICAL_MURDER_FLAG'] = label_encoder.fit_transform(df['STATISTICAL_MURDER_FLAG'])

In [None]:
# Per-column missing-value counts after preprocessing.
print(df.isnull().sum())

In [None]:
# Selecting features and target
# Only the four coordinate columns are used as predictors from here on; note
# that `features` is rebound to this shorter list.
features = ['X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude']
X = df[features]
y = df['STATISTICAL_MURDER_FLAG']

In [None]:
# Scaling features
# Fits a fresh StandardScaler on the full feature frame (no train/test split
# has been made at this point); the scaled array feeds the outlier detector.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Remove outliers using Isolation Forest
# `contamination=0.05` flags roughly 5% of the rows as outliers. The fixed
# random_state makes the flagged set -- and therefore every downstream result
# -- reproducible between runs.
iso_forest = IsolationForest(contamination=0.05, random_state=42)
outliers = iso_forest.fit_predict(X_scaled)  # 1 = inlier, -1 = outlier
mask = outliers != -1

# Keep only the inlier rows of the scaled features and the matching targets.
X_clean = X_scaled[mask]
y_clean = y[mask]


In [None]:
from sklearn.decomposition import PCA

# Work on an explicit copy so that adding the flag column does not write into
# a selection taken from `df`.
X = X.copy()
X['outlier'] = outliers  # IsolationForest output: 1 = inlier, -1 = outlier

# Project the four coordinate features onto two components for plotting.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.drop(columns=['outlier']))
X_pca_df = pd.DataFrame(X_pca, columns=['PCA1', 'PCA2'])
X_pca_df['outlier'] = outliers
# Readable labels tied to the actual flag values, so legend text and colours
# cannot get out of sync with the data (inliers blue, outliers red).
X_pca_df['status'] = X_pca_df['outlier'].map({1: 'Inlier', -1: 'Outlier'})

# Visualize Outliers Before Removal
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(
    x='PCA1',
    y='PCA2',
    hue='status',
    hue_order=['Inlier', 'Outlier'],
    palette={'Inlier': 'blue', 'Outlier': 'red'},
    data=X_pca_df,
)
plt.title('Outliers Before Removal')

# Remove Outliers (the 'outlier' column is kept here; it is dropped later)
X_clean = X[X['outlier'] == 1]

# Apply PCA again to the cleaned data
X_clean_pca = pca.fit_transform(X_clean.drop(columns=['outlier']))
X_clean_pca_df = pd.DataFrame(X_clean_pca, columns=['PCA1', 'PCA2'])

# Visualize Data After Outlier Removal
plt.subplot(1, 2, 2)
sns.scatterplot(x='PCA1', y='PCA2', data=X_clean_pca_df, color='blue')
plt.title('Outliers After Removal')

plt.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the scaled coordinate features.
scaled_corr = pd.DataFrame(X_scaled, columns=features).corr()

plt.figure(figsize=(10, 8))
sns.heatmap(scaled_corr, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap of Features')
plt.show()

In [None]:
# Continue with model evaluation and selection
# Remove the helper flag column. errors='ignore' keeps this cell re-runnable:
# on a second execution the column is already gone and a plain drop would
# raise KeyError.
X_clean = X_clean.drop(columns=['outlier'], errors='ignore')
# Refit the shared `scaler` on the cleaned rows and keep the scaled copy.
X_clean_scaled = scaler.fit_transform(X_clean)

In [None]:
# First 25 rows of the cleaned feature frame.
X_clean.head(25)

In [None]:
# First rows of the target values kept after outlier removal.
y_clean.head()

In [None]:
!pip install imblearn


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample with SMOTE (seeded for reproducibility) on the cleaned, unscaled
# features and their targets.
# NOTE(review): this resamples the whole cleaned set, before any train/test
# split — confirm that is intended.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X_clean,y_clean)


In [None]:
# Train/test split
# Split the real (non-resampled) data first so the test set contains only
# genuine incidents; stratifying keeps the fatal/non-fatal ratio the same in
# both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y_clean, test_size=0.2, random_state=42, stratify=y_clean
)
# Oversample the minority class in the training portion only. Resampling
# before the split would let synthetic points interpolated from test rows
# leak into training and inflate every reported score.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

In [None]:
# Models to evaluate
# GradientBoostingClassifier is not part of this section's import cell, so it
# is imported here to keep the section runnable on its own.
from sklearn.ensemble import GradientBoostingClassifier

# Candidate classifiers keyed by the name shown in the results table.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'Support Vector Machine': SVC(),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
}

In [None]:
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [None]:
# Evaluate models
# Each candidate is fitted inside a scaling pipeline, scored on the held-out
# split, and the pipeline with the highest F1 score is kept.
# These metric functions are not imported in this section's import cells.
from sklearn.metrics import f1_score, precision_score, recall_score

best_model_name = None
best_model = None
# Start below any attainable F1 so a pipeline is always selected, even if
# every candidate scores 0.0; otherwise best_model would stay None and the
# later save steps would fail.
best_f1_score = float("-inf")

results = []

for name, model in models.items():
    # The pipeline owns its own scaler, so it expects unscaled inputs.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', model)
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0.0 without a warning when a model predicts no
    # positives.
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    results.append((name, accuracy, precision, recall, f1))

    # Model selection is based on F1; ties keep the earlier model.
    if f1 > best_f1_score:
        best_f1_score = f1
        best_model_name = name
        best_model = pipeline

# Print results in a tabular format
print(f"{'Model':<25}{'Accuracy':<10}{'Precision':<10}{'Recall':<10}{'F1 Score':<10}")
print("="*65)
for name, accuracy, precision, recall, f1 in results:
    print(f"{name:<25}{accuracy:<10.4f}{precision:<10.4f}{recall:<10.4f}{f1:<10.4f}")



### Choosing the best model

In [None]:
# Save the best model
# Persists the selected fitted pipeline to 'best_model.pkl' in the working
# directory.
joblib.dump(best_model, 'best_model.pkl')

In [None]:
# Save the name of the best model
# Writes the display name to a small text file next to the pickled model.
with open('best_model_name.txt', 'w') as f:
    f.write(best_model_name)

In [None]:
# Load the name of the best model
# Reads the name back from disk (rebinding best_model_name) and reports it.
with open('best_model_name.txt', 'r') as f:
    best_model_name = f.read()

print(f"The best model is: {best_model_name}")

In [None]:
!pip install typing_extensions==4.8.0 --upgrade
!pip install gradio
!pip install fastapi

In [None]:
pip install --upgrade typing_extensions


In [None]:
!pip install --upgrade pydantic


In [None]:
!pip install --upgrade gradio


### PIPELINE

In [None]:
import gradio as gr

In [None]:
# Create Gradio inference pipeline
def predict(X_COORD_CD, Y_COORD_CD, Latitude, Longitude):
    """Classify a single location with the saved best model.

    Args:
        X_COORD_CD: Value for the 'X_COORD_CD' feature.
        Y_COORD_CD: Value for the 'Y_COORD_CD' feature.
        Latitude: Value for the 'Latitude' feature.
        Longitude: Value for the 'Longitude' feature.

    Returns:
        'Murder' if the model's prediction is truthy, otherwise 'Not Murder'.
    """
    model = joblib.load('best_model.pkl')
    # Single-row frame with the same column names and order used for training.
    input_data = pd.DataFrame({
        'X_COORD_CD': [X_COORD_CD],
        'Y_COORD_CD': [Y_COORD_CD],
        'Latitude': [Latitude],
        'Longitude': [Longitude],
    })
    # The saved object is a Pipeline whose first step is its own
    # StandardScaler, so raw values are passed straight in. Scaling them
    # beforehand would standardise the input twice and skew the prediction.
    prediction = model.predict(input_data)
    return 'Murder' if prediction[0] else 'Not Murder'


In [None]:
# Gradio UI: one numeric input per coordinate feature, text output from
# `predict`.
interface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Number(label=field_name)
        for field_name in ('X_COORD_CD', 'Y_COORD_CD', 'Latitude', 'Longitude')
    ],
    outputs='text',
    title='NYPD Shooting Incident Prediction',
    description='Predict whether a shooting incident is classified as murder or not based on coordinates.',
)

# Start serving the interface.
# NOTE(review): share=True presumably requests a publicly reachable link —
# confirm that exposure is intended.
interface.launch(share=True)