<a href="https://colab.research.google.com/github/jackschreib/J-K-Data-219-FInal-Project/blob/main/J_and_K_Data_219_Full_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from itertools import combinations
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import plotly.express as px

# Read the raw crime CSV directly from GitHub.
url = "https://raw.githubusercontent.com/jackschreib/Project/main/CSV_Crime_Data_from_2020_to_Present.csv"
df = pd.read_csv(url)

# Keep only the six columns used by the models below and drop every row that
# has a missing value in any of them.
df_clean = df.loc[:, ['TIME OCC', 'AREA NAME', 'Premis Desc', 'Crm Cd Desc', 'Vict Age', 'Vict Sex']].dropna()

# Discard crime descriptions that appear fewer than two times in the cleaned data.
value_counts = df_clean['Crm Cd Desc'].value_counts()
rare = value_counts.loc[value_counts.lt(2)].index
df_clean = df_clean.loc[~df_clean['Crm Cd Desc'].isin(rare)]

# Subset restricted to three specific crime descriptions.
df_crime_clean = df_clean.loc[df_clean['Crm Cd Desc'].isin(['BURGLARY', 'ROBBERY', 'TRESPASSING'])]

In [None]:
# Limit the data to the five most frequent crime descriptions.
top = df_clean['Crm Cd Desc'].value_counts().iloc[:5].index
df_sub = df_clean.loc[df_clean['Crm Cd Desc'].isin(top)]

# Target is the crime description; everything else is a feature.
y_train_sub = df_sub['Crm Cd Desc']
X_train_sub = df_sub[['TIME OCC', 'AREA NAME', 'Premis Desc', 'Vict Age', 'Vict Sex']]

# Standardise the victim age and one-hot encode the categorical columns.
# NOTE(review): 'TIME OCC' is not listed in either transformer, so
# remainder='passthrough' hands it to k-NN unscaled — confirm this is intended,
# since k-NN distances are sensitive to feature scale.
col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['AREA NAME', 'Premis Desc', 'Vict Sex']),
    remainder='passthrough',
)

pipeline = make_pipeline(col_transformer, KNeighborsClassifier(n_neighbors=10))

# 10-fold cross-validated accuracy; kept in `scores` but not displayed by this cell.
scores = cross_val_score(pipeline, X_train_sub, y_train_sub, scoring='accuracy', cv=10)

# Fit on the whole subset, then predict class probabilities for those same rows.
pipeline.fit(X_train_sub, y_train_sub)
predicted_probs = pipeline.predict_proba(X_train_sub)
predicted_probs_df = pd.DataFrame(predicted_probs, columns=pipeline.classes_)

# Mean predicted probability per crime class.
average_probs = predicted_probs_df.mean()

def _short_label(label, max_words=4):
    """Return the first `max_words` words of `label`, appending '...' only when words were dropped.

    The previous inline expression kept four words but added the ellipsis for
    any label longer than three words, so a label of exactly four words was
    shown in full *and* marked as truncated.
    """
    words = label.split()
    shortened = ' '.join(words[:max_words])
    return (shortened + '...') if len(words) > max_words else shortened


# Axis labels: crime descriptions shortened to at most four words.
labels = [_short_label(label) for label in average_probs.index]

# Bar chart of the mean predicted probability (as a percentage) for each crime class.
# Every bar is drawn in blue even though `color` is set per label.
chart = px.bar(x=labels, y=average_probs.values * 100,
             labels={'x': 'Crm Cd Desc', 'y': 'Average Predicted Probability Percent'},
             title='Average Predicted Probabilities for Each Crime',
             color=labels,
             color_discrete_sequence=['blue']*len(average_probs),
             width=800, height=500)

chart.update_layout(xaxis_tickangle=45,
                  yaxis=dict(title='Average Predicted Probability Percent'),
                  xaxis=dict(title='Crm Cd Desc'))

chart.show()

In [None]:
# Classify 'AREA NAME' from time, crime description, premise and victim age/sex.
y_train = df_clean['AREA NAME']
X_train = df_clean[['TIME OCC', 'Crm Cd Desc', 'Premis Desc', 'Vict Age', 'Vict Sex']]

# Standardise age, one-hot encode the categorical columns.
# NOTE(review): 'TIME OCC' is not listed in either transformer, so
# remainder='passthrough' hands it to k-NN unscaled — confirm this is intended,
# since k-NN distances are sensitive to feature scale.
col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc', 'Vict Sex']),
    remainder='passthrough',
)

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
# 'minkowski' with scikit-learn's default p=2 is the same distance as 'euclidean'.
distances = ['euclidean', 'manhattan', 'minkowski', 'cosine']
for distance in distances:
    pipeline = make_pipeline(
        col_transformer,
        KNeighborsClassifier(n_neighbors=20, metric=distance),
    )
    scores = cross_val_score(pipeline, X_train, y_train, scoring="accuracy", cv=10)
    print(distance, scores.mean())

euclidean 0.0813447534141527
manhattan 0.08368523484761674
minkowski 0.0813447534141527
cosine 0.12438946104749107


In [None]:
def estimate_test_error(k):
    """Return the mean 10-fold cross-validated accuracy of a cosine k-NN classifier.

    Despite the name, the value returned is an accuracy (higher is better),
    not an error rate.

    Args:
        k: number of neighbours for KNeighborsClassifier.

    Returns:
        Mean of the ten per-fold accuracy scores.

    Reads the module-level `col_transformer`, `X_train` and `y_train` at call time.
    """
    knn = KNeighborsClassifier(n_neighbors=k, metric="cosine")
    fold_accuracies = cross_val_score(
        make_pipeline(col_transformer, knn),
        X_train,
        y_train,
        scoring="accuracy",
        cv=10,
    )
    return fold_accuracies.mean()

# Evaluate k = 1..20. The Series is built in one go from a dict (with an
# explicit dtype) instead of growing a dtype-less empty Series inside a loop.
# The name `test_errors` is kept for compatibility, but estimate_test_error
# scores with "accuracy", so these values are accuracies (higher is better).
test_errors = pd.Series({k: estimate_test_error(k) for k in range(1, 21)}, dtype=float)

# Because the scores are accuracies, the best k is the one with the MAXIMUM
# value; idxmin() would have selected the worst-performing k.
ideal_k = test_errors.idxmax()
ideal_k

In [None]:
# Cosine k-NN classifier using the k chosen by the search cell.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=ideal_k, metric="cosine"))

# Per-fold accuracy from 10-fold cross-validation.
scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

# A bare `scores.mean()` in the middle of a cell is never displayed (only the
# last expression of a cell is), so print the mean explicitly.
print("Mean CV accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

# One bar per fold so the fold-to-fold spread is visible.
fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [None]:
# Classify 'AREA NAME' from the victim's age and sex only.
y_train = df_clean['AREA NAME']
X_train = df_clean[['Vict Age', 'Vict Sex']]

# Both feature columns are handled explicitly, so remainder='passthrough'
# has no columns left to pass through here.
col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Vict Sex']),
    remainder='passthrough',
)

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
# 'minkowski' with scikit-learn's default p=2 is the same distance as 'euclidean'.
distances = ['euclidean', 'manhattan', 'minkowski', 'cosine']
for distance in distances:
    pipeline = make_pipeline(
        col_transformer,
        KNeighborsClassifier(n_neighbors=20, metric=distance),
    )
    scores = cross_val_score(pipeline, X_train, y_train, scoring="accuracy", cv=10)
    print(distance, scores.mean())

euclidean 0.06974405622027681
manhattan 0.06974405622027681
minkowski 0.06974405622027681
cosine 0.07195732413907104


In [None]:
def estimate_test_error(k):
    """Mean 10-fold cross-validated accuracy for a cosine-distance k-NN classifier.

    Note that the returned number is an accuracy (larger is better) even
    though the function is named after an error.

    Args:
        k: value passed as `n_neighbors` to KNeighborsClassifier.

    Returns:
        The average of the per-fold accuracy scores.

    Uses whatever `col_transformer`, `X_train` and `y_train` are bound to at
    module level when the function is called.
    """
    candidate = make_pipeline(
        col_transformer,
        KNeighborsClassifier(n_neighbors=k, metric="cosine"),
    )
    per_fold = cross_val_score(candidate, X_train, y_train, cv=10, scoring="accuracy")
    return per_fold.mean()

# Evaluate k = 1..40. The Series is built in one go from a dict (with an
# explicit dtype) instead of growing a dtype-less empty Series inside a loop.
# The name `test_errors` is kept for compatibility, but estimate_test_error
# scores with "accuracy", so these values are accuracies (higher is better).
test_errors = pd.Series({k: estimate_test_error(k) for k in range(1, 41)}, dtype=float)

# Because the scores are accuracies, the best k is the one with the MAXIMUM
# value; idxmin() would have selected the worst-performing k.
ideal_k = test_errors.idxmax()
ideal_k

In [None]:
# Cosine k-NN classifier using the k chosen by the search cell.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=ideal_k, metric="cosine"))


# Per-fold accuracy from 10-fold cross-validation.
scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

# A bare `scores.mean()` in the middle of a cell is never displayed (only the
# last expression of a cell is), so print the mean explicitly.
print("Mean CV accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

# One bar per fold so the fold-to-fold spread is visible.
fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [None]:
# Classify 'AREA NAME' from the crime description and premise description only.
y_train = df_clean['AREA NAME']
X_train = df_clean[['Crm Cd Desc', 'Premis Desc']]

# Both features are categorical and one-hot encoded; nothing is left for
# remainder='passthrough' to pass through.
col_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc']),
    remainder='passthrough',
)

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
# 'minkowski' with scikit-learn's default p=2 is the same distance as 'euclidean'.
distances = ['euclidean', 'manhattan', 'minkowski', 'cosine']
for distance in distances:
    pipeline = make_pipeline(
        col_transformer,
        KNeighborsClassifier(n_neighbors=20, metric=distance),
    )
    scores = cross_val_score(pipeline, X_train, y_train, scoring="accuracy", cv=10)
    print(distance, scores.mean())


euclidean 0.10930323134494921
manhattan 0.10930323134494921
minkowski 0.10930323134494921
cosine 0.11194919543139378


In [None]:
def estimate_test_error(k):
    """Score a cosine-distance k-NN classifier by mean 10-fold cross-validated accuracy.

    The result is an accuracy, so higher values are better, regardless of
    the "error" in the function name.

    Args:
        k: neighbour count handed to KNeighborsClassifier.

    Returns:
        Mean accuracy across the ten folds.

    Depends on the module-level `col_transformer`, `X_train` and `y_train`
    as they are defined when the call is made.
    """
    classifier = KNeighborsClassifier(n_neighbors=k, metric="cosine")
    model = make_pipeline(col_transformer, classifier)
    accuracies = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=10)
    return accuracies.mean()

# Evaluate k = 1..40. The Series is built in one go from a dict (with an
# explicit dtype) instead of growing a dtype-less empty Series inside a loop.
# The name `test_errors` is kept for compatibility, but estimate_test_error
# scores with "accuracy", so these values are accuracies (higher is better).
test_errors = pd.Series({k: estimate_test_error(k) for k in range(1, 41)}, dtype=float)

# Because the scores are accuracies, the best k is the one with the MAXIMUM
# value; idxmin() would have selected the worst-performing k.
ideal_k = test_errors.idxmax()
ideal_k

In [None]:
# Cosine k-NN classifier.
# NOTE(review): k is hard-coded to 10 here, whereas the other report cells use
# `ideal_k` from the preceding search — confirm whether 10 is intentional.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=10, metric="cosine"))


# Per-fold accuracy from 10-fold cross-validation.
scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

# A bare `scores.mean()` in the middle of a cell is never displayed (only the
# last expression of a cell is), so print the mean explicitly.
print("Mean CV accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

# One bar per fold so the fold-to-fold spread is visible.
fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [None]:
# Regress 'TIME OCC' on area, crime description, premise and victim age/sex.
y_train = df_clean['TIME OCC']
X_train = df_clean[['AREA NAME', 'Crm Cd Desc', 'Premis Desc', 'Vict Age', 'Vict Sex']]

# Every feature column is covered by one of the two transformers, so
# remainder='passthrough' has no columns left to pass through.
col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc', 'Vict Sex', 'AREA NAME']),
    remainder='passthrough',
)

# Compare distance metrics at a fixed k=20. scikit-learn reports MSE negated,
# so the sign is flipped and the square root of the mean fold MSE is printed.
# 'minkowski' with scikit-learn's default p=2 is the same distance as 'euclidean'.
distances = ['euclidean', 'manhattan', 'minkowski', 'cosine']
for distance in distances:
    pipeline = make_pipeline(
        col_transformer,
        KNeighborsRegressor(n_neighbors=20, metric=distance),
    )
    scores = cross_val_score(pipeline, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
    print(distance, np.sqrt(-scores.mean()))

euclidean 657.00833323198
manhattan 656.8464646804752
minkowski 657.00833323198
cosine 656.9618766895911


In [None]:
def estimate_test_error(k):
    """Return the cross-validated RMSE of a Manhattan-distance k-NN regressor.

    The ten per-fold negated-MSE scores are averaged, the sign is flipped,
    and the square root of that mean is returned (lower is better).

    Args:
        k: number of neighbours for KNeighborsRegressor.

    Returns:
        Square root of the mean fold MSE.

    Reads the module-level `col_transformer`, `X_train` and `y_train` at call time.
    """
    model = make_pipeline(
        col_transformer,
        KNeighborsRegressor(n_neighbors=k, metric="manhattan"),
    )
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, scoring="neg_mean_squared_error", cv=10
    )
    mean_mse = -neg_mse_per_fold.mean()
    return np.sqrt(mean_mse)

# Evaluate k = 1..40. The Series is built in one go from a dict (with an
# explicit dtype) instead of growing a dtype-less empty Series inside a loop.
test_errors = pd.Series({k: estimate_test_error(k) for k in range(1, 41)}, dtype=float)

# estimate_test_error returns an RMSE here (lower is better), so the best k is
# the one with the minimum value.
ideal_k = test_errors.idxmin()
ideal_k

In [None]:
# Manhattan-distance k-NN regressor using the k chosen by the search cell.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsRegressor(n_neighbors=ideal_k, metric="manhattan"))


# scikit-learn returns MSE negated for this scorer; flip the sign so `scores`
# holds the per-fold mean squared error.
scores = -cross_val_score(
    pipeline,
    X = X_train,
    y = y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

# A bare `scores.mean()` in the middle of a cell is never displayed (only the
# last expression of a cell is), so print the mean explicitly.
print("Mean CV MSE:", scores.mean())

# These values are mean squared errors, not accuracies, so the column, title
# and axis label say so (they were previously labelled "Accuracy").
fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'MSE': scores})

fig = px.bar(df_scores, x='Fold', y='MSE',
             title='Mean Squared Error for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'MSE': 'Mean Squared Error'})

fig.show()