<a href="https://colab.research.google.com/github/jackschreib/J-K-Data-219-FInal-Project/blob/main/3_Visualization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from itertools import combinations
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import plotly.express as px

# Source CSV of crime records (downloaded from GitHub each time this cell runs).
url = "https://raw.githubusercontent.com/jackschreib/Project/main/CSV_Crime_Data_from_2020_to_Present.csv"
df = pd.read_csv(url)

# Keep only the columns the models use and drop rows missing any of them.
columns_of_interest = ['TIME OCC', 'AREA NAME', 'Premis Desc', 'Crm Cd Desc', 'Vict Age', 'Vict Sex']
df_clean = df.loc[:, columns_of_interest].dropna()

# Remove crime descriptions that occur fewer than two times in the cleaned data.
value_counts = df_clean['Crm Cd Desc'].value_counts()
rare = value_counts.loc[value_counts < 2].index
is_rare_crime = df_clean['Crm Cd Desc'].isin(rare)
df_clean = df_clean.loc[~is_rare_crime]

In [2]:
#Model 1 Graph
# Classify AREA NAME from time of occurrence, crime type, premise,
# victim age and victim sex.

X_train = df_clean[['TIME OCC', 'Crm Cd Desc', 'Premis Desc', 'Vict Age', 'Vict Sex']]
y_train = df_clean['AREA NAME']

# KNN compares observations by distance, so numeric features need a common
# scale. 'TIME OCC' used to reach the classifier unscaled through
# remainder='passthrough' while 'Vict Age' was standardized; both are now
# standardized so neither dominates the distance purely because of its units.
col_transformer = make_column_transformer(
    (StandardScaler(), ['TIME OCC', 'Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc', 'Vict Sex']),
    remainder='passthrough')

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
distances = ['euclidean','manhattan','minkowski','cosine']
for distance in distances:
  pipeline = make_pipeline(
    col_transformer,
    KNeighborsClassifier(n_neighbors=20, metric=distance)
  )

  scores = cross_val_score(
    pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="accuracy"
  )

  # A bare expression inside a loop is never displayed by the notebook,
  # so the result has to be printed explicitly.
  print(distance, scores.mean())

def estimate_test_error(k):
  """Score a cosine-distance KNN classifier that uses ``k`` neighbors.

  Builds a pipeline from the current ``col_transformer`` and evaluates it on
  the current ``X_train`` / ``y_train`` with 10-fold cross-validation.

  Args:
    k: Number of neighbors for ``KNeighborsClassifier``.

  Returns:
    The mean accuracy over the 10 folds. Despite the function's name this is
    an accuracy, so larger values are better.
  """
  knn = KNeighborsClassifier(metric="cosine", n_neighbors=k)
  pipeline_knn = make_pipeline(col_transformer, knn)
  fold_accuracies = cross_val_score(
      pipeline_knn, X_train, y_train, scoring="accuracy", cv=10
  )
  return fold_accuracies.mean()

# Mean 10-fold accuracy for every candidate k in 1..20, indexed by k.
# (estimate_test_error is scored with "accuracy", so despite the variable
# name these values are accuracies: higher is better.)
test_errors = pd.Series(
  {k: estimate_test_error(k) for k in range(1, 21)},
  dtype=float,
)

# The best k is the one with the HIGHEST accuracy. idxmin() would pick the
# worst-performing k.
ideal_k = int(test_errors.idxmax())
print("ideal k:", ideal_k)

# Re-evaluate the chosen model to obtain the per-fold accuracies for plotting.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=ideal_k, metric="cosine"))

scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

print("mean accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [3]:
#Model 2 Graph
# Classify AREA NAME from victim demographics only (age and sex).

X_train = df_clean[['Vict Age', 'Vict Sex']]
y_train = df_clean['AREA NAME']

col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Vict Sex']),
    remainder='passthrough')

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
distances = ['euclidean','manhattan','minkowski','cosine']
for distance in distances:
  pipeline = make_pipeline(
    col_transformer,
    KNeighborsClassifier(n_neighbors=20, metric=distance)
  )

  scores = cross_val_score(
    pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="accuracy"
  )

  # A bare expression inside a loop is never displayed by the notebook,
  # so the result has to be printed explicitly.
  print(distance, scores.mean())

def estimate_test_error(k):
  """Cross-validate a cosine KNN classifier with ``k`` neighbors.

  Uses whatever ``col_transformer``, ``X_train`` and ``y_train`` are defined
  in the notebook at call time.

  Args:
    k: Value passed to ``KNeighborsClassifier(n_neighbors=...)``.

  Returns:
    Mean accuracy across 10 cross-validation folds (higher is better, even
    though the function is named "test error").
  """
  classifier = KNeighborsClassifier(metric="cosine", n_neighbors=k)
  pipeline_knn = make_pipeline(col_transformer, classifier)
  accuracy_per_fold = cross_val_score(
      pipeline_knn, X_train, y_train, scoring="accuracy", cv=10
  )
  return accuracy_per_fold.mean()

# Mean 10-fold accuracy for every candidate k in 1..40, indexed by k.
# (estimate_test_error is scored with "accuracy", so despite the variable
# name these values are accuracies: higher is better.)
test_errors = pd.Series(
  {k: estimate_test_error(k) for k in range(1, 41)},
  dtype=float,
)

# The best k is the one with the HIGHEST accuracy. idxmin() would pick the
# worst-performing k.
ideal_k = int(test_errors.idxmax())
print("ideal k:", ideal_k)

# Re-evaluate the chosen model to obtain the per-fold accuracies for plotting.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=ideal_k, metric="cosine"))


scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

print("mean accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [6]:
#Model 3 Graph
# Classify AREA NAME from crime type and premise description only.

X_train = df_clean[['Crm Cd Desc', 'Premis Desc']]
y_train = df_clean['AREA NAME']

col_transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc']),
    remainder='passthrough')

# Compare distance metrics at a fixed k=20 using 10-fold cross-validated accuracy.
distances = ['euclidean','manhattan','minkowski','cosine']
for distance in distances:
  pipeline = make_pipeline(
    col_transformer,
    KNeighborsClassifier(n_neighbors=20, metric=distance)
  )

  scores = cross_val_score(
    pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="accuracy"
  )

  # scores.mean must be *called* to get the number, and a bare expression in
  # a loop is never displayed by the notebook, so print it explicitly.
  print(distance, scores.mean())

def estimate_test_error(k):
  """Evaluate a ``k``-neighbor cosine KNN classifier by cross-validation.

  The pipeline is assembled from the notebook-level ``col_transformer`` and
  scored on the notebook-level ``X_train`` / ``y_train``.

  Args:
    k: Number of neighbors the classifier should use.

  Returns:
    The average of the 10 per-fold accuracy scores. Note that this is an
    accuracy rather than an error: higher means better.
  """
  neighbors_model = KNeighborsClassifier(metric="cosine", n_neighbors=k)
  pipeline_knn = make_pipeline(col_transformer, neighbors_model)
  cv_accuracies = cross_val_score(
      pipeline_knn, X_train, y_train, scoring="accuracy", cv=10
  )
  return cv_accuracies.mean()

# Mean 10-fold accuracy for every candidate k in 1..40, indexed by k.
# (estimate_test_error is scored with "accuracy", so despite the variable
# name these values are accuracies: higher is better.)
test_errors = pd.Series(
  {k: estimate_test_error(k) for k in range(1, 41)},
  dtype=float,
)

# The best k is the one with the HIGHEST accuracy. idxmin() would pick the
# worst-performing k.
ideal_k = int(test_errors.idxmax())
print("ideal k:", ideal_k)

# Use the k selected by the search above. A hard-coded n_neighbors=10 was used
# here before, which threw away the result of the 40-value search.
pipeline = make_pipeline(
  col_transformer,
  KNeighborsClassifier(n_neighbors=ideal_k, metric="cosine"))


scores = cross_val_score(
  pipeline,
  X = X_train,
  y = y_train,
  scoring="accuracy",
  cv=10
)

print("mean accuracy:", scores.mean())

fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'Accuracy': scores})

fig = px.bar(df_scores, x='Fold', y='Accuracy',
             title='Accuracy Scores for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'Accuracy': 'Accuracy Score'})

fig.show()

In [None]:
#Model 4 Graph
# Regress TIME OCC on area, crime type, premise, victim age and victim sex.

X_train = df_clean[['AREA NAME', 'Crm Cd Desc', 'Premis Desc', 'Vict Age', 'Vict Sex']]
y_train = df_clean['TIME OCC']

col_transformer = make_column_transformer(
    (StandardScaler(), ['Vict Age']),
    (OneHotEncoder(handle_unknown='ignore'), ['Crm Cd Desc', 'Premis Desc', 'Vict Sex','AREA NAME']),
    remainder='passthrough')

# Compare distance metrics at a fixed k=20 using 10-fold cross-validation.
distances = ['euclidean','manhattan','minkowski','cosine']
for distance in distances:
  pipeline = make_pipeline(
    col_transformer,
    KNeighborsRegressor(n_neighbors=20, metric=distance)
  )

  scores = cross_val_score(
    pipeline,
    X=X_train,
    y=y_train,
    cv=10,
    scoring="neg_mean_squared_error"
  )

  # The scorer returns negated MSE; negate the mean and take the square root
  # to report an RMSE. A bare expression inside a loop is never displayed by
  # the notebook, so print it explicitly.
  print(distance, np.sqrt(-scores.mean()))

def estimate_test_error(k):
  """Estimate the RMSE of a Manhattan-distance KNN regressor with ``k`` neighbors.

  The pipeline is built from the notebook-level ``col_transformer`` and
  evaluated on the notebook-level ``X_train`` / ``y_train`` with 10-fold
  cross-validation.

  Args:
    k: Number of neighbors for ``KNeighborsRegressor``.

  Returns:
    The square root of the mean squared error averaged over the 10 folds
    (lower is better).
  """
  regressor = KNeighborsRegressor(metric="manhattan", n_neighbors=k)
  pipeline_knn = make_pipeline(col_transformer, regressor)
  # The scorer yields negated MSE values, one per fold.
  neg_mse_per_fold = cross_val_score(
      pipeline_knn, X_train, y_train, scoring="neg_mean_squared_error", cv=10
  )
  mean_mse = -neg_mse_per_fold.mean()
  return np.sqrt(mean_mse)

# Cross-validated RMSE for every candidate k in 1..40, indexed by k.
test_errors = pd.Series(
  {k: estimate_test_error(k) for k in range(1, 41)},
  dtype=float,
)

# These are errors, so the best k is the one with the LOWEST value.
ideal_k = int(test_errors.idxmin())
print("ideal k:", ideal_k)

pipeline = make_pipeline(
  col_transformer,
  KNeighborsRegressor(n_neighbors=ideal_k, metric="manhattan"))


# The scorer returns negated MSE; flip the sign to get per-fold MSE.
scores = -cross_val_score(
    pipeline,
    X = X_train,
    y = y_train,
    scoring="neg_mean_squared_error",
    cv=10
)

print("mean MSE:", scores.mean())

# The plotted values are mean squared errors, not accuracies, so the column,
# axis label and title say so.
fold_numbers = range(1, len(scores) + 1)
df_scores = pd.DataFrame({'Fold': fold_numbers, 'MSE': scores})

fig = px.bar(df_scores, x='Fold', y='MSE',
             title='Mean Squared Error for Each Fold of Cross-Validation',
             labels={'Fold': 'Fold Number', 'MSE': 'Mean Squared Error'})

fig.show()

We chose this simple but effective method to show that the outcomes of our models were consistent across folds, varying only slightly from one fold to the next. Although we weren't able to get a good prediction from any model except our last, we still believe these visualizations show that our models functioned correctly in sorting through the large amount of data for the factors involved. Overall, we wanted to show how consistently our code was able to produce these values.