# Part I: Classification

## Gradient Descent

### Problem 1

In [1]:
from sklearn import linear_model
def grad_descent(X, y, T, alpha):
    m, n = X.shape # m = #examples, n = #features
    theta = np.zeros(n) # initialize parameters
    f = np.zeros(T) # track loss over time
    for i in range(T):
        # loss for current parameter vector theta
        f[i] = 0.5*np.linalg.norm(X.dot(theta) - y)**2
        # compute steepest ascent at f(theta)
        g = np.transpose(X).dot(X.dot(theta) - y)
        # step down the gradient
        theta = theta - alpha*g
    return theta, f

### Problem 2

$log\left(\frac{p_i(\beta^k)}{1 - p_i(\beta^k)}\right) = f_i\left(\beta^k\right)$

$\frac{p_i(\beta^k)}{1 - p_i(\beta^k)} = e^\left(f_i\left(\beta^k\right)\right)$ Thus,

$p_i(\beta^k) = \frac{e^\left(f_i\left(\beta^k\right)\right)}{1 + e^\left(f_i\left(\beta^k\right)\right)}$

Given that:

$L\left(\beta^0, \beta^1\right) = \prod_{i = 1}^{n} {p_i(\beta^k)^(y_i)((1 - p_i(\beta^k))^\left(1 - y_i\right)}$

Applying log on the equation above creates the log likelihood loss function:

$l\left(\beta^0, \beta^1\right) = \sum_{i = 1}^{n} {y_i(f_i(\beta^k))^(y_i)} - \sum_{i = 1}^{n} {log(1 +f_i(\beta^k))}$

Take the derivative of the log likelihood loss function:

$\frac{\partial l} {\partial \beta^k} = -\sum_{i = 1}^{n}{\frac{e^\left(f_i\left(\beta^k\right)\right)}{1 + e^\left(f_i\left(\beta^k\right)\right)}} - \sum_{i = 1}^{n} {y_ix_i} = -\sum_{i = 1}^{n}{p_i(\beta^k)x_i} - \sum_{i = 1}^{n} {y_ix_i} = \sum_{i = 1}^{n} {(y_i - p_i(\beta^k))x_i}$

If we apply Newton's method with stepsize $\alpha$ we obtain:

$\beta^\left(k+1\right) = \beta^k + \alpha\sum_{i = 1}^{n}{(y_i-p_i(\beta^k))x_i}$


### Problem 3

In [16]:
import math
def grad_descent_log(X, y, T, alpha):
    m, n = X.shape
    theta = np.zeros(n)
    f = np.zeros(m) 
    for t in range(T):
        # Compute loss and gradient by taking partial derivative of our loss function with respect to our weights theta
        for i in range(m):
            f[i] = y - (math.exp(np.dot(theta, x))) / (1 + math.exp(np.dot(theta, x)))
            g = f[i] * X[i]
        # Update theta by taking a step in direction of gradient
        theta = (theta + alpha * g)/ np.linalg.norm(theta)
    return theta, f



### Problem 4

In [1]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt
#simulate data for linear regression

gen_data_x, gen_data_y = sklearn.datasets.make_regression(n_samples=100, n_features=20, noise = 1.5)

#simulate data for logistic regression.  This is similar to linear, only now values are either 0 or 1.  
log_gen_data_x, dump_y = sklearn.datasets.make_regression(n_samples=100, n_features=20, noise = 1.5)
log_gen_data_y = [0 if i>0 else 1 for i in dump_y]


#a really bad estimator
#returns random vector as estimated parameters
dummy = np.ndarray([100, 20])
for index, row in enumerate(dummy):
    dummy[index] = np.random.normal(0, .1, 20)
plt.plot(gen_data_x, dummy)

(theta, f) = grad_descent(gen_data_x, gen_data_y, 500, 0.01)
fig, ax = plt.subplots(figsize = (15,10))
r = linear_model.LinearRegression()
r.fit(gen_data_x, gen_data_y)
ax.scatter(r.coef_, theta)
ax.set_title("Linear Regression Model")
ax.set_xlabel("Simulation Parameter")
ax.set_ylabel("Estimated Parameter")
fig.show()

(theta1, f1) = grad_descent_log(log_gen_data_x, log_gen_data_y, 500, 0.01)
fig1, ax1 = plt.subplots(figsize = (15,10))
r1 = linear_model.LogisticRegression()
r1.fit(log_gen_data_x, log_gen_data_y)
ax1.scatter(r1.coef_, theta)
ax1.set_title("Logistic Regression Model")
ax1.set_xlabel("Simulation Parameter")
ax1.set_ylabel("Estimated Parameter")
fig1.show()

NameError: name 'sklearn' is not defined

## Try it out

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import tree
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

# Import dataset (classification)
iris = datasets.load_iris()
gen_data_x, gen_data_y = datasets.make_regression(n_samples=20, n_features=20, noise = 1.5)

# Simulate data for logistic regression
X, gen_y = datasets.make_regression(n_samples=100, n_features=20, noise = 1.5)
y = [0 if i>20 else 1 for i in gen_y]

In [None]:
(theta1, f1) = grad_descent_log(log_gen_data_x, log_gen_data_y, 500, 0.01)
fig1, ax1 = plt.subplots(figsize = (15,10))
r1 = linear_model.LogisticRegression()
r1.fit(log_gen_data_x, log_gen_data_y)
ax1.scatter(r1.coef_, theta)
ax1.set_title("Logistic Regression Model")
ax1.set_xlabel("Simulation Parameter")
ax1.set_ylabel("Estimated Parameter")
fig1.show()

### Decision Tree Model

In [None]:
# Scores when using gradient descent
(theta,loss) = grad_descent_log(X, y, 20, 20)
estimated = np.zeros(len(y))
for i in range(len(X[0])):
    estimated += X[:, i] * theta[i]

In [None]:
model = tree.DecisionTreeClassifier()

# Get scores for ten fold validation (cv=10 means ten fold)
scores1 = cross_val_predict(model, X, y, cv=10)
print(scores1)
# T-test
stats.ttest_rel(scores1, estimated)

### Linear SVM Model

In [None]:
model2 = svm.LinearSVC()

# Get scores for ten fold validation (cv=10 means ten fold)
scores2 = cross_val_predict(model2, X, y, cv=10)
print(scores2)
# T-test
stats.ttest_rel(scores2, estimated)

Based on the 10-fold validation tests and well as the T-tests, the Linear SVM model was the best.

### 10-Fold Cross-Validation

# Part II: Interactive Data Maps

In [None]:
!pip install folium

In [None]:
import folium
import requests
import pandas as pd
import json
import re
from folium.map import *
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap

arrest_table = pd.read_csv("http://www.hcbravo.org/IntroDataSci/misc/BPD_Arrests.csv")
arrest_table = arrest_table[pd.notnull(arrest_table["Location 1"])]

arrest_table["lat"], arrest_table["long"] = arrest_table["Location 1"].str.split(",").str
arrest_table["lat"] = arrest_table["lat"].str.replace("(", "").astype(float)
arrest_table["long"] = arrest_table["long"].str.replace(")", "").astype(float)
arrest_table['count'] = 1 # arrest occurence 
arrest_table     

In [None]:
map_osm = folium.Map(location=[39.29, -76.61], zoom_start=11)   
map_osm

In [13]:

df = arrest_table['neighborhood'].value_counts().reset_index()
df.columns = ['neighborhood', 'Total_Arrests']
df.set_index('neighborhood', inplace=True)
# MB = Black Male, FB = Black Female, MW = White Male, FW = White Female, MU = Unknow Male, FU = Unknown Female
# MA = Asian Male, FA = Asian Female, MI = American Indian or Alaska Native Male, FI = American Indian or Alaska Native Female
# MH = Hispanic or Latino Male, FH = Hispanic or Latino Female
cols = ['MB', 'FB', 'MW','FW', 'MU','FU', 'MA','FA', 'MI','FI', 'MH', 'FH']
for col in cols:
    df[col] = 0
    
arrest_table2 = arrest_table.copy()
for i,row in arrest_table2.iterrows():
    arrest_table2.at[i, 'sex_and_race'] = row[3] + row[2]  
arrest_table2 = arrest_table2.drop(labels = 'race', axis = 1)
arrest_table2 = arrest_table2.groupby('neighborhood') 

for i, row in arrest_table2:
    # Count number of arrests for each individual sex and race
    sex_and_race = row['sex_and_race'].value_counts()
    for s in sex_and_race.index:
        df.loc[i, s] = sex_and_race[s]
df

Unnamed: 0_level_0,Total_Arrests,MB,FB,MW,FW,MU,FU,MA,FA,MI,FI,MH,FH
neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Downtown,3221,2108,474,452,138,28,9,4,1,6,1,0,0
Sandtown-Winchester,2705,2296,351,41,11,2,0,1,1,1,0,1,0
Central Park Heights,1771,1521,193,34,15,4,0,3,0,0,0,0,0
Broadway East,1617,1315,223,41,27,4,1,3,2,1,0,0,0
Belair-Edison,1534,1213,240,51,15,12,1,0,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
Dundalk Marine Terminal,3,2,0,0,1,0,0,0,0,0,0,0,0
Taylor Heights,3,0,1,2,0,0,0,0,0,0,0,0,0
Blythewood,2,2,0,0,0,0,0,0,0,0,0,0,0
Wyndhurst,2,1,1,0,0,0,0,0,0,0,0,0,0


In [35]:
# Gets the geolocation of Baltimore's neighborhoods
geo_json = 'https://opendata.arcgis.com/datasets/fc5d183b20a145009eae8f8b171eeb0d_0.geojson'
geo_data = json.loads(requests.get(geo_json).text)

# Creates the feature labels for the choropleth map
for feature in geo_data['features']:
    neighborhood = feature['properties']['LABEL']
    feature['properties']['Neighborhood'] = str(neighborhood)
    if neighborhood in df.index:
        feature['properties']['Total Arrests'] = str(df['Total_Arrests'][neighborhood])
        feature['properties']['Black Males'] = str(df['MB'][neighborhood])
        feature['properties']['Black Females'] = str(df['FB'][neighborhood])
        feature['properties']['White Males'] = str(df['MW'][neighborhood])
        feature['properties']['White Females'] = str(df['FW'][neighborhood])
        feature['properties']['Unknown Males'] = str(df['MU'][neighborhood])
        feature['properties']['Unknown Females'] = str(df['FU'][neighborhood])
        feature['properties']['Asian Males'] = str(df['MA'][neighborhood])
        feature['properties']['Asian Females'] = str(df['FA'][neighborhood])
        feature['properties']['American Indian or Alaska Native Males'] = str(df['MI'][neighborhood])
        feature['properties']['American Indian or Alaska Native Females'] = str(df['FI'][neighborhood])
        feature['properties']['Hispanic or Latino Males'] = str(df['MH'][neighborhood])
        feature['properties']['Hispanic or Latino Females'] = str(df['FH'][neighborhood])
    else:
        feature['properties']['Total Arrests'] = '0'
        feature['properties']['Black Males'] = '0'
        feature['properties']['Black Females'] ='0'
        feature['properties']['White Males'] = '0'
        feature['properties']['White Females'] = '0'
        feature['properties']['Unknown Males'] = '0'
        feature['properties']['Unknown Females'] = '0'
        feature['properties']['Asian Males'] = '0'
        feature['properties']['Asian Females'] = '0'
        feature['properties']['American Indian or Alaska Native Males'] = '0'
        feature['properties']['American Indian or Alaska Native Females'] = '0'
        feature['properties']['Hispanic or Latino Males'] = '0'
        feature['properties']['Hispanic or Latino Females'] ='0'

labels = ['Total Arrests','Black Males','Black Females','White Males','White Females','Unknown Males','Unknown Females','Asian Males','Asian Females','American Indian or Alaska Native Males','American Indian or Alaska Native Females','Hispanic or Latino Males','Hispanic or Latino Females']
# Creates the choropleth map
map_choropleth = folium.Map(location=[39.29, -76.61], zoom_start=11)
choropleth = folium.Choropleth(
    geo_data = json.dumps(geo_data),
    data = df,
    columns = [df.index, 'Total_Arrests'],
    key_on = 'feature.properties.LABEL',
    fill_color = 'YlGnBu',
    fill_opacity = 0.8,
    line_opacity = 0.2,
    legend_name = 'Number of Arrests per Neighborhood',
    bins = list(df['Total_Arrests'].quantile([0, 0.2, 0.4, 0.6, 0.8, 0.9, 1])),
    highlight = True,
    name = 'Crimes Committed per Neighborhood'
).add_to(map_choropleth)

choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(labels)
)

map_choropleth

The way I classified the reported crimes was through neighborhood, race, and sex. I grouped the crimes commited by their neighborhoods and I combined race and sex in my data and counted the different "identities" of people that commited those crimes in each neighborhood. The different "identities" are Black Male, Black Female, White Male, White Female, Unknow Male, Unknown Female, Asian Male, Asian Female,American Indian or Alaska Native Male, American Indian or Alaska Native Female, Hispanic or Latino Male, Hispanic or Latino Female. The data was displayed as a choropleth map where each neighborhood was distinctly seperated with borders and hovering over the area would display a pop up that showed the total number of crimes commited as well as the crimed commited by each "identity". The neighborhoods with more crimes have darker shades of blue. Neighborhoods that are displayed as light green imply that they have little to no crime. Neighborhoods that are dispayed as black imply that there isn't any information about them regarding crimes.