In [157]:
import pandas as pd
import numpy as np
import warnings
import statsmodels.formula.api as smf
from scipy import stats




# Suppress annoying harmless error.
warnings.filterwarnings(
    action="ignore",
    module="scipy",
    message="^internal gelsd"
)


#import my different data sets.
df = pd.read_csv('ny.csv')
dfIL = pd.read_excel('illinois.xls', index = False)
#Name columns and drop ones that are text or have significant NAs making them unusable. Illinios does


df.columns = (['City', 'Population', 'ViolentCrime', 'Murder', 'Rape1', 'Rape', 'Robbery', 'AggravatedAssault',
               'PropertyCrime', 'Burglary', 'Larceny', 'MVTheft', 'Arson'])

df = df.drop(['City', 'Rape1' ], axis=1)

dfIL.columns = (['City', 'Population', 'ViolentCrime', 'Murder', 'Rape1', 'Rape', 'Robbery', 'AggravatedAssault',
               'PropertyCrime', 'Burglary', 'Larceny', 'MVTheft', 'Arson'])

dfIL = dfIL.drop(['City', 'Rape1'], axis=1)


#Drop na's at the end
df = df.dropna(subset = ['Population'])
dfIL = dfIL.dropna(subset = ['Population'])



#Drop weird header data
df = df.iloc[4:]
dfIL = dfIL.iloc[4:]






In [158]:
#Clean data in a more pythonic way
import math
df.fillna(0, inplace = True)
dfIL.fillna(0, inplace = True)

#function to strip and convert vales to floats
def fix(df):
    for col in df.columns:
        df[col]= df[col].apply(lambda x: str(x).replace(',', ''))
        df[col]= df[col].apply(lambda x: str(x).strip())
        df[col]= df[col].apply(lambda x: float(x))
        
#Fix data    
fix(df)
fix(dfIL)

df['State'] = 'NY'
dfIL['State'] = 'IL'

df = pd.concat([df, dfIL])

In [159]:
#Drop NY and Chicago. 
df.drop(df[df.Population > 1000000].index, inplace=True)
df.sort_values('Population').tail(4)

Unnamed: 0,Population,ViolentCrime,Murder,Rape,Robbery,AggravatedAssault,PropertyCrime,Burglary,Larceny,MVTheft,Arson,State
350,199134.0,1036.0,6.0,25.0,390.0,615.0,2368.0,470.0,1662.0,236.0,10.0,NY
21,200551.0,601.0,4.0,53.0,119.0,425.0,2972.0,576.0,2285.0,111.0,20.0,IL
276,210562.0,2107.0,42.0,92.0,918.0,1055.0,10051.0,2587.0,6855.0,609.0,132.0,NY
39,258789.0,3249.0,47.0,145.0,1322.0,1735.0,12491.0,3458.0,8076.0,957.0,0.0,NY


In [160]:
#Enhance
columns = (['Population', 'ViolentCrime', 'Murder', 'Rape', 'Robbery', 'AggravatedAssault',
               'PropertyCrime', 'Burglary', 'Larceny', 'MVTheft', 'Arson'])
for col in columns:
    df[col] = df[col].apply(lambda x: 1 if x <1 else x)
df.head(5)

#Transform data
df['PopulationLog'] = np.log(df['Population'])
df['ViolentCrimeLog'] = np.log(df['ViolentCrime'])
df['MurderLog'] = np.log(df['Murder'])
df['BurglaryLog'] = np.log(df['Burglary'])
df['LarcenyLog'] = np.log(df['Larceny'])

#Transform and define class column
#Using a 0 as low crime and 1 as high crime (threshold is the median)
df['PropertyCrime'] = df['PropertyCrime'].apply(lambda x: 0 if x < 114 else 1)
#split databack to NY and IL 
dfIL = df.loc[df['State'] == 'IL']
dfNY = df.loc[df['State'] == 'NY']

y_IL = dfIL['PropertyCrime']
y_NY = dfNY['PropertyCrime']
dfIL = dfIL.drop(['PropertyCrime', 'State' ], axis=1)
dfNY = dfNY.drop(['PropertyCrime', 'State' ], axis=1)


In [161]:
# Create training and test sets
X_train = dfNY
X_test = dfIL
y_train = y_NY
y_test = y_IL


## Logistic Regression

In [162]:
# Import model
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [163]:
# Instantiate and set regularization coefficient to high
lr = LogisticRegression(C=1e9)


# Fit model
fit = lr.fit(X_train, y_train)

# Get results
print('Coefficient')
coef = fit.coef_
print(coef)
print(fit.intercept_)

# Get predictions
pred_y = lr.predict(X_test)

print('\nAccuracy of Low/High Crime')
print(pd.crosstab(pred_y, y_test))

lr_scores = cross_val_score(lr, X_test, y_test, cv=5)
print('\nPercentage accuracy')
print(lr_scores)
print('Mean:', lr_scores.mean())

Coefficient
[[  2.12009357e-05   5.04982559e-02  -9.25377860e-02  -7.66275882e-02
   -4.36145638e-02  -4.02407277e-02   8.27619529e-02   6.41689848e-02
    4.23801540e-02  -9.06488028e-02  -7.03253296e-01  -2.01719651e-02
    1.36532403e-04  -1.14356464e-01  -2.65666416e-01]]
[-0.09273953]

Accuracy of Low/High Crime
PropertyCrime    0    1
row_0                  
0              250    8
1                2  241

Percentage accuracy
[ 0.99009901  1.          0.99        0.99        0.95959596]
Mean: 0.985938993899


Logistic Classification produces only 8 false positives and 2 false negatives from 500 rows. The accuracy holds rather well with the exception of the last fold.

## Ridge Logistic Regression

In [164]:
# Generate range of alpha values
alphas = np.arange(0.1, 10, 1)
lr_ridge = LogisticRegression(penalty='l2')
ridge_r_squared = []

# Train/test with different alphas
for a in alphas:
    lr_ridge.set_params(C=a, fit_intercept=False)
    lr_ridge.fit(X_train, y_train)
    y_pred = lr_ridge.predict(X_test)
    ridge_r_squared.append(lr_ridge.score(X_test, y_test))

In [165]:
# Get values
ridge_r_squared
#Why are all my r2's the same?

[0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067]

In [166]:
# Instantiate and set regularization coefficient 
lr_ridge = LogisticRegression(penalty='l2', C=0.1, fit_intercept=False)
#Performs slightly better with l2 instead of l1

# Fit model
lr_ridge.fit(X_train, y_train)

# Get results
print('Coefficient')
print(lr_ridge.coef_)
print(lr_ridge.intercept_)

# Get predictions
pred_y_r = lr_ridge.predict(X_test)

print('\nAccuracy by Low/High Crime')
print(pd.crosstab(pred_y_r, y_test))

ridge_scores = cross_val_score(lr_ridge, X_test, y_test, cv=5)
print('\nPercentage accuracy')
print(ridge_scores)
print('Mean:', ridge_scores.mean())

Coefficient
[[  1.64578284e-05   4.88358350e-02  -8.93850053e-02  -7.40533677e-02
   -4.21861359e-02  -3.84152048e-02   7.88506877e-02   6.15930135e-02
    3.97926467e-02  -8.75623915e-02  -6.79090898e-01  -1.94607590e-02
    1.29590302e-04  -1.10243090e-01  -2.56242785e-01]]
0.0

Accuracy by Low/High Crime
PropertyCrime    0    1
row_0                  
0              250    8
1                2  241

Percentage accuracy
[ 0.99009901  1.          0.99        0.99        0.96969697]
Mean: 0.98795919592


The number of errors is consistent, my accuracy went up slightly(how?) as did the performance across folds.

### LASSO Logistic Regression


In [167]:
# Generate range of alphas
alphas = np.arange(0.01, 1, 0.1)
lr_lasso = LogisticRegression(penalty='l2')
#Again l2 performs better than l1
lasso_r_squared = []

# Train model with different regularization values
for a in alphas:
    lr_lasso.set_params(C=a, fit_intercept=False)
    lr_lasso.fit(X_train, y_train)
    y_pred = lr_lasso.predict(X_test)
    lasso_r_squared.append(lr_lasso.score(X_test, y_test))

In [168]:
# Get values
lasso_r_squared

[0.98602794411177641,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067,
 0.98003992015968067]

In [169]:
# Instantiate and set regularization coefficient to .01
lr_lasso = LogisticRegression(penalty='l2', C=.01)

# Fit model
lr_lasso.fit(X_train, y_train)

# Get results
print('Coefficient')
print(lr_lasso.coef_)
print(lr_lasso.intercept_)

# Get predictions
pred_y_l = lr_lasso.predict(X_test)

print('\nAccuracy by Low/High Crime')
print(pd.crosstab(pred_y_l, y_test))

lasso_scores = cross_val_score(lr_lasso, X_test, y_test, cv=5)
print('\nPercentage accuracy')
print(lasso_scores)
print('Mean:', lasso_scores.mean())

Coefficient
[[ -1.14518072e-05   2.80233478e-02  -5.38823713e-02  -4.57595809e-02
   -2.21077679e-02  -2.10519708e-02   4.38965202e-02   4.05173926e-02
    5.89818439e-03  -5.27797499e-02  -4.09779555e-01  -1.28688373e-02
    1.13637578e-04  -6.31342569e-02  -1.49538664e-01]]
[-0.05405021]

Accuracy by Low/High Crime
PropertyCrime    0    1
row_0                  
0              250    5
1                2  244

Percentage accuracy
[ 0.98019802  1.          1.          0.99        0.94949495]
Mean: 0.983938593859


Suprisingly, the accuracy goes down slightly, but the number of type 1 erros drops by 3, and the last fold performs worse than the other models, which makes no sense.

I would choose the Lasso Regression model. It reduced my false positives and having worked with this dataset, I know some of the variables didnt correlate very well with the outcome and I would prefer to reduce the number of features in my model as much as I can.