In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import session
from sqlalchemy import create_engine, func
import psycopg2
from psycopg2 import OperationalError, errorcodes, errors
import numpy as np

In [1]:
from config import db_password

In [4]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/postgres"
engine = create_engine(db_string)

In [6]:
covid_crime_df = pd.read_sql_table('complete_2020', engine)

In [7]:
covid_crime_df.head()

Unnamed: 0,zipcode,mar2020,apr2020,may2020,jun2020,jul2020,aug2020,sep2020,oct2020,nov2020,dec2020,incident_number,offense_description,family_violence,date,location_type,category_description
0,78704,62,1523,3098,5391,17394,24714,25147,30246,36122,51696,20205000000.0,ROBBERY BY ASSAULT,N,2020-01-01,STREETS / HWY / ROAD / ALLEY,Robbery
1,78756,12,106,190,392,1742,2529,2465,3161,4016,5891,20205000000.0,THEFT FROM BUILDING,N,2020-01-01,RESIDENCE / HOME,Theft
2,78759,34,446,649,1413,7266,12513,12781,15712,18367,25876,20205000000.0,THEFT,N,2020-01-01,RESIDENCE / HOME,Theft
3,78723,19,576,2009,4303,14693,21775,21202,25520,28927,36150,202021300.0,AUTO THEFT,N,2020-01-01,PARKING LOTS / GARAGE,Auto Theft
4,78759,34,446,649,1413,7266,12513,12781,15712,18367,25876,202031300.0,BURGLARY OF RESIDENCE,N,2020-01-01,RESIDENCE / HOME,Burglary


In [8]:
# Check data types
covid_crime_df.dtypes

zipcode                         object
mar2020                          int64
apr2020                          int64
may2020                          int64
jun2020                          int64
jul2020                          int64
aug2020                          int64
sep2020                          int64
oct2020                          int64
nov2020                          int64
dec2020                          int64
incident_number                float64
offense_description             object
family_violence                 object
date                    datetime64[ns]
location_type                   object
category_description            object
dtype: object

In [9]:
covid_crime_df["rates"] = covid_crime_df["mar2020"] + covid_crime_df["apr2020"] + covid_crime_df["may2020"] + covid_crime_df["jun2020"] + covid_crime_df["jul2020"] + covid_crime_df["aug2020"] + covid_crime_df["sep2020"] + covid_crime_df["oct2020"] + covid_crime_df["nov2020"] + covid_crime_df["dec2020"]

In [10]:
covid_crime_df.head()

Unnamed: 0,zipcode,mar2020,apr2020,may2020,jun2020,jul2020,aug2020,sep2020,oct2020,nov2020,dec2020,incident_number,offense_description,family_violence,date,location_type,category_description,rates
0,78704,62,1523,3098,5391,17394,24714,25147,30246,36122,51696,20205000000.0,ROBBERY BY ASSAULT,N,2020-01-01,STREETS / HWY / ROAD / ALLEY,Robbery,195393
1,78756,12,106,190,392,1742,2529,2465,3161,4016,5891,20205000000.0,THEFT FROM BUILDING,N,2020-01-01,RESIDENCE / HOME,Theft,20504
2,78759,34,446,649,1413,7266,12513,12781,15712,18367,25876,20205000000.0,THEFT,N,2020-01-01,RESIDENCE / HOME,Theft,95057
3,78723,19,576,2009,4303,14693,21775,21202,25520,28927,36150,202021300.0,AUTO THEFT,N,2020-01-01,PARKING LOTS / GARAGE,Auto Theft,155174
4,78759,34,446,649,1413,7266,12513,12781,15712,18367,25876,202031300.0,BURGLARY OF RESIDENCE,N,2020-01-01,RESIDENCE / HOME,Burglary,95057


In [11]:
#Drop columns
covid_crime_df = covid_crime_df.drop(columns=['mar2020', 'apr2020', 'may2020', 'jun2020', 'jul2020', 'aug2020', 'sep2020', 'oct2020','nov2020', 'dec2020'], axis=1)

In [12]:
# Drop additional columns
covid_crime_df = covid_crime_df.drop(columns=['offense_description', 'family_violence', 'date', 'location_type', 'category_description'])

In [13]:
covid_crime_df

Unnamed: 0,zipcode,incident_number,rates
0,78704,2.020500e+10,195393
1,78756,2.020500e+10,20504
2,78759,2.020500e+10,95057
3,78723,2.020213e+08,155174
4,78759,2.020313e+08,95057
...,...,...,...
40153,78644,,513
40154,78634,,1671
40155,78663,,24
40156,78640,,2108


In [14]:
covid_crime_df.dropna()

Unnamed: 0,zipcode,incident_number,rates
0,78704,2.020500e+10,195393
1,78756,2.020500e+10,20504
2,78759,2.020500e+10,95057
3,78723,2.020213e+08,155174
4,78759,2.020313e+08,95057
...,...,...,...
40148,78759,2.021207e+08,95057
40149,78701,2.020366e+10,56747
40150,78758,2.020366e+10,306412
40151,78759,2.020505e+10,95057


In [17]:
covid_crime_df = covid_crime_df.reset_index()

In [18]:
y = covid_crime_df['incident_number']
X = covid_crime_df.drop(columns= 'incident_number')

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.30, random_state=75)

In [20]:
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [21]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='warn', penalty='12',
   random_state=1, solver='lbfgs', tol=0.0001, warm_start=False)

LogisticRegression(multi_class='warn', penalty='12', random_state=1)

In [22]:
classifier.fit(X_train, y_train)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
accuracy_score(y_test, predictions)