# Run this notebook after running "Crime_EDA_K" and "HousingData_EDA_K"

In [1]:
#initial imports
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [2]:
#import cleaned df 
housing_df = pd.read_csv('Resources/HousingData_clean.csv', index_col = [0])
crime_df = pd.read_csv('Resources/2018To2021CrimeData_clean.csv', index_col = [0])

In [3]:
#preview housing df 
housing_df.head()

Unnamed: 0,city,streetAddress,zipcode,description,latitude,longitude,propertyTaxRate,hasAssociation,homeType,yearBuilt,latestPrice,numPriceChanges,latest_saledate,latest_salemonth,latest_saleyear,latestPriceSource,lotSizeSqFt,livingAreaSqFt
0,pflugerville,14424 Lake Victor Dr,78660,"14424 Lake Victor Dr, Pflugerville, TX 78660 i...",30.430632,-97.663078,1.98,True,Single Family,2012,305000.0,5,2019-09-02,9,2019,"Coldwell Banker United, Realtors - South Austin",6011.0,2601.0
1,pflugerville,1104 Strickling Dr,78660,Absolutely GORGEOUS 4 Bedroom home with 2 full...,30.432673,-97.661697,1.98,True,Single Family,2013,295000.0,1,2020-10-13,10,2020,Agent Provided,6185.0,1768.0
2,pflugerville,1408 Fort Dessau Rd,78660,Under construction - estimated completion in A...,30.409748,-97.639771,1.98,True,Single Family,2018,256125.0,1,2019-07-31,7,2019,Agent Provided,7840.0,1478.0
3,pflugerville,1025 Strickling Dr,78660,Absolutely darling one story home in charming ...,30.432112,-97.661659,1.98,True,Single Family,2013,240000.0,4,2018-08-08,8,2018,Agent Provided,6098.0,1678.0
4,pflugerville,15005 Donna Jane Loop,78660,Brimming with appeal & warm livability! Sleek ...,30.437368,-97.65686,1.98,True,Single Family,2002,239900.0,3,2018-10-31,10,2018,Agent Provided,6708.0,2132.0


In [4]:
#create an hour column for the ocurred time
crime_df['hour'] = pd.DatetimeIndex(crime_df['Occurred_Date_Time']).hour

In [6]:
#preview crime df 
crime_df.head()

Unnamed: 0,Incident_Number,Highest_Offense_Description,Highest_Offense_Code,Family_Violence,Occurred_Date_Time,Occurred_Date,Occurred_Time,Report_Date_Time,Report_Date,Report_Time,...,Clearance_Date,UCR_Category,Category_Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location,year,hour
1,20195014472,BURGLARY OF VEHICLE,601,N,2019-04-08 22:00:00,2019-04-08,1970-01-01 00:00:00.000002200,2019-04-09 13:09:00,2019-04-09,1970-01-01 00:00:00.000001309,...,04/09/2019,23F,Theft,3119486.0,3119486.0,30.292247,-97.725763,"(30.29224653, -97.72576272)",2019.0,22
5,20191561862,POSSESSION OF MARIJUANA,1803,N,2019-06-05 22:35:00,2019-06-05,1970-01-01 00:00:00.000002235,2019-06-05 22:35:00,2019-06-05,1970-01-01 00:00:00.000002235,...,01/01/1888,0,,3108421.0,3108421.0,30.193941,-97.763449,"(30.19394123, -97.76344868)",2019.0,22
6,20191511203,EVADING / FOOT,2723,N,2019-05-31 14:58:00,2019-05-31,1970-01-01 00:00:00.000001458,2019-05-31 14:58:00,2019-05-31,1970-01-01 00:00:00.000001458,...,06/04/2019,0,,3099373.0,3099373.0,30.225732,-97.791259,"(30.22573152, -97.79125883)",2019.0,14
7,2019990714,BURGLARY OF RESIDENCE,500,N,2019-04-09 12:06:00,2019-04-09,1970-01-01 00:00:00.000001206,2019-04-09 12:06:00,2019-04-09,1970-01-01 00:00:00.000001206,...,06/04/2019,220,Burglary,3111980.0,3111980.0,30.201607,-97.751976,"(30.2016066, -97.75197579)",2019.0,12
10,20191561277,POSS CONTROLLED SUB/NARCOTIC,1800,N,2019-06-05 17:53:00,2019-06-05,1970-01-01 00:00:00.000001753,2019-06-05 17:53:00,2019-06-05,1970-01-01 00:00:00.000001753,...,06/07/2019,0,,3179352.0,3179352.0,30.349163,-97.534373,"(30.3491627, -97.5343733)",2019.0,17


# Merge crime and housing data sets by zipcode

In [10]:
#change crime_df zipcode column to match housing_df
crime_df.rename(columns = {'Zip_Code':'zipcode'}, inplace = True)

In [11]:
#change zipcode in crime_df from float to int 
crime_df['zipcode'] = crime_df['zipcode'].astype(int)

In [12]:
#group crime_df on zipcodes
group_df = crime_df.groupby(by = crime_df['zipcode'])

In [13]:
#get the count of occurances by zipcode
group_df2 = group_df.count()

In [14]:
#preview grouped df 
group_df2.head()

Unnamed: 0_level_0,Incident_Number,Highest_Offense_Description,Highest_Offense_Code,Family_Violence,Occurred_Date_Time,Occurred_Date,Occurred_Time,Report_Date_Time,Report_Date,Report_Time,...,Clearance_Date,UCR_Category,Category_Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location,year,hour
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78610,24,24,24,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
78612,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
78613,1963,1963,1963,1963,1963,1963,1963,1963,1963,1963,...,1963,1963,1963,1963,1963,1940,1940,1940,1963,1963
78616,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
78617,3497,3497,3497,3497,3497,3497,3497,3497,3497,3497,...,3497,3497,3497,3497,3497,3417,3417,3417,3497,3497


# Practice merging on pandas -- final merge will be on SQL

In [15]:
#merge the dataframes on grouping 
new_merge = pd.merge(housing_df[['latestPrice','propertyTaxRate','zipcode']], group_df2[['Incident_Number','hour']], on ='zipcode',how = 'left')


In [16]:
#change name of Incident_Number column to Count 
new_merge = new_merge.rename(columns={"Incident_Number": "Count_Crimes"})

In [17]:
#preview merge 
new_merge.sample(5)

Unnamed: 0,latestPrice,propertyTaxRate,zipcode,Count_Crimes,hour
4294,533000.0,2.01,78737,65.0,65.0
2747,320000.0,1.98,78724,6777.0,6777.0
8128,749000.0,2.21,78717,3593.0,3593.0
14736,582000.0,1.98,78751,10431.0,10431.0
9241,780000.0,1.98,78702,17436.0,17436.0


In [18]:
#check datatypes of merged df 
new_merge.dtypes

latestPrice        float64
propertyTaxRate    float64
zipcode              int64
Count_Crimes       float64
hour               float64
dtype: object

In [19]:
#check if theres null values from merge-- nulls will be 0 crimes
new_merge.isnull().sum(axis = 0)

latestPrice         0
propertyTaxRate     0
zipcode             0
Count_Crimes       12
hour               12
dtype: int64

In [20]:
#fill null values with zero 
new_merge[["Count_Crimes","hour"]] = new_merge[["Count_Crimes","hour"]].fillna(0)

In [21]:
#check if theres null values from merge-- nulls will be 0 crimes
new_merge.isnull().sum(axis = 0)

latestPrice        0
propertyTaxRate    0
zipcode            0
Count_Crimes       0
hour               0
dtype: int64

# Machine Learning Model with merged data

In [22]:
new_merge.corr()

Unnamed: 0,latestPrice,propertyTaxRate,zipcode,Count_Crimes,hour
latestPrice,1.0,-0.062179,-0.145946,-0.14445,-0.14445
propertyTaxRate,-0.062179,1.0,-0.172846,-0.185072,-0.185072
zipcode,-0.145946,-0.172846,1.0,0.110361,0.110361
Count_Crimes,-0.14445,-0.185072,0.110361,1.0,1.0
hour,-0.14445,-0.185072,0.110361,1.0,1.0


In [23]:
X = new_merge[["Count_Crimes","hour","propertyTaxRate"]]
y = new_merge[["latestPrice"]]

In [24]:
model = linear_model.LinearRegression()

In [25]:
model.fit(X,y)

LinearRegression()

In [26]:
y_pred = model.predict(X)
y_pred.shape

(15171, 1)

In [27]:
print(model.coef_)
print(model.intercept_)

[[-3.77298222e+00 -3.77298302e+00 -7.85716829e+05]]
[2163779.95823627]


In [28]:
r2_score(y, y_pred)

0.02905161838810788

# Attempting SVM 

In [29]:
#encoding home association to numbers 1= True, 0 = False
to_num = {True:1,False:0}
housing_df["Association"] = housing_df["hasAssociation"].apply(lambda x: to_num[x])
housing_df.sample(5)

Unnamed: 0,city,streetAddress,zipcode,description,latitude,longitude,propertyTaxRate,hasAssociation,homeType,yearBuilt,latestPrice,numPriceChanges,latest_saledate,latest_salemonth,latest_saleyear,latestPriceSource,lotSizeSqFt,livingAreaSqFt,Association
1868,austin,11102 Bending Bough Trl,78758,One story in highly sought after Quail Hollow!...,30.385311,-97.699959,1.98,False,Single Family,1978,459225.0,8,2020-06-04,6,2020,Agent Provided,10306.0,2100.0,0
13229,austin,11101 Cusseta Ln,78739,"Former model home for DR Horton, \r\nDramatic ...",30.190371,-97.905396,1.98,True,Single Family,2002,479900.0,9,2019-10-23,10,2019,Agent Provided,13503.6,4237.0,1
6509,austin,9305 Lightwood Loop,78748,Lovely 4 bedroom 2 bath single story home in C...,30.189196,-97.841728,1.98,True,Single Family,1993,425000.0,1,2020-05-29,5,2020,Broker Provided,8319.0,2170.0,1
11033,austin,11908 Lansdowne Rd,78754,Charming one story with open floor plan. Forma...,30.357208,-97.610085,1.98,True,Single Family,2007,235000.0,3,2019-09-04,9,2019,Agent Provided,7884.0,1858.0,1
13192,austin,5412 Korth Dr,78749,Single story home in Village at Western Oaks. ...,30.204681,-97.869492,1.98,True,Single Family,2001,405000.0,1,2018-05-29,5,2018,Agent Provided,7187.0,1902.0,1


In [50]:
housing_df["Association"].value_counts()

1    8007
0    7164
Name: Association, dtype: int64

In [30]:
svm_merge = pd.merge(housing_df[['latestPrice','propertyTaxRate','zipcode'
                                 ,'Association']], group_df2[[
    'Incident_Number','hour']], on ='zipcode',how = 'left')

In [31]:
svm_merge = svm_merge.rename(columns={"Incident_Number": "Count_Crimes"})

In [32]:
svm_merge.isnull().sum(axis = 0)

latestPrice         0
propertyTaxRate     0
zipcode             0
Association         0
Count_Crimes       12
hour               12
dtype: int64

In [33]:
svm_merge[["Count_Crimes","hour"]] = new_merge[["Count_Crimes","hour"]].fillna(0)

In [34]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [35]:
X = svm_merge[["Count_Crimes"]]
y = svm_merge[["Association"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state=42)

In [36]:
model = SVC(kernel = 'linear')
model.fit(X_train, y_train)

  return f(*args, **kwargs)


SVC(kernel='linear')

In [37]:
y_pred = model.predict(X_test)

In [43]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.5264961771684682

In [44]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[   0, 1791],
       [   5, 1997]])

In [42]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1791
           1       0.53      1.00      0.69      2002

    accuracy                           0.53      3793
   macro avg       0.26      0.50      0.34      3793
weighted avg       0.28      0.53      0.36      3793

