# Run this notebook after running "Crime_EDA_K" and "HousingData_EDA_K"

In [1]:
#initial imports
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

In [2]:
#import cleaned df 
housing_df = pd.read_csv('Resources/HousingData_clean.csv', index_col = [0])
crime_df = pd.read_csv('Resources/2018To2021CrimeData_clean.csv', index_col = [0])

In [3]:
#preview housing df 
housing_df.head()

Unnamed: 0,city,streetAddress,zipcode,description,latitude,longitude,propertyTaxRate,hasAssociation,homeType,yearBuilt,latestPrice,numPriceChanges,latest_saledate,latest_salemonth,latest_saleyear,latestPriceSource,lotSizeSqFt,livingAreaSqFt
0,pflugerville,14424 Lake Victor Dr,78660,"14424 Lake Victor Dr, Pflugerville, TX 78660 i...",30.430632,-97.663078,1.98,True,Single Family,2012,305000.0,5,2019-09-02,9,2019,"Coldwell Banker United, Realtors - South Austin",6011.0,2601.0
1,pflugerville,1104 Strickling Dr,78660,Absolutely GORGEOUS 4 Bedroom home with 2 full...,30.432673,-97.661697,1.98,True,Single Family,2013,295000.0,1,2020-10-13,10,2020,Agent Provided,6185.0,1768.0
2,pflugerville,1408 Fort Dessau Rd,78660,Under construction - estimated completion in A...,30.409748,-97.639771,1.98,True,Single Family,2018,256125.0,1,2019-07-31,7,2019,Agent Provided,7840.0,1478.0
3,pflugerville,1025 Strickling Dr,78660,Absolutely darling one story home in charming ...,30.432112,-97.661659,1.98,True,Single Family,2013,240000.0,4,2018-08-08,8,2018,Agent Provided,6098.0,1678.0
4,pflugerville,15005 Donna Jane Loop,78660,Brimming with appeal & warm livability! Sleek ...,30.437368,-97.65686,1.98,True,Single Family,2002,239900.0,3,2018-10-31,10,2018,Agent Provided,6708.0,2132.0


In [4]:
#create an hour column for the ocurred time
crime_df['hour'] = pd.DatetimeIndex(crime_df['Occurred_Date_Time']).hour

In [5]:
#preview crime df 
crime_df.head()

Unnamed: 0,Incident_Number,Highest_Offense_Description,Highest_Offense_Code,Family_Violence,Occurred_Date_Time,Occurred_Date,Occurred_Time,Report_Date_Time,Report_Date,Report_Time,...,Clearance_Date,UCR_Category,Category_Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location,year,hour
1,20195014472,BURGLARY OF VEHICLE,601,N,2019-04-08 22:00:00,2019-04-08,1970-01-01 00:00:00.000002200,2019-04-09 13:09:00,2019-04-09,1970-01-01 00:00:00.000001309,...,04/09/2019,23F,Theft,3119486.0,3119486.0,30.292247,-97.725763,"(30.29224653, -97.72576272)",2019.0,22
5,20191561862,POSSESSION OF MARIJUANA,1803,N,2019-06-05 22:35:00,2019-06-05,1970-01-01 00:00:00.000002235,2019-06-05 22:35:00,2019-06-05,1970-01-01 00:00:00.000002235,...,01/01/1888,0,,3108421.0,3108421.0,30.193941,-97.763449,"(30.19394123, -97.76344868)",2019.0,22
6,20191511203,EVADING / FOOT,2723,N,2019-05-31 14:58:00,2019-05-31,1970-01-01 00:00:00.000001458,2019-05-31 14:58:00,2019-05-31,1970-01-01 00:00:00.000001458,...,06/04/2019,0,,3099373.0,3099373.0,30.225732,-97.791259,"(30.22573152, -97.79125883)",2019.0,14
7,2019990714,BURGLARY OF RESIDENCE,500,N,2019-04-09 12:06:00,2019-04-09,1970-01-01 00:00:00.000001206,2019-04-09 12:06:00,2019-04-09,1970-01-01 00:00:00.000001206,...,06/04/2019,220,Burglary,3111980.0,3111980.0,30.201607,-97.751976,"(30.2016066, -97.75197579)",2019.0,12
10,20191561277,POSS CONTROLLED SUB/NARCOTIC,1800,N,2019-06-05 17:53:00,2019-06-05,1970-01-01 00:00:00.000001753,2019-06-05 17:53:00,2019-06-05,1970-01-01 00:00:00.000001753,...,06/07/2019,0,,3179352.0,3179352.0,30.349163,-97.534373,"(30.3491627, -97.5343733)",2019.0,17


# Merge crime and housing data sets by zipcode

In [6]:
#change crime_df zipcode column to match housing_df
crime_df.rename(columns = {'Zip_Code':'zipcode'}, inplace = True)

In [7]:
#change zipcode in crime_df from float to int 
crime_df['zipcode'] = crime_df['zipcode'].astype(int)

In [8]:
#group crime_df on zipcodes
group_df = crime_df.groupby(by = crime_df['zipcode'])

In [9]:
#get the count of occurances by zipcode
group_df2 = group_df.count()

In [10]:
#preview grouped df 
group_df2.head()

Unnamed: 0_level_0,Incident_Number,Highest_Offense_Description,Highest_Offense_Code,Family_Violence,Occurred_Date_Time,Occurred_Date,Occurred_Time,Report_Date_Time,Report_Date,Report_Time,...,Clearance_Date,UCR_Category,Category_Description,X-coordinate,Y-coordinate,Latitude,Longitude,Location,year,hour
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
78610,24,24,24,24,24,24,24,24,24,24,...,24,24,24,24,24,24,24,24,24,24
78612,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
78613,1963,1963,1963,1963,1963,1963,1963,1963,1963,1963,...,1963,1963,1963,1963,1963,1940,1940,1940,1963,1963
78616,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
78617,3497,3497,3497,3497,3497,3497,3497,3497,3497,3497,...,3497,3497,3497,3497,3497,3417,3417,3417,3497,3497


# Practice merging on pandas -- final merge will be on SQL

In [11]:
#merge the dataframes on grouping 
new_merge = pd.merge(housing_df[['latestPrice','propertyTaxRate','zipcode']], group_df2[['Incident_Number','hour']], on ='zipcode',how = 'left')


In [12]:
#change name of Incident_Number column to Count 
new_merge = new_merge.rename(columns={"Incident_Number": "Count_Crimes"})

In [13]:
#preview merge 
new_merge.sample(5)

Unnamed: 0,latestPrice,propertyTaxRate,zipcode,Count_Crimes,hour
8989,275000.0,1.98,78728,261.0,261.0
789,850000.0,1.98,78732,71.0,71.0
13186,309900.0,1.98,78749,7040.0,7040.0
1263,199000.0,1.98,78617,3497.0,3497.0
12515,339000.0,1.98,78736,1055.0,1055.0


In [14]:
#check datatypes of merged df 
new_merge.dtypes

latestPrice        float64
propertyTaxRate    float64
zipcode              int64
Count_Crimes       float64
hour               float64
dtype: object

In [15]:
#check if theres null values from merge-- nulls will be 0 crimes
new_merge.isnull().sum(axis = 0)

latestPrice         0
propertyTaxRate     0
zipcode             0
Count_Crimes       12
hour               12
dtype: int64

In [16]:
#fill null values with zero 
new_merge[["Count_Crimes","hour"]] = new_merge[["Count_Crimes","hour"]].fillna(0)

In [17]:
#check if theres null values from merge-- nulls will be 0 crimes
new_merge.isnull().sum(axis = 0)

latestPrice        0
propertyTaxRate    0
zipcode            0
Count_Crimes       0
hour               0
dtype: int64

# Machine Learning Model with merged data

In [18]:
new_merge.corr()

Unnamed: 0,latestPrice,propertyTaxRate,zipcode,Count_Crimes,hour
latestPrice,1.0,-0.062179,-0.145946,-0.14445,-0.14445
propertyTaxRate,-0.062179,1.0,-0.172846,-0.185072,-0.185072
zipcode,-0.145946,-0.172846,1.0,0.110361,0.110361
Count_Crimes,-0.14445,-0.185072,0.110361,1.0,1.0
hour,-0.14445,-0.185072,0.110361,1.0,1.0


In [19]:
X = new_merge[["Count_Crimes","hour","propertyTaxRate"]]
y = new_merge[["latestPrice"]]

In [20]:
model = linear_model.LinearRegression()

In [21]:
model.fit(X,y)

LinearRegression()

In [22]:
y_pred = model.predict(X)
y_pred.shape

(15171, 1)

In [23]:
print(model.coef_)
print(model.intercept_)

[[-3.77298222e+00 -3.77298302e+00 -7.85716829e+05]]
[2163779.95823627]


In [24]:
r2_score(y, y_pred)

0.02905161838810788

# Attempting SVM 

In [25]:
#encoding home association to numbers 1= True, 0 = False
to_num = {True:1,False:0}
housing_df["Association"] = housing_df["hasAssociation"].apply(lambda x: to_num[x])
housing_df.sample(5)

Unnamed: 0,city,streetAddress,zipcode,description,latitude,longitude,propertyTaxRate,hasAssociation,homeType,yearBuilt,latestPrice,numPriceChanges,latest_saledate,latest_salemonth,latest_saleyear,latestPriceSource,lotSizeSqFt,livingAreaSqFt,Association
3227,austin,9821 Childress Dr,78753,"9821 Childress Dr, Austin, TX 78753 is a singl...",30.357437,-97.672852,1.98,False,Single Family,1972,287500.0,2,2019-04-30,4,2019,Broker Provided,9147.0,1402.0,0
9093,austin,1504 Tapadero Ct,78727,Stunning 4 bed 2.5 bath home w/ a pool that si...,30.417452,-97.683182,1.98,True,Single Family,1996,469000.0,1,2019-05-31,5,2019,Agent Provided,11761.2,3042.0,1
10912,austin,11512 Church Canyon Dr,78754,5 spacious bedrooms with 1 downstairs and 4 up...,30.362345,-97.637962,1.98,True,Single Family,2007,265000.0,3,2018-05-18,5,2018,Agent Provided,5662.0,2569.0,1
755,austin,7806 Ryans Way,78726,"Custom-built home by Sendero Homes in gated, 2...",30.4095,-97.843552,1.98,True,Single Family,2019,562985.0,3,2020-02-13,2,2020,Agent Provided,5662.0,2799.0,1
4779,austin,8325 La Plata Loop,78737,"8325 La Plata Loop, Austin, TX 78737 is a sing...",30.204016,-97.918175,1.98,True,Single Family,1994,599500.0,1,2019-02-15,2,2019,Agent Provided,64033.2,2841.0,1


In [26]:
housing_df["Association"].value_counts()

1    8007
0    7164
Name: Association, dtype: int64

In [27]:
svm_merge = pd.merge(housing_df[['latestPrice','propertyTaxRate','zipcode'
                                 ,'Association']], group_df2[[
    'Incident_Number','hour']], on ='zipcode',how = 'left')

In [28]:
svm_merge = svm_merge.rename(columns={"Incident_Number": "Count_Crimes"})

In [29]:
svm_merge.isnull().sum(axis = 0)

latestPrice         0
propertyTaxRate     0
zipcode             0
Association         0
Count_Crimes       12
hour               12
dtype: int64

In [30]:
svm_merge[["Count_Crimes","hour"]] = new_merge[["Count_Crimes","hour"]].fillna(0)

In [31]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [32]:
X = svm_merge[["Count_Crimes"]]
y = svm_merge[["Association"]]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, random_state=42)

In [33]:
model = SVC(kernel = 'linear')
model.fit(X_train, y_train)

  return f(*args, **kwargs)


SVC(kernel='linear')

In [34]:
y_pred = model.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

0.5264961771684682

In [36]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test,y_pred)

array([[   0, 1791],
       [   5, 1997]])

In [37]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1791
           1       0.53      1.00      0.69      2002

    accuracy                           0.53      3793
   macro avg       0.26      0.50      0.34      3793
weighted avg       0.28      0.53      0.36      3793



# School Data MLM attempt 

In [38]:
school_df = pd.read_csv('Resources/SchoolData_clean.csv', index_col = [0])

In [40]:
school_df.dtypes

city                         object
streetAddress                object
zipcode                       int64
description                  object
latitude                    float64
longitude                   float64
homeType                     object
yearBuilt                     int64
latestPrice                 float64
numOfPrimarySchools           int64
numOfElementarySchools        int64
numOfMiddleSchools            int64
numOfHighSchools              int64
avgSchoolDistance           float64
avgSchoolRating             float64
avgSchoolSize                 int64
MedianStudentsPerTeacher      int64
dtype: object

In [42]:
school_merge = pd.merge(school_df[['latestPrice','zipcode'
                                 ,'avgSchoolDistance']], group_df2[[
    'Incident_Number','hour']], on ='zipcode',how = 'left')

# Percent Increase in Property Value per year 

In [73]:
price = housing_df.groupby(['zipcode','latest_saleyear']).agg("mean")
price = price['latestPrice']
price

zipcode  latest_saleyear
78617    2018               193990.482759
         2019               200321.181818
         2020               198750.391304
         2021               137500.000000
78619    2018               650000.000000
                                ...      
78758    2021               357998.333333
78759    2018               495563.892655
         2019               512099.535000
         2020               544940.038462
         2021               597000.000000
Name: latestPrice, Length: 171, dtype: float64

In [129]:
price_avg = housing_df.pivot_table(index='zipcode', columns='latest_saleyear', values=['latestPrice'], aggfunc='mean')
#price_avg.loc[:,"2018"]

price_avg.columns = price_avg.columns.droplevel(0) #remove amount
price_avg.columns.name = None               #remove categories
price_avg = price_avg.reset_index()                #index to columns
#price_avg[[2018]]
price_avg[[2018]] = price_avg[[2018]].fillna('0')
price_avg[[2019]] = price_avg[[2019]].fillna('0')
price_avg[[2020]] = price_avg[[2020]].fillna('0')
price_avg[[2021]] = price_avg[[2021]].fillna('0')
price_avg = price_avg.astype(int)
price_avg["price_dif_18_19"] = ((price_avg[2019] - price_avg[2018])/ price_avg[2018])*100
price_avg["price_dif_19_20"] = ((price_avg[2020] - price_avg[2019])/ price_avg[2019])*100
price_avg["price_dif_20_21"] = ((price_avg[2021] - price_avg[2020])/ price_avg[2020])*100


price_avg

Unnamed: 0,zipcode,2018,2019,2020,2021,price_dif_18_19,price_dif_19_20,price_dif_20_21
0,78617,193990,200321,198750,137500,3.26357,-0.784241,-30.81761
1,78619,650000,743965,784800,0,14.456154,5.488833,-100.0
2,78652,0,0,375339,0,,inf,-100.0
3,78653,0,369900,297490,0,inf,-19.575561,-100.0
4,78660,250109,285695,303125,0,14.228197,6.100912,-100.0
5,78701,571300,3495000,0,1049000,511.762647,-100.0,inf
6,78702,448604,498686,573461,468150,11.163966,14.994405,-18.364108
7,78703,1122031,1181184,1258904,1829333,5.271958,6.579839,45.311557
8,78704,744140,801809,924361,1021842,7.749751,15.284438,10.545772
9,78705,506881,642471,893618,0,26.749868,39.090792,-100.0


# Percent Difference in Crime 

In [None]:
#group by zip code and year; count and unstack
#looks like your already group by above - maybe use that insead?
crime = clean_df.groupby(['Zip_Code','year']).count().unstack()
crime = crime[['Incident_Number']].rename(columns={'Incident_Number': 'Crime_Count'})
crime.head()

In [None]:
#fill NaN with 0
crime = crime.fillna(0)
crime.head()

In [None]:
crime['crime_diff_18_19'] = round(((crime['Crime_Count'][2019] - crime['Crime_Count'][2018])/crime['Crime_Count'][2018])*100,0)
crime['crime_diff_19_20'] = round(((crime['Crime_Count'][2020] - crime['Crime_Count'][2019])/crime['Crime_Count'][2019])*100,0)
crime['crime_diff_20_21'] = round(((crime['Crime_Count'][2021] - crime['Crime_Count'][2020])/crime['Crime_Count'][2020])*100,0)
crime

# One sample t-test 