In [1]:
from sklearn.ensemble import RandomForestClassifier
import psycopg2 as pg
import pandas.io.sql as psql
import pandas as pd

In [2]:
### Create database connection
import os
from sqlalchemy import create_engine

# The connection URL embeds a username and password. Read it from the
# DV_DATABASE_URL environment variable when that is set, so real credentials
# never need to be committed with the notebook; otherwise fall back to the
# local development database this notebook was originally written against.
DATABASE_URL = os.environ.get(
    "DV_DATABASE_URL",
    "postgresql://test:test@localhost/domestic_violence",
)
connection = create_engine(DATABASE_URL)


In [3]:
# Pull the whole dv_real_test table into a DataFrame through the engine
# created above, then preview the first rows.
dv_dataset = pd.read_sql(sql='SELECT * FROM dv_real_test', con=connection)

dv_dataset.head()

Unnamed: 0,id,premise,relationship,area,v_gender,v_age,day,time,alcoho,month,result,premise_id,v_gender_id,v_gender_age_id,relationship_id,day_id,time_id,alcohol_id,location_id
0,25092,Residential,Sibling,Kempsey,Female,40 - 49,Sunday,12am - < 6am,N,6,0,16,2,5,5,7,1,2,60
1,40200,Residential,Sibling,Sydney,Female,40 - 49,Sunday,6am - < 12pm,N,6,1,16,2,5,5,7,2,2,106
2,47656,Residential,Unknown/Not Stated,Tweed,Male,40 - 49,Sunday,6am - < 12pm,N,12,0,16,1,5,13,7,2,2,111
3,56306,Residential,Not Known To Victim,Lane Cove,Male,60+,Sunday,6am - < 12pm,N,1,0,16,1,7,12,7,2,2,66
4,79668,Residential,Person In Authority,Gwydir,Male,Under 18,Sunday,12pm - < 6pm,N,9,0,16,1,1,10,7,3,2,51


In [4]:
# Load libraries
from pandas.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [5]:
# Drop the row id and the human-readable categorical columns; the remaining
# columns are `month`, the `result` label and the numeric *_id encodings.
data = dv_dataset.drop(
    columns=["id", "premise", "relationship", "area", "v_gender", "v_age", "day", "time", "alcoho"]
)
# `result` is the prediction target, not a feature, so leave it out of the
# feature-name list; otherwise the names would not line up with the columns
# of the feature matrix built after `result` is dropped.
feature_names = data.columns.drop("result")
data.head()

Unnamed: 0,month,result,premise_id,v_gender_id,v_gender_age_id,relationship_id,day_id,time_id,alcohol_id,location_id
0,6,0,16,2,5,5,7,1,2,60
1,6,1,16,2,5,5,7,2,2,106
2,12,0,16,1,5,13,7,2,2,111
3,1,0,16,1,7,12,7,2,2,66
4,9,0,16,1,1,10,7,3,2,51


In [6]:
# Discard every row that contains at least one missing value (NaN / NaT).
data = data.dropna(axis="index", how="any")
# Show (rows, columns) of what is left; the recorded run had 76159 rows and 10 columns.
print(data.shape)

(76159, 10)


In [7]:
# Display names for the two classes (presumably 0 -> "negative", 1 -> "positive").
target_names = ["negative", "positive"]
# Pull the label column out of the frame to use as the prediction target.
target = data.loc[:, "result"]

In [8]:
# Remove the label column so that `data` holds only the feature columns.
data = data.drop(columns=["result"])

In [9]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test partitions (scikit-learn's
# default test size is used). A fixed random_state makes the split, and every
# metric computed from it further down, reproducible after a kernel restart.
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=1)

In [10]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training partition only, then map
# both partitions into the [-1, 1] range with that same fitted scaler.
scaling = MinMaxScaler(feature_range=(-1, 1))
scaling.fit(X_train)
X_train, X_test = scaling.transform(X_train), scaling.transform(X_test)

In [11]:
# Spot Check Algorithms
# Candidate classifiers, each paired with the short label used in the report.
# - LogisticRegression: the deprecated `multi_class='ovr'` argument is omitted;
#   with the liblinear solver one-vs-rest is what scikit-learn uses anyway.
# - DecisionTreeClassifier: seeded so its cross-validation score is repeatable.
models = [
    ('LR', LogisticRegression(solver='liblinear')),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier(random_state=1)),
    ('NB', GaussianNB()),
    ('SVM', SVC(kernel='linear')),
]

# One stratified, shuffled 10-fold splitter shared by every model: with a fixed
# integer seed it yields the same folds on each use, so all candidates are
# scored on identical splits without rebuilding the splitter per iteration.
kfold = StratifiedKFold(n_splits=10, random_state=1, shuffle=True)

# evaluate each model in turn
results = []  # per-model arrays of fold accuracies
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='accuracy')
    results.append(cv_results)
    names.append(name)
    # Report mean accuracy across folds, with the standard deviation in parentheses.
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))

LR: 0.867260 (0.004259)
LDA: 0.842959 (0.004490)
KNN: 0.910940 (0.003045)
CART: 0.882158 (0.003709)
NB: 0.862007 (0.004152)
SVM: 0.916087 (0.001635)


In [12]:
# The linear SVM scored best in the spot check, so train it on the full
# training partition and predict the held-out test partition.
model = SVC(kernel='linear').fit(X_train, y_train)
predictions = model.predict(X_test)

# Evaluate predictions: overall accuracy, then the confusion matrix, then the
# per-class precision / recall / f1 report.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, predictions))

0.9184873949579831
[[11012   955]
 [  597  6476]]
              precision    recall  f1-score   support

           0       0.95      0.92      0.93     11967
           1       0.87      0.92      0.89      7073

    accuracy                           0.92     19040
   macro avg       0.91      0.92      0.91     19040
weighted avg       0.92      0.92      0.92     19040



In [13]:
# Persist the trained classifier and the fitted scaler so that predictions can
# be made later without retraining.
import joblib

model_file = "domestic_violence_model_trained.pkl"
scaled_file = "domestic_violence_model_scaled.pkl"

joblib.dump(model, model_file)
# Last expression of the cell: its return value is what the cell displays.
joblib.dump(scaling, scaled_file)


['domestic_violence_model_scaled.pkl']

In [14]:
# Reload the persisted classifier and scaler from disk.
storedModel = joblib.load("domestic_violence_model_trained.pkl")
storedScaler = joblib.load("domestic_violence_model_scaled.pkl")

# Each sample below is a single row of encoded features, in the same order as
# the training columns: month, premise, gender, victim age, relationship,
# day, time, alcohol, location.

# Prediction 1 profile
#   month        4   -> "April"
#   premise      6   -> "Carpark"
#   gender       2   -> "Female"
#   victim age   5   -> "40 - 49"
#   relationship 6   -> "Member Of Family - Other"
#   day          7   -> "Sunday"
#   time         1   -> "12am - < 6am"
#   alcohol      2   -> "N"
#   location     127 -> "Wollongong"
my_prediction1 = [[4, 6, 2, 5, 6, 7, 1, 2, 127]]

# Prediction 2 profile
#   month        3   -> "March"
#   premise      16  -> "Residential"
#   gender       1   -> "Male"
#   victim age   4   -> "30 - 39"
#   relationship 12  -> "Not Known To Victim"
#   day          7   -> "Sunday"
#   time         4   -> presumably "6pm - < 12am" (TODO confirm against the time_id lookup)
#   alcohol      2   -> "N"
#   location     60  -> "Kempsey"
my_prediction2 = [[3, 16, 1, 4, 12, 7, 4, 2, 60]]

# Apply the stored scaler to both samples, then classify the scaled rows.
my_scaled_prediction1, my_scaled_prediction2 = map(
    storedScaler.transform, (my_prediction1, my_prediction2)
)
p1, p2 = map(storedModel.predict, (my_scaled_prediction1, my_scaled_prediction2))

def result(prediction):
    """Return the string 'True' if the first predicted label equals 1, else 'False'.

    `prediction` is an indexable sequence of labels (for example the array
    returned by a classifier's `predict`); only element 0 is inspected.
    """
    return 'True' if prediction[0] == 1 else 'False'

# Report the outcome for each sample profile in a readable form.
for ordinal, predicted in (("1st", p1), ("2nd", p2)):
    print(f"{ordinal} prediction is: {result(predicted)}")



1st prediction is: True
2nd prediction is: False
