<a href="https://colab.research.google.com/github/jygre51/AI_Two_Datasets_Assignment/blob/main/Assignment_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Neural net using measures of economic freedom as a predictor for financially motivated crimes.

measures of economic freedom source: https://www.heritage.org/index/explore

crime data: https://dataunodc.un.org/

use datasets from documents/university/year_3/mike/assignment_datasets

Create template for model:

In [109]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Import separate data files
freedom_data = pd.read_csv('freedom_scores_2019.csv')
crime_data = pd.read_csv('UN_ODC_Crime_Data.csv')

# Merge by country and clean data (replace null values with median for feature)
merged_data = pd.merge(freedom_data, crime_data, on='Country', how='inner')
crime_and_freedom_nums = merged_data.drop("Country", axis=1).replace(0)

imputer = SimpleImputer(strategy="median")
imputer.fit(crime_and_freedom_nums)
X = imputer.transform(crime_and_freedom_nums)
clean_data = pd.DataFrame(X, columns=crime_and_freedom_nums.columns)

# Separate indicator data and target data
freedom_measures = ['Overall Score', 'Property Rights', 'Judicial Effectiveness', 'Government Integrity', 'Tax Burden', 'Government Spending', 'Fiscal Health', 'Business Freedom', 'Labor Freedom', 'Monetary Freedom', 'Trade Freedom', 'Investment Freedom', 'Financial Freedom']
fmc_measures = ['Burglary', 'Theft', 'Fraud']

X = clean_data[freedom_measures].values
Y = clean_data[fmc_measures].values

# Standardization of data
predictor_scaler = StandardScaler()
target_var_scaler = StandardScaler()

# Storing the fit object for later reference
predictor_scaler_fit = predictor_scaler.fit(X)
target_var_scaler_fit = target_var_scaler.fit(Y)

# Generating the standardized values of X and y
X = predictor_scaler_fit.transform(X)
Y = target_var_scaler_fit.transform(Y)

# Split the data into training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=47)


# Quick sanity check with the shapes of Training and testing datasets
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

# create ANN model
model = Sequential()

# Defining the Input layer and FIRST hidden layer, both are same!
model.add(Dense(units=20, input_dim=13, kernel_initializer='normal', activation='relu'))

# Defining the Second layer of the model
# after the first layer we don't have to specify input_dim as keras configure it automatically
model.add(Dense(units=20, kernel_initializer='normal', activation='relu'))

# Output will be 3 fully connected nodes
model.add(Dense(3, kernel_initializer='normal'))

# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Fitting the ANN to the Training set (batch size is 3 to adjust for small dataset)
# model.fit(X_train, Y_train ,batch_size = 3, epochs = 100, verbose=0)


(69, 13)
(69, 3)
(18, 13)
(18, 3)


Find best hyperparameters:

- manual grid search cross validation

In [None]:
np.set_printoptions(suppress=True)

def find_best_params(X_train, Y_train, X_test, Y_test):

    # Defining the list of hyper parameters to try
    batch_size_list = [4, 5, 6]
    epoch_list = [4, 5, 6, 7, 8]
    neuron_count_list = [5, 10, 15, 20]

    search_results_data = pd.DataFrame(columns=['trial_number', 'parameters', 'accuracy'])

    # initializing the trials
    trial_number=0
    for batch_size_trial in batch_size_list:
        for epochs_trial in epoch_list:
          for neuron_count in neuron_count_list:
              trial_number += 1
              # create ANN model
              model = Sequential()
              # Defining the first layer of the model
              model.add(Dense(units=neuron_count, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

              # Defining the Second layer of the model
              model.add(Dense(units=neuron_count, kernel_initializer='normal', activation='relu'))

              # Output will be 3 fully connected nodes
              model.add(Dense(3, kernel_initializer='normal'))

              # Compiling the model
              model.compile(loss='mean_squared_error', optimizer='adam')

              # Fitting the ANN to the Training set
              model.fit(X_train, Y_train ,batch_size = batch_size_trial, epochs = epochs_trial, verbose=0)

              MAPE = np.mean(100 * (np.abs(Y_test-model.predict(X_test) - Y_test)/Y_test))

              # print(Y_test-model.predict(X_test))

              # printing the results of the current iteration
              print(trial_number, 'Parameters:','batch_size:', batch_size_trial,'-', 'epochs:',epochs_trial, 'neurons:',neuron_count, 'Accuracy:', 100-MAPE)

              search_results_data=search_results_data.append(pd.DataFrame(data=[[trial_number, str(batch_size_trial)+'-'+str(epochs_trial)+'-'+str(neuron_count), 100-MAPE]],
                                                                      columns=['trial_number', 'parameters', 'accuracy'] ))
    return(search_results_data)

# Calling the function
hyperparameter_tuning_results = find_best_params(X_train, Y_train, X_test, Y_test)

In [None]:
print(hyperparameter_tuning_results)

this reveals that 4-5-20 are the optimal hyperparameters.

train model on optimal parameters:

In [None]:
# Fitting the ANN to the Training set
model.fit(X_train, Y_train ,batch_size = 4, epochs = 5, verbose=0)

# Generating predictions on testing data
predictions=model.predict(X_test)

# Scaling the predicted Price data back to original price scale
predictions=target_var_scaler_fit.inverse_transform(predictions)

# Scaling the y_test Price data back to original price scale
y_test_orig=target_var_scaler_fit.inverse_transform(Y_test)

# Scaling the test data back to original scale
Test_Data=predictor_scaler_fit.inverse_transform(X_test)
print(y_test_orig[:,0])
TestingData=pd.DataFrame(data=Test_Data, columns=freedom_measures)
TestingData['Burglary']=y_test_orig[:,0]
TestingData['Theft']=y_test_orig[:,1]
TestingData['Fraud']=y_test_orig[:,2]
TestingData['pred_Burglary']=predictions[:,0]
TestingData['pred_Theft']=predictions[:,1]
TestingData['pred_Fraud']=predictions[:,2]
TestingData.head()

In [116]:
# Computing the absolute percent error
APE=100*(abs(TestingData['Burglary']-TestingData['pred_Burglary'])/TestingData['Burglary'])
TestingData['Burglary_APE']=APE

APE=100*(abs(TestingData['Theft']-TestingData['pred_Theft'])/TestingData['Theft'])
TestingData['Theft_APE']=APE

APE=100*(abs(TestingData['Fraud']-TestingData['pred_Fraud'])/TestingData['Fraud'])
TestingData['Fraud_APE']=APE

print('The Accuracy of ANN model is:', 100-np.mean(APE))
TestingData.head()

The Accuracy of ANN model is: 30.69625364457937


Unnamed: 0,Overall Score,Property Rights,Judicial Effectiveness,Government Integrity,Tax Burden,Government Spending,Fiscal Health,Business Freedom,Labor Freedom,Monetary Freedom,...,Financial Freedom,Burglary,Theft,Fraud,pred_Burglary,pred_Theft,pred_Fraud,Burglary_APE,Theft_APE,Fraud_APE
0,68.6,66.7,51.9,39.8,89.7,69.0,89.3,63.1,64.5,82.7,...,50.0,149.271077,491.415505,58.086854,182.950134,329.109589,50.343346,22.562346,33.028245,13.330914
1,62.2,71.7,49.8,43.7,55.6,26.5,71.3,71.7,51.1,84.0,...,50.0,276.80349,1517.802425,406.878309,298.806305,852.3302,306.184509,7.948894,43.844457,24.747891
2,61.9,40.2,37.9,30.2,84.3,46.1,96.6,49.7,67.0,83.1,...,60.0,246.849322,529.84911,82.520112,170.852295,290.757782,43.792641,30.786808,45.124418,46.930949
3,51.1,39.5,26.6,18.2,91.8,75.6,96.9,47.9,46.5,78.1,...,50.0,246.849322,529.84911,82.520112,162.080612,262.569824,37.354729,34.340264,50.444415,54.732577
4,74.2,73.6,61.2,47.8,86.4,65.1,97.3,75.2,63.6,84.6,...,70.0,57.59748,382.649435,104.630156,194.330536,372.196747,69.585632,237.394166,2.731662,33.493713
