<a href="https://colab.research.google.com/github/jygre51/AI_Two_Datasets_Assignment/blob/main/Assignment_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A neural network that uses measures of economic freedom as predictors of financially motivated crimes (burglary, theft and fraud).

Economic freedom data source: https://www.heritage.org/index/explore

Crime data source: https://dataunodc.un.org/

Use the datasets from `documents/university/year_3/mike/assignment_datasets`.

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense

# Import separate data files
freedom_data = pd.read_csv('freedom_scores_2019.csv')
crime_data = pd.read_csv('UN_ODC_Crime_Data.csv')

# Merge by country; the inner join keeps only countries present in both files
merged_data = pd.merge(freedom_data, crime_data, on='Country', how='inner')

# Mark zeros as missing so the median imputer below can fill them in.
# Calling replace(0) without a replacement value falls back on pandas'
# deprecated implicit forward-fill rather than producing NaN.
# NOTE(review): assumes 0 encodes "no data" in the source CSVs - confirm.
crime_and_freedom_nums = merged_data.drop("Country", axis=1).replace(0, np.nan)

# Replace missing values with the median of each feature
imputer = SimpleImputer(strategy="median")
imputer.fit(crime_and_freedom_nums)
X = imputer.transform(crime_and_freedom_nums)
clean_data = pd.DataFrame(X, columns=crime_and_freedom_nums.columns)

# Separate indicator data and target data
freedom_measures = ['Overall Score', 'Property Rights', 'Judicial Effectiveness', 'Government Integrity', 'Tax Burden', 'Government Spending', 'Fiscal Health', 'Business Freedom', 'Labor Freedom', 'Monetary Freedom', 'Trade Freedom', 'Investment Freedom', 'Financial Freedom']
fmc_measures = ['Burglary', 'Theft', 'Fraud']

X = clean_data[freedom_measures].values
Y = clean_data[fmc_measures].values

# Standardization of data
predictor_scaler = StandardScaler()
target_var_scaler = StandardScaler()

# Storing the fit object for later reference
predictor_scaler_fit = predictor_scaler.fit(X)
target_var_scaler_fit = target_var_scaler.fit(Y)

# Generating the standardized values of X and Y.
# transform() returns a new array and does not modify its input, so the
# result has to be assigned back for the scaling to take effect.
X = predictor_scaler_fit.transform(X)
Y = target_var_scaler_fit.transform(Y)

# Split the data into training and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=47)

# Quick sanity check with the shapes of Training and testing datasets
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

# create ANN model
model = Sequential()

# Defining the Input layer and FIRST hidden layer, both are same!
# The input width is taken from the data so it tracks freedom_measures
model.add(Dense(units=10, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

# Defining the Second layer of the model
# after the first layer we don't have to specify input_dim as keras configure it automatically
model.add(Dense(units=10, kernel_initializer='normal', activation='tanh'))

# Output will be 3 fully connected nodes (one per entry in fmc_measures)
model.add(Dense(3, kernel_initializer='normal'))

# Compiling the model
model.compile(loss='mean_squared_error', optimizer='adam')

# Fitting the ANN to the Training set (batch size is 3 to adjust for small dataset)
model.fit(X_train, Y_train, batch_size=3, epochs=100, verbose=1)


Find best hyperparameters:

In [None]:
def find_best_params(X_train, Y_train, X_test, Y_test):
    """Grid-search batch size and epoch count for the ANN.

    For every combination of the batch sizes and epoch counts listed
    below, a fresh two-hidden-layer network is trained on the training
    set and scored on the test set as ``100 - MAPE``.

    Args:
        X_train: Training predictors; the number of columns sets the
            network's input width.
        Y_train: Training targets (three columns, matching the
            three-unit output layer).
        X_test: Held-out predictors used for scoring.
        Y_test: Held-out targets used for scoring.

    Returns:
        A pandas DataFrame with one row per trial and the columns
        ``trial_number``, ``parameters`` (formatted as
        ``"<batch_size>-<epochs>"``) and ``accuracy``.
    """
    # Defining the list of hyper parameters to try
    batch_size_list = [2, 4, 6, 8, 10]
    epoch_list = [5, 10, 50, 100]

    result_columns = ['trial_number', 'parameters', 'accuracy']
    # Rows are collected in a list and turned into a DataFrame once at
    # the end; DataFrame.append was removed in pandas 2.0.
    trial_rows = []

    # initializing the trials
    trial_number = 0
    for batch_size_trial in batch_size_list:
        for epochs_trial in epoch_list:
            trial_number += 1
            # create ANN model
            model = Sequential()
            # Defining the first layer of the model
            model.add(Dense(units=10, input_dim=X_train.shape[1], kernel_initializer='normal', activation='relu'))

            # Defining the Second layer of the model
            model.add(Dense(units=10, kernel_initializer='normal', activation='relu'))

            # Output will be 3 fully connected nodes
            model.add(Dense(3, kernel_initializer='normal'))

            # Compiling the model
            model.compile(loss='mean_squared_error', optimizer='adam')

            # Fitting the ANN to the Training set
            model.fit(X_train, Y_train, batch_size=batch_size_trial, epochs=epochs_trial, verbose=0)

            # Mean absolute percentage error. The denominator uses the
            # absolute target so negative targets cannot cancel positive
            # error terms. Targets at or near zero still inflate this score.
            MAPE = np.mean(100 * (np.abs(Y_test - model.predict(X_test)) / np.abs(Y_test)))

            # printing the results of the current iteration
            print(trial_number, 'Parameters:','batch_size:', batch_size_trial,'-', 'epochs:',epochs_trial, 'Accuracy:', 100-MAPE)

            trial_rows.append([trial_number, str(batch_size_trial)+'-'+str(epochs_trial), 100-MAPE])

    return pd.DataFrame(data=trial_rows, columns=result_columns)

# Run the hyperparameter search on the existing train/test split
results_data = find_best_params(
    X_train=X_train,
    Y_train=Y_train,
    X_test=X_test,
    Y_test=Y_test,
)