# Credit Risk Resampling Techniques

In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides potentially useful warnings raised by the libraries
# used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

# Read the CSV and Perform Basic Data Cleaning

In [3]:
# Load the fatal police shootings records (path is relative to the notebook's
# working directory) and preview the first five rows.
file_path_1 = Path('Data/fatal-police-shootings-data.csv')
df1 = pd.read_csv(file_path_1)
df1.head()

Unnamed: 0,age,gender,signs_of_mental_illness,threat_level,flee,body_camera
0,53.0,1.0,1,1,0.0,False
1,47.0,1.0,0,1,0.0,False
2,23.0,1.0,0,0,0.0,False
3,32.0,1.0,1,1,0.0,False
4,39.0,1.0,0,1,0.0,False


In [4]:
# Load the per-city body-camera policy table (path is relative to the notebook's
# working directory) and preview the first five rows.
file_path_2 = Path('Data/body-camera.csv')
df2 = pd.read_csv(file_path_2)
df2.head()

Unnamed: 0,City,Number of Sworn Officers,Number Wearing Body Cameras,Coverage,Fairness,Section of Provision,Policy Language/ Explanation,Transparency,Section of Provision(s),Policy Language/ Explanation.1,Privacy,Section of Provision.1,Policy Language/ Explanation.2,Accountability,Section of Provision.2,Policy Language/ Explanation.3
0,Austin,2300,20,1%,No,303.4.a.b,Body camera footage may be reviewed by an offi...,No,TX SB 158 Sec. 1701.660 and Sec. 1701.661.e,"Under TX law, police ""may"" (but are not requir...",Yes,303.2.4,All footage not depicting a criminal investiga...,No,Not specified in policy,No disciplinary guidelines or statement saying...
1,Baltimore,3080,155,5%,No,"Review of Recordings Section 2,3, and 4","Officers can view their own video footage ""to ...",Yes,Maryland Public Information Act § 10-618(f) ; ...,Requested footage should be made public once a...,Unclear,Not specified in policy,While footage is clearly being captured and st...,No,Not specified in policy,No statement that failure to adhere to the pol...
2,Charlotte,1849,1849,100%,No,3.c,"""Officers may review BWC video to aid in compl...",No,J.4.a and V.F,"As per CMPD policy, any footage showing a dead...",Yes,J.7,"All footage not depicting a misdemeanor, felon...",No,II.b,"Policy states that ""any violation of this dire..."
3,Chicago,11944,30,0.3%,No,IL Law Enforcement Officer-Worn Body Camera Ac...,"""The recording officer and his or her supervis...",Yes,IL Law Enforcement Officer-Worn Body Camera Ac...,Redacted footage will be released with permiss...,Yes,IL Law Enforcement Officer-Worn Body Camera Ac...,Footage deleted after 90 days unless it has ev...,No,VIII.A,"Policy states that ""unauthorized duplicating, ..."
4,Dallas,3474,77,2%,No,3XX.06F,"""During any administrative or criminal investi...",No,TX SB 158 Sec. 1701.660 and Sec. 1701.661.e,"Under TX law, police ""may"" (but are not requir...",No,3XX.06C,Footage deleted after 90 days if it is not cat...,No,3XX.05,While a number of activities are prohibited fr...


In [5]:
# Join each shooting record to the body-camera policy row of the city where it occurred.
# The shootings frame must carry a lowercase `city` column for this to work; the recorded
# run of this cell failed with KeyError: 'city', so check up front and fail with a message
# that says what is missing and what was actually loaded.
if "city" not in df1.columns:
    raise KeyError(
        "df1 has no 'city' column to merge on - load the shootings file that still "
        f"contains city names. Columns found: {list(df1.columns)}"
    )

# After an inner join on city == City the two key columns hold the same values,
# so the capitalised duplicate is dropped.
df = pd.merge(df1, df2, left_on="city", right_on="City").drop('City', axis=1)

# Preview a few rows rather than rendering the entire merged frame.
df.head()

KeyError: 'city'

In [None]:
# Persist the merged frame to disk; index=False keeps the row index out of the file.
df.to_csv('Data/combined_data.csv',index=False)


In [None]:
# Number of rows in the merged frame.
len(df)

In [None]:
# Columns expected in the merged data set.
# The policy columns follow the header of the body-camera CSV, where each explanation
# field and each rating field (Transparency, Privacy, Accountability) is a separate
# column; every name must therefore be its own list entry. A fused string such as
# 'Policy Language/ Explanation,Transparency' matches no column.
# NOTE(review): the shooting-record names (date ... is_geocoding_exact) are presumably
# taken from the raw shootings file — confirm against the file that is actually loaded.
columns = [
    'date',
    'manner_of_death',
    'armed',
    'age',
    'gender',
    'race',
    'city',
    'state',
    'signs_of_mental_illness',
    'threat_level',
    'flee',
    'body_camera',
    'is_geocoding_exact',
    'Number of Sworn Officers',
    'Number Wearing Body Cameras',
    'Coverage',
    'Fairness',
    'Section of Provision',
    'Policy Language/ Explanation',
    'Transparency',
    'Section of Provision(s)',
    'Policy Language/ Explanation.1',
    'Privacy',
    'Section of Provision.1',
    'Policy Language/ Explanation.2',
    'Accountability',
    'Section of Provision.2',
    'Policy Language/ Explanation.3'
]

# Name of the label column used as the prediction target.
target = ["body_camera"]

In [None]:

# Clean the merged frame and coerce text-typed columns to proper dtypes in one chain.
df = (
    df
    # Discard columns that contain no data at all ...
    .dropna(axis='columns', how='all')
    # ... then discard every row that still has a missing value.
    .dropna()
    .assign(**{
        # 'Coverage' is text such as "5%": strip the sign and scale to a fraction.
        'Coverage': lambda frame: frame['Coverage'].str.replace('%', '').astype('float') / 100,
        # Parse the date strings into datetimes.
        'date': lambda frame: pd.to_datetime(frame['date']),
        # Make the numeric columns uniformly float.
        'age': lambda frame: frame['age'].astype('float'),
        'Number of Sworn Officers': lambda frame: frame['Number of Sworn Officers'].astype('float'),
        'Number Wearing Body Cameras': lambda frame: frame['Number Wearing Body Cameras'].astype('float'),
    })
)

df.head()

# Split the Data into Training and Testing

In [None]:
# `Path` must come from the standard-library `pathlib` module, matching the import at
# the top of the notebook. The third-party `path` package is a different library and
# would shadow the `Path` already in use (or fail to import when it is not installed).
from pathlib import Path
import numpy as np
import pandas as pd

In [None]:
# Target: the body-camera flag of each record.
y = df["body_camera"]

# Features: every remaining column of the cleaned frame.
# NOTE(review): this keeps all non-target columns as they are; if any of them are
# text or datetime typed the estimators below will presumably reject them — confirm
# whether an encoding / column-selection step is needed before fitting.
X = df.drop(columns=["body_camera"])


In [None]:
# Summary statistics of the feature columns.
X.describe()

In [None]:
# Summary statistics of the target column.
y.describe()

In [None]:
# Hold out a test set for evaluating the models trained below.
from sklearn.model_selection import train_test_split

# Stratifying on the target keeps the class proportions the same in both splits;
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)
X_train.shape

In [None]:
# Create a support vector classifier with a linear kernel; it is fitted in the next cell.
from sklearn.svm import SVC
model = SVC(kernel='linear')

In [None]:
# Fit the classifier on the (not resampled) training split.
model.fit(X_train, y_train)

In [None]:
# Predict on the held-out test set and line the predictions up against the true
# labels for a side-by-side comparison. The fitted estimator is bound to `model`
# (the cells above never define a `classifier`).
predictions = model.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

In [None]:
# Summary statistics of the target column, shown again before resampling.
y.describe()

# Oversampling

In this section, you will compare two oversampling algorithms to determine which algorithm results in the best performance. You will oversample the data using the naive random oversampling algorithm and the SMOTE algorithm. For each algorithm, be sure to complete the following steps:

1. View the count of the target classes using `Counter` from the collections library.
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

### Naive Random Oversampling

In [None]:
# Resample the training data with the RandomOverSampler
from imblearn.over_sampling import RandomOverSampler

# Fixed seed so the resampled set is reproducible; only the training split is resampled.
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

# Count of each target class after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the resampled data

from sklearn.linear_model import LogisticRegression
# `model` is rebound here, replacing the estimator from the earlier cells.
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the untouched test split
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
# Predictions are recomputed here, so this cell does not rely on `y_pred` from the cell above.
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report for the test-set predictions
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

### SMOTE Oversampling

In [None]:
# Oversample the training split with SMOTE.
from imblearn.over_sampling import SMOTE

# Seeded for reproducibility; the sampling strategy is passed explicitly as 'auto'.
smote = SMOTE(random_state=1, sampling_strategy='auto')
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Count of each target class after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the SMOTE-resampled data
# (`LogisticRegression` is imported in the naive random oversampling section).
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)



In [None]:
# Calculate the balanced accuracy score on the untouched test split
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)


In [None]:
# Display the confusion matrix (uses `y_pred` from the previous cell)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report (uses `y_pred` from the balanced-accuracy cell)
print(classification_report_imbalanced(y_test, y_pred))

# Undersampling

In this section, you will test an undersampling algorithm to determine whether it results in better performance than the oversampling algorithms above. You will undersample the data using the Cluster Centroids algorithm and complete the following steps:

1. View the count of the target classes using `Counter` from the collections library.
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids

# Seeded for reproducibility; only the training split is resampled.
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
# Count of each target class after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the undersampled data
# (`LogisticRegression` is imported in the naive random oversampling section).
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the untouched test split
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix (uses `y_pred` from the previous cell)

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report (uses `y_pred` from the balanced-accuracy cell)
print(classification_report_imbalanced(y_test, y_pred))

# Combination (Over and Under) Sampling

In this section, you will test a combination over- and under-sampling algorithm to determine if the algorithm results in the best performance compared to the other sampling algorithms above. You will resample the data using the SMOTEENN algorithm and complete the following steps:

1. View the count of the target classes using `Counter` from the collections library.
2. Use the resampled data to train a logistic regression model.
3. Calculate the balanced accuracy score from sklearn.metrics.
4. Print the confusion matrix from sklearn.metrics.
5. Generate a classification report using `classification_report_imbalanced` from imbalanced-learn.

Note: Use a random state of 1 for each sampling algorithm to ensure consistency between tests

In [None]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=1)
# Fit on the training split only, like the other sampling sections. Resampling the
# full X / y would put test rows (and synthetic points derived from them) into the
# training data and inflate the scores computed on X_test below.
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Count of each target class after resampling.
Counter(y_resampled)

In [None]:
# Train the Logistic Regression model using the SMOTEENN-resampled data.
# random_state=1 matches the other sampling sections and the note above, so that
# every comparison uses an identically configured estimator.
# (`LogisticRegression` is imported in the naive random oversampling section.)
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [None]:
# Calculate the balanced accuracy score on the test split
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

In [None]:
# Display the confusion matrix (uses `y_pred` from the previous cell)

confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report (uses `y_pred` from the balanced-accuracy cell)
print(classification_report_imbalanced(y_test, y_pred))

In [None]:
# Re-inspect the first 18 rows of the body-camera policy table.
# NOTE(review): looks like a leftover exploration cell — confirm it is still needed.
df2.head(18)

In [None]:
# Re-inspect the first rows of the shootings table.
# NOTE(review): looks like a leftover exploration cell — confirm it is still needed.
df1.head()