In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../community_policing_data.csv")

  df = pd.read_csv("../community_policing_data.csv")


In [3]:
df.head()

Unnamed: 0,STOP_DATE,AGENCY NAME,LOCATION,JURISDICTION,REASON FOR STOP,PERSON TYPE,RACE,ETHNICITY,AGE,GENDER,...,ACTION TAKEN,SPECIFIC VIOLATION,VIRGINIA CRIME CODE,PERSON SEARCHED,VEHICLE SEARCHED,ADDITIONAL ARREST,FORCE USED BY OFFICER,FORCE USED BY SUBJECT,RESIDENCY,VIOLATION TYPE
0,7/1/2020,Broadway Police Department,S MAIN/FORTH ST,ROCKINGHAM CO,TRAFFIC VIOLATION,,WHITE,NOT HISPANIC OR LATINO,18.0,MALE,...,WARNING ISSUED,46.2-833,,NO,NO,NO,,,,COMMONWEALTH
1,7/1/2020,Isle Of Wight County Sheriff's Office,36.967676 -76.511763,ISLE OF WIGHT CO,TRAFFIC VIOLATION,,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,26.0,MALE,...,WARNING ISSUED,46.2-804,,NO,NO,NO,,,,COMMONWEALTH
2,7/1/2020,Isle Of Wight County Sheriff's Office,36.95375 -76.540314,ISLE OF WIGHT CO,EQUIPMENT VIOLATION,,UNKNOWN,UNKNOWN,82.0,,...,WARNING ISSUED,46.2-1157,,NO,NO,NO,,,,COMMONWEALTH
3,7/1/2020,Isle Of Wight County Sheriff's Office,36.969732 -76.569873,ISLE OF WIGHT CO,EQUIPMENT VIOLATION,,WHITE,UNKNOWN,61.0,MALE,...,WARNING ISSUED,46.2-1157,,NO,NO,NO,,,,COMMONWEALTH
4,7/1/2020,Isle Of Wight County Sheriff's Office,36.949132 -76.596691,ISLE OF WIGHT CO,TRAFFIC VIOLATION,,WHITE,NOT HISPANIC OR LATINO,37.0,MALE,...,CITATION/SUMMONS,46.2-1157,,NO,NO,NO,,,,COMMONWEALTH


In [4]:
# Size information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3371015 entries, 0 to 3371014
Data columns (total 21 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   STOP_DATE              object 
 1   AGENCY NAME            object 
 2   LOCATION               object 
 3   JURISDICTION           object 
 4   REASON FOR STOP        object 
 5   PERSON TYPE            object 
 6   RACE                   object 
 7   ETHNICITY              object 
 8   AGE                    float64
 9   GENDER                 object 
 10  ENGLISH SPEAKING       object 
 11  ACTION TAKEN           object 
 12  SPECIFIC VIOLATION     object 
 13  VIRGINIA CRIME CODE    object 
 14  PERSON SEARCHED        object 
 15  VEHICLE SEARCHED       object 
 16  ADDITIONAL ARREST      object 
 17  FORCE USED BY OFFICER  object 
 18  FORCE USED BY SUBJECT  object 
 19  RESIDENCY              object 
 20  VIOLATION TYPE         object 
dtypes: float64(1), object(20)
memory usage: 540.1+ MB


In [5]:
df.shape

(3371015, 21)

In [6]:
# Listing all categorical columns by filtering columns of type object
categorical_columns = df.select_dtypes(include=['object']).columns
categorical_columns

Index(['STOP_DATE', 'AGENCY NAME', 'LOCATION', 'JURISDICTION',
       'REASON FOR STOP', 'PERSON TYPE', 'RACE', 'ETHNICITY', 'GENDER',
       'ENGLISH SPEAKING', 'ACTION TAKEN', 'SPECIFIC VIOLATION',
       'VIRGINIA CRIME CODE', 'PERSON SEARCHED', 'VEHICLE SEARCHED',
       'ADDITIONAL ARREST', 'FORCE USED BY OFFICER', 'FORCE USED BY SUBJECT',
       'RESIDENCY', 'VIOLATION TYPE'],
      dtype='object')

In [7]:
# Listing all columns with numerical values by filtering columns of type int64 and float64
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
numerical_columns

Index(['AGE'], dtype='object')

In [8]:
# Create df that contains columnds with NaN values
missing_values = df.isnull().sum()
columns_with_missing_values = missing_values[missing_values > 0]
columns_with_missing_values

LOCATION                    4963
JURISDICTION                   8
REASON FOR STOP            22366
PERSON TYPE               958398
RACE                          15
ETHNICITY                     22
AGE                           56
GENDER                      2906
ENGLISH SPEAKING          966913
ACTION TAKEN                4214
SPECIFIC VIOLATION         73011
VIRGINIA CRIME CODE      2052157
PERSON SEARCHED            24128
VEHICLE SEARCHED           28504
ADDITIONAL ARREST        2442425
FORCE USED BY OFFICER     966413
FORCE USED BY SUBJECT     966431
RESIDENCY                1474357
VIOLATION TYPE           2427194
dtype: int64

In [9]:
df.describe()

Unnamed: 0,AGE
count,3370959.0
mean,36.47217
std,15.75296
min,0.0
25%,24.0
50%,34.0
75%,47.0
max,99.0


In [10]:
# Drop rows where the target column 'ACTION TAKEN' is NaN
df = df.dropna(subset=['ACTION TAKEN'])

df[categorical_columns] = df[categorical_columns].astype(str)

# Convert the target column to numeric labels
label_encoder = LabelEncoder()
df['ACTION TAKEN'] = label_encoder.fit_transform(df['ACTION TAKEN'])

# Separate features and target
X = df.drop(columns=['ACTION TAKEN'])  # Replace 'ACTION TAKEN' with your actual target column name
y = df['ACTION TAKEN']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=27)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns
numerical_cols = X.select_dtypes(exclude=['object']).columns

# Define preprocessing for numeric and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Convert all categorical columns to string before processing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

X_train_prepared = preprocessor.fit_transform(X_train)
X_test_prepared = preprocessor.transform(X_test)

In [11]:
X_train_prepared.shape

(2693440, 771425)

In [12]:
X_test_prepared.shape

(673361, 771425)

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Define a parameter grid, using 'auto' and 'scale' options for gamma and different C values
param_grid = {
    'C': [1],  # Experiment with a range of C values
    'gamma': ['auto'],
}

# Create an SVM model
svm_clf = SVC(kernel='rbf', decision_function_shape='ovr', random_state=27)

# Use GridSearchCV with parallel processing (e.g., using all cores)
grid_search = GridSearchCV(svm_clf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_prepared, y_train)

# Get the best model from the grid search
best_svm_clf = grid_search.best_estimator_

# Predict on the test data
y_pred = best_svm_clf.predict(X_test_prepared)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Fitting 5 folds for each of 1 candidates, totalling 5 fits


KeyboardInterrupt: 

hi
