In [None]:
!pip install sklearn
!pip install numpy
!pip install pandas
!pip install tqdm

In [1]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

In [2]:
# Load the training data from train.csv in the current working directory.
df = pd.read_csv('./train.csv')

In [3]:
# Display the loaded DataFrame to inspect its columns and a few rows.
df

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541
...,...,...,...,...,...,...,...,...,...
878044,2003-01-06 00:15:00,ROBBERY,ROBBERY ON THE STREET WITH A GUN,Monday,TARAVAL,NONE,FARALLONES ST / CAPITOL AV,-122.459033,37.714056
878045,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,INGLESIDE,NONE,600 Block of EDNA ST,-122.447364,37.731948
878046,2003-01-06 00:01:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Monday,SOUTHERN,NONE,5TH ST / FOLSOM ST,-122.403390,37.780266
878047,2003-01-06 00:01:00,VANDALISM,"MALICIOUS MISCHIEF, VANDALISM OF VEHICLES",Monday,SOUTHERN,NONE,TOWNSEND ST / 2ND ST,-122.390531,37.780607


In [4]:
# Number of rows per crime category (shown most frequent first).
df['Category'].value_counts()

Category
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQ

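Top 10 categories of crime kept for modelling, by count, after skipping OTHER OFFENSES, NON-CRIMINAL, SUSPICIOUS OCC and MISSING PERSON (these rank higher than some entries below but are left out):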
1. Larceny/theft
2. Assault
3. Drug/narcotic
4. Vehicle Theft
5. Vandalism
6. Warrants
7. Burglary
8. Robbery
9. Fraud
10. Forgery/Counterfeiting

In [5]:
# Grab the first timestamp as a sample to work out how to slice it.
dates = df['Dates'][0]
dates

'2015-05-13 23:53:00'

In [6]:
# The first four characters of the timestamp string are the year.
int(dates[:4])

2015

In [7]:
# Characters 5-6 of the timestamp string (after the first '-') are the month.
int(dates[5:7])

5

In [8]:
def get_date(date_i):
    """Extract the year and month from a timestamp string.

    The string is sliced at fixed positions, so it must start with
    ``YYYY-MM`` (e.g. ``'2015-05-13 23:53:00'``).

    Returns
    -------
    tuple of (int, int)
        ``(year, month)``.
    """
    year_chars, month_chars = date_i[0:4], date_i[5:7]
    return int(year_chars), int(month_chars)

In [9]:
def get_pd_district_onehot(district_i, districts=None):
    """One-hot encode a police district.

    Parameters
    ----------
    district_i
        The district value to encode.
    districts : sequence, optional
        Ordered vocabulary of district values; position in this sequence is
        the position of the 1 in the result. When omitted, the vocabulary is
        taken from the global ``df['PdDistrict']`` via ``unique()``. That
        fallback rescans the whole column on every call, so callers encoding
        many rows should compute the vocabulary once and pass it in.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one slot per vocabulary entry, all zeros
        except a single 1 at the index of ``district_i``.

    Raises
    ------
    ValueError
        If ``district_i`` is not in the vocabulary.
    """
    if districts is None:
        list_pd_districts = df['PdDistrict'].unique().tolist()
    else:
        list_pd_districts = list(districts)
    one_hot = np.zeros(len(list_pd_districts), dtype=int)
    one_hot[list_pd_districts.index(district_i)] = 1
    return one_hot

In [10]:
def get_day_of_the_week_onehot(dotw_i, days=None):
    """One-hot encode a day-of-week value into a length-7 vector.

    Parameters
    ----------
    dotw_i
        The day-of-week value to encode.
    days : sequence, optional
        Ordered vocabulary of day values; position in this sequence is the
        position of the 1 in the result. When omitted, the vocabulary is
        taken from the global ``df['DayOfWeek']`` via ``unique()``. That
        fallback rescans the whole column on every call, so callers encoding
        many rows should compute the vocabulary once and pass it in.

    Returns
    -------
    numpy.ndarray
        1-D integer array of length 7 with a single 1.

    Raises
    ------
    ValueError
        If ``dotw_i`` is not in the vocabulary.
    IndexError
        If the vocabulary has more than 7 entries and ``dotw_i`` sits at
        position 7 or later.
    """
    if days is None:
        dotw_list = df['DayOfWeek'].unique().tolist()
    else:
        dotw_list = list(days)
    total_dotw = 7  # output width is fixed regardless of vocabulary size
    one_hot = np.zeros(total_dotw, dtype=int)
    one_hot[dotw_list.index(dotw_i)] = 1
    return one_hot

In [11]:
def get_crime_category(crime_i, categories=None):
    """Map a crime category value to its integer code.

    Parameters
    ----------
    crime_i
        The category value to encode.
    categories : sequence, optional
        Ordered vocabulary of category values; the code is the position of
        ``crime_i`` in this sequence. When omitted, the vocabulary is taken
        from the global ``df['Category']`` via ``unique()``. That fallback
        rescans the whole column on every call, so callers encoding many
        values should compute the vocabulary once and pass it in.

    Returns
    -------
    int
        Index of ``crime_i`` in the vocabulary.

    Raises
    ------
    ValueError
        If ``crime_i`` is not in the vocabulary.
    """
    if categories is None:
        crime_cats_list = df['Category'].unique().tolist()
    else:
        crime_cats_list = list(categories)
    return crime_cats_list.index(crime_i)

In [12]:
def get_features(X):
    """Turn raw rows into a numeric, unscaled feature matrix.

    Each row of ``X`` is read as: column 0 the date string, column 1 the day
    of the week, column 2 the police district, columns 3 and 4 the X / Y
    coordinates.

    Output columns, in order: year, month, a 7-wide one-hot for the day of
    the week, a one-hot for the police district (one slot per distinct value
    in the global ``df['PdDistrict']``), then the two coordinates. With 10
    districts this is the 1+1+7+10+1+1 = 21 column layout.

    One-hot positions follow the order returned by ``unique()`` on the global
    ``df`` columns, so the encoding is the same for every array passed in.

    Raises
    ------
    ValueError
        If a day or district value in ``X`` does not occur in ``df``.
    """
    # The vocabularies are the same for every row: build them once here
    # instead of rescanning the full DataFrame columns on each iteration.
    dotw_list = df['DayOfWeek'].unique().tolist()
    pd_list = df['PdDistrict'].unique().tolist()

    total_dotw = 7
    dotw_cols = slice(2, 2 + total_dotw)
    pd_cols = slice(dotw_cols.stop, dotw_cols.stop + len(pd_list))
    num_features = pd_cols.stop + 2  # two trailing coordinate columns

    num_rows = X.shape[0]
    X_unscaled = np.zeros((num_rows, num_features))
    for i in tqdm(range(num_rows)):
        row = X_unscaled[i]  # view into the output matrix; writes go through
        row[0], row[1] = get_date(X[i, 0])
        # Slicing first keeps each one-hot confined to its own column range:
        # an out-of-range position raises IndexError instead of spilling
        # into the neighbouring feature.
        row[dotw_cols][dotw_list.index(X[i, 1])] = 1
        row[pd_cols][pd_list.index(X[i, 2])] = 1
        row[-2] = X[i, 3]
        row[-1] = X[i, 4]
    return X_unscaled

In [13]:
def get_target(y):
    """Convert an array of crime category values to integer codes.

    The code of a category is its position in ``df['Category'].unique()``
    for the global ``df``. Because that vocabulary covers every category in
    ``df``, the codes produced for a filtered subset of categories are not
    necessarily contiguous.

    Returns
    -------
    numpy.ndarray
        One integer code per element of ``y``.

    Raises
    ------
    ValueError
        If a value in ``y`` does not occur in ``df['Category']``.
    """
    # Build the vocabulary once; it does not change between rows, and
    # recomputing it per element rescans the full DataFrame column each time.
    crime_cats_list = df['Category'].unique().tolist()
    num_rows = y.shape[0]
    return np.array([crime_cats_list.index(y[i]) for i in tqdm(range(num_rows))])

In [14]:
def get_top_10_cats_x_y(df):
    """Select the rows whose category is one of the ten retained crime types.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Dates', 'DayOfWeek', 'PdDistrict', 'X',
        'Y' and 'Category'.

    Returns
    -------
    tuple
        ``(X_filt, y_filt)``: a 2-D array with one row per kept record and
        the five raw feature columns in the order listed above, and the
        matching 1-D array of category values.
    """
    feature_columns = ['Dates', 'DayOfWeek', 'PdDistrict', 'X', 'Y']
    kept_categories = [
        'LARCENY/THEFT',
        'ASSAULT',
        'DRUG/NARCOTIC',
        'VEHICLE THEFT',
        'VANDALISM',
        'WARRANTS',
        'BURGLARY',
        'ROBBERY',
        'FRAUD',
        'FORGERY/COUNTERFEITING',
    ]
    # Stack the columns as rows, then transpose so each record is one row.
    features = np.vstack([df[name] for name in feature_columns]).T
    labels = np.array(df['Category'])
    keep = np.isin(labels, kept_categories)
    return features[keep], labels[keep]

In [15]:
# Raw feature rows and category labels, restricted to the ten kept categories.
X, y = get_top_10_cats_x_y(df)

In [16]:
#First draw a stratified 10,000-row subsample of the filtered data, then do a
#90/10 train/validate split of that subsample (also stratified by category).
X_sample, _, y_sample, _ = train_test_split(X, y, train_size = 10000, random_state=42, stratify = y)
X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(X_sample, y_sample, test_size = 0.1, random_state=42, stratify = y_sample)

In [17]:
# Encode the raw train/validation rows as numeric features and integer labels.
X_train = get_features(X_train_raw)
X_val = get_features(X_val_raw)
y_train = get_target(y_train_raw)
y_val = get_target(y_val_raw)

  0%|          | 0/9000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

  0%|          | 0/9000 [00:00<?, ?it/s]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [18]:
# Persist the encoded arrays to the working directory.
# NOTE: the validation split (X_val / y_val) is written under the
# X_test.npy / y_test.npy filenames.
np.save('X_train.npy', X_train)
np.save('X_test.npy', X_val)
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_val)