In [1]:
from sklearn import *
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelBinarizer
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
import datetime
import calendar
import gensim

In [2]:
data = pd.read_csv('train.csv')

In [3]:
data.head()

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
years = []
months = []
days = []
hours = []
minutes = []
beautiful_endings = []
is_weekends = []
nights = []
mornings = []
middle_days = []
afternoons = []
intersections = []
for i in range(data.shape[0]):
    s = data['Dates'][i]
    dates = s.split()
    year = int((dates[0].split('-'))[0])
    years.append(year)
    month = int((dates[0].split('-'))[1])
    months.append(month)
    day = int((dates[0].split('-'))[2])
    days.append(day)
    hour = int((dates[1].split(':'))[0])
    hours.append(hour)
    minute = int((dates[1].split(':'))[1])
    minutes.append(minute)
    if(minute == 30 or minute == 0):
        beautiful_endings.append(1)
    else:
        beautiful_endings.append(0)
    weekday = calendar.day_abbr[datetime.date(year, month, day).weekday()]
    if(weekday == 'Sun' or weekday == 'Sat'):
        is_weekends.append(1)
    else:
        is_weekends.append(0)
    if(hour > 22 or (hour < 6)):
        nights.append(1)
        mornings.append(0)
        middle_days.append(0)
        afternoons.append(0)
    elif(hour >=6 and hour <= 10):
        nights.append(0)
        mornings.append(1)
        middle_days.append(0)
        afternoons.append(0)
    elif(hour > 10 and hour <=17):
        nights.append(0)
        mornings.append(0)
        middle_days.append(1)
        afternoons.append(0)
    else:
        nights.append(0)
        mornings.append(0)
        middle_days.append(0)
        afternoons.append(1)
    if(data['Address'][i].find('/')>-1):
        intersections.append(1)
    else:
        intersections.append(0)

In [5]:
data['BeautifulEndings'] = beautiful_endings
data['IsWeekend'] = is_weekends
data['Nights'] = nights
data['Mornings'] = mornings
data['MiddleDays'] = middle_days
data['Afternoons'] = afternoons
data['Day'] = days
data['Months'] = months

In [6]:
data = pd.get_dummies(data = data, columns = ['Months'])

In [7]:
data['Hours'] = hours
#data['Minutes'] = minutes
#data['Intersections'] = intersections

In [8]:
data = pd.get_dummies(data = data, columns = ['DayOfWeek'])

In [9]:
data = pd.get_dummies(data = data, columns = ['PdDistrict'])
#data = data.drop('PdDistrict', 1)

In [10]:
data.columns

Index(['Dates', 'Category', 'Descript', 'Resolution', 'Address', 'X', 'Y',
       'BeautifulEndings', 'IsWeekend', 'Nights', 'Mornings', 'MiddleDays',
       'Afternoons', 'Day', 'Months_1', 'Months_2', 'Months_3', 'Months_4',
       'Months_5', 'Months_6', 'Months_7', 'Months_8', 'Months_9', 'Months_10',
       'Months_11', 'Months_12', 'Hours', 'DayOfWeek_Friday',
       'DayOfWeek_Monday', 'DayOfWeek_Saturday', 'DayOfWeek_Sunday',
       'DayOfWeek_Thursday', 'DayOfWeek_Tuesday', 'DayOfWeek_Wednesday',
       'PdDistrict_BAYVIEW', 'PdDistrict_CENTRAL', 'PdDistrict_INGLESIDE',
       'PdDistrict_MISSION', 'PdDistrict_NORTHERN', 'PdDistrict_PARK',
       'PdDistrict_RICHMOND', 'PdDistrict_SOUTHERN', 'PdDistrict_TARAVAL',
       'PdDistrict_TENDERLOIN'],
      dtype='object')

In [11]:
def make_zones(X, Y):
    maxX = max(X)
    maxY = max(Y)
    minX = min(X)
    minY = min(Y)
    x_spread = max(X) - min(X)
    y_spread = max(Y) - min(Y)
    x_step = x_spread/15
    y_step = y_spread/15
    zones = []
    for i in range(len(X)):
        x_zone = int((X[i] - minX)/x_step)
        y_zone = int((Y[i] - minY)/y_step)
        if(x_zone == 15):
            x_zone = 14
        if(y_zone == 15):
            y_zone = 14
        zones.append(15*x_zone+y_zone)
    return zones   

In [12]:
data = data[data['Y'] < 80] # удаление записей с выбросами

In [13]:
zones = make_zones(list(data['X']), list(data['Y']))

In [14]:
data['Zones'] = zones

In [15]:
data = pd.get_dummies(data, columns=['Zones'])

In [16]:
data = data.drop_duplicates(subset = ['X', 'Y', 'Dates', 'Address'])

In [17]:
data = data.drop(['Dates', 'Descript', 'Resolution', 'Address'], axis = 1)

In [18]:
data = data.reindex(range(data.shape[0]), method='ffill')

In [19]:
relevant_categories = ['LARCENY/THEFT', 'ROBBERY', 'STOLEN PROPERTY', 'EMBEZZLEMENT', 'OTHER OFFENSES', 
                      'NON-CRIMINAL', 'ASSAULT', 'SEX OFFENSES FORCIBLE', 'DRUG/NARCOTIC', 'VEHICLE THEFT', 
                      'VANDALISM', 'ARSON', 'DRUNKENNESS', 'BURGLARY', 'MISSING PERSON', 'KIDNAPPING', 
                      'FRAUD', 'WEAPON LAWS', 'PROSTITUTION', 'DISORDERLY CONDUCT','DRIVING UNDER THE INFLUENCE',
                      'SUICIDE', 'EXTORTION']
relevant_categories = np.array(relevant_categories)

In [20]:
index_rows_to_delete = []
answers = data['Category']
for i in range(len(list(answers))):
    if(answers[i] not in relevant_categories):
        index_rows_to_delete.append(i)

In [21]:
data = data.drop(index_rows_to_delete)

In [22]:
data = data.reindex(range(data.shape[0]), method='bfill')

In [23]:
data

Unnamed: 0,Category,X,Y,BeautifulEndings,IsWeekend,Nights,Mornings,MiddleDays,Afternoons,Day,...,Zones_196,Zones_197,Zones_198,Zones_199,Zones_200,Zones_201,Zones_212,Zones_213,Zones_223,Zones_224
0,OTHER OFFENSES,-122.424363,37.800414,0,0,1,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
1,OTHER OFFENSES,-122.424363,37.800414,0,0,1,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
2,OTHER OFFENSES,-122.424363,37.800414,0,0,1,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
3,LARCENY/THEFT,-122.426995,37.800873,1,0,1,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
4,LARCENY/THEFT,-122.438738,37.771541,1,0,1,0,0,0,13,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606100,VEHICLE THEFT,-122.419616,37.801994,1,0,0,0,1,0,6,...,0,0,0,0,0,0,0,0,0,0
606101,DRUG/NARCOTIC,-122.411071,37.781751,1,0,0,0,1,0,6,...,0,0,0,0,0,0,0,0,0,0
606102,MISSING PERSON,-122.452223,37.720915,1,0,0,0,1,0,6,...,0,0,0,0,0,0,0,0,0,0
606103,VANDALISM,-122.427506,37.786762,1,0,0,0,1,0,6,...,0,0,0,0,0,0,0,0,0,0


In [24]:
data['Category'].describe()

count            606105
unique               23
top       LARCENY/THEFT
freq             153124
Name: Category, dtype: object

In [25]:
d = {}
d['LARCENY/THEFT']=  ['LARCENY/THEFT', 'ROBBERY', 'STOLEN PROPERTY', 'EMBEZZLEMENT']
d['NON-CRIMINAL'] = ['OTHER OFFENSES', 'NON-CRIMINAL']
d['ASSAULT'] = ['ASSAULT', 'SEX OFFENSES FORCIBLE']
d['VANDALISM'] = ['VANDALISM', 'ARSON']
d['KIDNAPPING'] = ['MISSING PERSON', 'KIDNAPPING']

In [26]:
answers = data['Category']
data = data.drop('Category', axis = 1)
list_answers = list(answers)
for i in range(len(list_answers)):
    for key, value in d.items():
        if(list_answers[i] in value):
            list_answers[i] = key
            break

In [27]:
data['Category'] = list_answers

In [28]:
data['Category'].describe()

count            606105
unique               16
top       LARCENY/THEFT
freq             176967
Name: Category, dtype: object

In [29]:
answer = data['Category']
data = data.drop('Category', axis = 1)

In [30]:
X_train, X_test, y_train, y_test = train_test_split(data, answer, random_state=42)
clf = RandomForestClassifier(max_depth=45,n_estimators = 30, random_state=42, criterion = 'entropy')
clf.fit(X_train, y_train)
result = clf.predict(X_test)
accuracy_score(result, y_test)

0.6496267991843038

In [31]:
data['XY1'] = (data['X'] - data['X'].min())**2 + (data['Y'] - data['Y'].min())**2

In [32]:
data['XY2'] = (data['X'].max() - data['X'])**2 + (data['Y'] - data['Y'].min())**2

In [33]:
data['XY3'] = (data['X'] - data['X'].min())**2 + (data['Y'].max() - data['Y'])**2

In [34]:
data['XY4'] = (data['X'].max() - data['X'])**2 + (data['Y'].max() - data['Y'])**2

In [35]:
data["XY45_2"] = data["Y"] * np.cos(np.pi / 4) - data["X"] * np.sin(np.pi / 4)

In [36]:
data["XY30_1"] = data["X"] * np.cos(np.pi / 6) + data["Y"] * np.sin(np.pi / 6)

In [37]:
data["XY30_2"] = data["Y"] * np.cos(np.pi / 6) - data["X"] * np.sin(np.pi / 6)

In [38]:
data["XY60_1"] = data["X"] * np.cos(np.pi / 3) + data["Y"] * np.sin(np.pi / 3)
data["XY60_2"] = data["Y"] * np.cos(np.pi / 3) - data["X"] * np.sin(np.pi / 3)

In [39]:
X_median = data["X"].median()
Y_median = data["Y"].median()

data["XY5"] = (data["X"] - X_median) ** 2 + (data["Y"] - Y_median) ** 2

data["XY_rad"] = np.sqrt(np.power(data['Y'], 2) + np.power(data['X'], 2))


In [40]:
list(data.columns)

['X',
 'Y',
 'BeautifulEndings',
 'IsWeekend',
 'Nights',
 'Mornings',
 'MiddleDays',
 'Afternoons',
 'Day',
 'Months_1',
 'Months_2',
 'Months_3',
 'Months_4',
 'Months_5',
 'Months_6',
 'Months_7',
 'Months_8',
 'Months_9',
 'Months_10',
 'Months_11',
 'Months_12',
 'Hours',
 'DayOfWeek_Friday',
 'DayOfWeek_Monday',
 'DayOfWeek_Saturday',
 'DayOfWeek_Sunday',
 'DayOfWeek_Thursday',
 'DayOfWeek_Tuesday',
 'DayOfWeek_Wednesday',
 'PdDistrict_BAYVIEW',
 'PdDistrict_CENTRAL',
 'PdDistrict_INGLESIDE',
 'PdDistrict_MISSION',
 'PdDistrict_NORTHERN',
 'PdDistrict_PARK',
 'PdDistrict_RICHMOND',
 'PdDistrict_SOUTHERN',
 'PdDistrict_TARAVAL',
 'PdDistrict_TENDERLOIN',
 'Zones_2',
 'Zones_3',
 'Zones_4',
 'Zones_5',
 'Zones_6',
 'Zones_7',
 'Zones_8',
 'Zones_9',
 'Zones_15',
 'Zones_16',
 'Zones_17',
 'Zones_18',
 'Zones_19',
 'Zones_20',
 'Zones_21',
 'Zones_22',
 'Zones_23',
 'Zones_24',
 'Zones_25',
 'Zones_30',
 'Zones_31',
 'Zones_32',
 'Zones_33',
 'Zones_34',
 'Zones_35',
 'Zones_36',
 '

In [41]:
from sklearn.ensemble import GradientBoostingClassifier

In [42]:
X_train, X_test, y_train, y_test = train_test_split(data, answer, random_state=142)
clf = RandomForestClassifier(max_depth=45,n_estimators = 200, random_state=42,
                             max_features = 'auto')
clf.fit(X_train, y_train)
result = clf.predict(X_test)
accuracy_score(result, y_test)

0.6657559378856573

In [43]:
result_p = clf.predict_proba(X_test)

In [44]:
res = 0
s = list(y_test)
for i in range(len(y_test)):
    ind = np.argpartition(result_p[i], -3)[-3:]
    if clf.classes_[ind[0]] == s[i] or clf.classes_[ind[1]] ==s[i] or clf.classes_[ind[2]] ==s[i]:
        res += 1
res/len(y_test)

0.8609356748302283

In [45]:
list(X_train.columns)

['X',
 'Y',
 'BeautifulEndings',
 'IsWeekend',
 'Nights',
 'Mornings',
 'MiddleDays',
 'Afternoons',
 'Day',
 'Months_1',
 'Months_2',
 'Months_3',
 'Months_4',
 'Months_5',
 'Months_6',
 'Months_7',
 'Months_8',
 'Months_9',
 'Months_10',
 'Months_11',
 'Months_12',
 'Hours',
 'DayOfWeek_Friday',
 'DayOfWeek_Monday',
 'DayOfWeek_Saturday',
 'DayOfWeek_Sunday',
 'DayOfWeek_Thursday',
 'DayOfWeek_Tuesday',
 'DayOfWeek_Wednesday',
 'PdDistrict_BAYVIEW',
 'PdDistrict_CENTRAL',
 'PdDistrict_INGLESIDE',
 'PdDistrict_MISSION',
 'PdDistrict_NORTHERN',
 'PdDistrict_PARK',
 'PdDistrict_RICHMOND',
 'PdDistrict_SOUTHERN',
 'PdDistrict_TARAVAL',
 'PdDistrict_TENDERLOIN',
 'Zones_2',
 'Zones_3',
 'Zones_4',
 'Zones_5',
 'Zones_6',
 'Zones_7',
 'Zones_8',
 'Zones_9',
 'Zones_15',
 'Zones_16',
 'Zones_17',
 'Zones_18',
 'Zones_19',
 'Zones_20',
 'Zones_21',
 'Zones_22',
 'Zones_23',
 'Zones_24',
 'Zones_25',
 'Zones_30',
 'Zones_31',
 'Zones_32',
 'Zones_33',
 'Zones_34',
 'Zones_35',
 'Zones_36',
 '

In [46]:
X_train.head(1)

Unnamed: 0,X,Y,BeautifulEndings,IsWeekend,Nights,Mornings,MiddleDays,Afternoons,Day,Months_1,...,XY2,XY3,XY4,XY45_2,XY30_1,XY30_2,XY60_1,XY60_2,XY5,XY_rad
18568,-122.41873,37.785008,1,0,0,0,1,0,16,0,...,0.00881,0.010231,0.004084,113.281149,-87.125226,93.932141,-28.486588,124.910233,9.7e-05,128.117337


In [47]:
X_train[1:2]['DayOfWeek_Monday']


79149    1
Name: DayOfWeek_Monday, dtype: uint8

In [48]:
list(X_train.columns)

['X',
 'Y',
 'BeautifulEndings',
 'IsWeekend',
 'Nights',
 'Mornings',
 'MiddleDays',
 'Afternoons',
 'Day',
 'Months_1',
 'Months_2',
 'Months_3',
 'Months_4',
 'Months_5',
 'Months_6',
 'Months_7',
 'Months_8',
 'Months_9',
 'Months_10',
 'Months_11',
 'Months_12',
 'Hours',
 'DayOfWeek_Friday',
 'DayOfWeek_Monday',
 'DayOfWeek_Saturday',
 'DayOfWeek_Sunday',
 'DayOfWeek_Thursday',
 'DayOfWeek_Tuesday',
 'DayOfWeek_Wednesday',
 'PdDistrict_BAYVIEW',
 'PdDistrict_CENTRAL',
 'PdDistrict_INGLESIDE',
 'PdDistrict_MISSION',
 'PdDistrict_NORTHERN',
 'PdDistrict_PARK',
 'PdDistrict_RICHMOND',
 'PdDistrict_SOUTHERN',
 'PdDistrict_TARAVAL',
 'PdDistrict_TENDERLOIN',
 'Zones_2',
 'Zones_3',
 'Zones_4',
 'Zones_5',
 'Zones_6',
 'Zones_7',
 'Zones_8',
 'Zones_9',
 'Zones_15',
 'Zones_16',
 'Zones_17',
 'Zones_18',
 'Zones_19',
 'Zones_20',
 'Zones_21',
 'Zones_22',
 'Zones_23',
 'Zones_24',
 'Zones_25',
 'Zones_30',
 'Zones_31',
 'Zones_32',
 'Zones_33',
 'Zones_34',
 'Zones_35',
 'Zones_36',
 '

In [49]:
X_train['Y'].median()

37.775420706711

### Сохранение модели

In [50]:
import pickle 
from joblib import dump, load

In [51]:
dump(clf, 'model.joblib')

['model.joblib']