In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score as score
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

  from numpy.core.umath_tests import inner1d


In [2]:
# Directory holding train.csv / test.csv. Defaults to the original location;
# set the SFCC_DATA_DIR environment variable to point somewhere else without
# editing the notebook.
DATA_DIR = Path(os.environ.get(
    "SFCC_DATA_DIR",
    "C:\\Users\\mohamedelhedi\\Desktop\\python san fansico",
))

# 'Dates' is parsed to datetime64 at load time so the .dt accessor can be used later.
train_data = pd.read_csv(str(DATA_DIR / "train.csv"), parse_dates=['Dates'])
test_data = pd.read_csv(str(DATA_DIR / "test.csv"), parse_dates=['Dates'])

print("The size of the train data is:", train_data.shape)
print("The size of the test data is:", test_data.shape)

The size of the train data is: (878049, 9)
The size of the test data is: (884262, 7)


In [3]:
# Preview the first five training rows.
train_data.head(n=5)

Unnamed: 0,Dates,Category,Descript,DayOfWeek,PdDistrict,Resolution,Address,X,Y
0,2015-05-13 23:53:00,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,100 Block of BRODERICK ST,-122.438738,37.771541


In [4]:
# Preview the first five test rows.
test_data.head(n=5)

Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412


In [5]:
# Count how many training columns there are of each dtype.
train_data.dtypes.value_counts()

object            6
float64           2
datetime64[ns]    1
dtype: int64

In [6]:
# Count how many test columns there are of each dtype.
test_data.dtypes.value_counts()

object            3
float64           2
int64             1
datetime64[ns]    1
dtype: int64

In [7]:
# Number of missing values per training column.
train_data.isna().sum()

Dates         0
Category      0
Descript      0
DayOfWeek     0
PdDistrict    0
Resolution    0
Address       0
X             0
Y             0
dtype: int64

In [8]:
# Number of missing values per test column.
test_data.isna().sum()

Id            0
Dates         0
DayOfWeek     0
PdDistrict    0
Address       0
X             0
Y             0
dtype: int64

In [9]:
# List the column labels of the training frame.
train_data.columns

Index(['Dates', 'Category', 'Descript', 'DayOfWeek', 'PdDistrict',
       'Resolution', 'Address', 'X', 'Y'],
      dtype='object')

In [10]:
# Frequency of each value of the target column, most common first.
train_data['Category'].value_counts()

LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS 

In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace the string target with integer codes. `le` keeps the fitted
# mapping so the codes can be translated back later.
le = LabelEncoder()
le.fit(train_data['Category'])
train_data['Category'] = le.transform(train_data['Category'])

train_data['Category'].head()

0    37
1    21
2    21
3    16
4    16
Name: Category, dtype: int64

In [12]:
# Number of training rows per police district, most common first.
train_data['PdDistrict'].value_counts()

SOUTHERN      157182
MISSION       119908
NORTHERN      105296
BAYVIEW        89431
CENTRAL        85460
TENDERLOIN     81809
INGLESIDE      78845
TARAVAL        65596
PARK           49313
RICHMOND       45209
Name: PdDistrict, dtype: int64

In [13]:
# One-hot encode the categorical columns in both frames.
feature_cols = ['DayOfWeek', 'PdDistrict']
train_data = pd.get_dummies(train_data, columns=feature_cols)
test_data = pd.get_dummies(test_data, columns=feature_cols)

# The two frames are encoded independently, so nothing above guarantees that
# they end up with the same indicator columns. Force the test frame to carry
# exactly the training indicator columns, in the training order: indicators
# absent from test are added as all-zero, indicators unseen in training are
# dropped. The non-indicator test columns keep their original order.
dummy_prefixes = tuple(col + '_' for col in feature_cols)
train_dummy_cols = [c for c in train_data.columns if c.startswith(dummy_prefixes)]
test_other_cols = [c for c in test_data.columns if not c.startswith(dummy_prefixes)]
test_data = test_data.reindex(columns=test_other_cols + train_dummy_cols, fill_value=0)

# Show a bounded preview instead of rendering the whole frame.
train_data.head(10)

Unnamed: 0,Dates,Category,Descript,Resolution,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,2015-05-13 23:53:00,37,WARRANT ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,2015-05-13 23:53:00,21,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,2015-05-13 23:33:00,21,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,2015-05-13 23:30:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,2015-05-13 23:30:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,2015-05-13 23:30:00,16,GRAND THEFT FROM UNLOCKED AUTO,NONE,0 Block of TEDDY AV,-122.403252,37.713431,0,0,0,...,0,0,1,0,0,0,0,0,0,0
6,2015-05-13 23:30:00,36,STOLEN AUTOMOBILE,NONE,AVALON AV / PERU AV,-122.423327,37.725138,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,2015-05-13 23:30:00,36,STOLEN AUTOMOBILE,NONE,KIRKWOOD AV / DONAHUE ST,-122.371274,37.727564,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,2015-05-13 23:00:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,600 Block of 47TH AV,-122.508194,37.776601,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,2015-05-13 23:00:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,JEFFERSON ST / LEAVENWORTH ST,-122.419088,37.807802,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [14]:
# Show a bounded preview of the encoded test frame instead of rendering all rows.
test_data.head(10)

Unnamed: 0,Id,Dates,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,...,PdDistrict_BAYVIEW,PdDistrict_CENTRAL,PdDistrict_INGLESIDE,PdDistrict_MISSION,PdDistrict_NORTHERN,PdDistrict_PARK,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN
0,0,2015-05-10 23:59:00,2000 Block of THOMAS AV,-122.399588,37.735051,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
1,1,2015-05-10 23:51:00,3RD ST / REVERE AV,-122.391523,37.732432,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,2,2015-05-10 23:50:00,2000 Block of GOUGH ST,-122.426002,37.792212,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,3,2015-05-10 23:45:00,4700 Block of MISSION ST,-122.437394,37.721412,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,4,2015-05-10 23:45:00,4700 Block of MISSION ST,-122.437394,37.721412,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
5,5,2015-05-10 23:40:00,BROAD ST / CAPITOL AV,-122.459024,37.713172,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
6,6,2015-05-10 23:30:00,100 Block of CHENERY ST,-122.425616,37.739351,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
7,7,2015-05-10 23:30:00,200 Block of BANKS ST,-122.412652,37.739750,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
8,8,2015-05-10 23:10:00,2900 Block of 16TH ST,-122.418700,37.765165,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
9,9,2015-05-10 23:10:00,TAYLOR ST / GREEN ST,-122.413935,37.798886,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0


In [15]:
# Split the 'Dates' timestamp into one integer column per component
# ('years', 'months', 'days', 'hours', 'minutes', 'seconds') in both frames.
date_parts = ['year', 'month', 'day', 'hour', 'minute', 'second']
for frame in (train_data, test_data):
    timestamps = frame['Dates'].dt
    for part in date_parts:
        # Column name is the plural of the datetime attribute, e.g. 'year' -> 'years'.
        frame[part + 's'] = getattr(timestamps, part)

In [16]:
# Preview the training frame with the new date-part columns.
train_data.head(n=5)

Unnamed: 0,Dates,Category,Descript,Resolution,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,years,months,days,hours,minutes,seconds
0,2015-05-13 23:53:00,37,WARRANT ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,0,0,...,0,0,0,0,2015,5,13,23,53,0
1,2015-05-13 23:53:00,21,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",OAK ST / LAGUNA ST,-122.425892,37.774599,0,0,0,...,0,0,0,0,2015,5,13,23,53,0
2,2015-05-13 23:33:00,21,TRAFFIC VIOLATION ARREST,"ARREST, BOOKED",VANNESS AV / GREENWICH ST,-122.424363,37.800414,0,0,0,...,0,0,0,0,2015,5,13,23,33,0
3,2015-05-13 23:30:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,1500 Block of LOMBARD ST,-122.426995,37.800873,0,0,0,...,0,0,0,0,2015,5,13,23,30,0
4,2015-05-13 23:30:00,16,GRAND THEFT FROM LOCKED AUTO,NONE,100 Block of BRODERICK ST,-122.438738,37.771541,0,0,0,...,0,0,0,0,2015,5,13,23,30,0


In [17]:
# Preview the test frame with the new date-part columns.
test_data.head(n=5)

Unnamed: 0,Id,Dates,Address,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,years,months,days,hours,minutes,seconds
0,0,2015-05-10 23:59:00,2000 Block of THOMAS AV,-122.399588,37.735051,0,0,0,1,0,...,0,0,0,0,2015,5,10,23,59,0
1,1,2015-05-10 23:51:00,3RD ST / REVERE AV,-122.391523,37.732432,0,0,0,1,0,...,0,0,0,0,2015,5,10,23,51,0
2,2,2015-05-10 23:50:00,2000 Block of GOUGH ST,-122.426002,37.792212,0,0,0,1,0,...,0,0,0,0,2015,5,10,23,50,0
3,3,2015-05-10 23:45:00,4700 Block of MISSION ST,-122.437394,37.721412,0,0,0,1,0,...,0,0,0,0,2015,5,10,23,45,0
4,4,2015-05-10 23:45:00,4700 Block of MISSION ST,-122.437394,37.721412,0,0,0,1,0,...,0,0,0,0,2015,5,10,23,45,0


In [18]:
# 'Dates' has already been expanded into date-part columns; 'Address' and
# 'Resolution' are removed from the training frame as well.
unused_train_cols = ['Dates', 'Address', 'Resolution']
train_data = train_data.drop(unused_train_cols, axis='columns')

In [19]:
# Remove the free-text 'Descript' column, then preview the result.
train_data = train_data.drop('Descript', axis='columns')
train_data.head(n=5)

Unnamed: 0,Category,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,years,months,days,hours,minutes,seconds
0,37,-122.425892,37.774599,0,0,0,0,0,0,1,...,0,0,0,0,2015,5,13,23,53,0
1,21,-122.425892,37.774599,0,0,0,0,0,0,1,...,0,0,0,0,2015,5,13,23,53,0
2,21,-122.424363,37.800414,0,0,0,0,0,0,1,...,0,0,0,0,2015,5,13,23,33,0
3,16,-122.426995,37.800873,0,0,0,0,0,0,1,...,0,0,0,0,2015,5,13,23,30,0
4,16,-122.438738,37.771541,0,0,0,0,0,0,1,...,0,0,0,0,2015,5,13,23,30,0


In [20]:
# Remove the same raw columns from the test frame ('Id' is kept), then preview.
unused_test_cols = ['Dates', 'Address']
test_data = test_data.drop(unused_test_cols, axis='columns')
test_data.head(n=5)

Unnamed: 0,Id,X,Y,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday,...,PdDistrict_RICHMOND,PdDistrict_SOUTHERN,PdDistrict_TARAVAL,PdDistrict_TENDERLOIN,years,months,days,hours,minutes,seconds
0,0,-122.399588,37.735051,0,0,0,1,0,0,0,...,0,0,0,0,2015,5,10,23,59,0
1,1,-122.391523,37.732432,0,0,0,1,0,0,0,...,0,0,0,0,2015,5,10,23,51,0
2,2,-122.426002,37.792212,0,0,0,1,0,0,0,...,0,0,0,0,2015,5,10,23,50,0
3,3,-122.437394,37.721412,0,0,0,1,0,0,0,...,0,0,0,0,2015,5,10,23,45,0
4,4,-122.437394,37.721412,0,0,0,1,0,0,0,...,0,0,0,0,2015,5,10,23,45,0


In [21]:
# Every column except the encoded target is used as a feature.
feature_cols = [col for col in train_data.columns if col != 'Category']
X = train_data[feature_cols]
y = train_data['Category']

# Hold out part of the labelled data for evaluation (default split proportion).
# A fixed random_state makes the split, and every metric computed from it
# below, reproducible across kernel restarts.
X_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [22]:
# Decision tree baseline. max_features=10 makes the tree consider a random
# subset of features at each split, so random_state is fixed to keep the
# fitted tree (and the numbers below) reproducible.
DTC = DecisionTreeClassifier(criterion='gini', max_features=10, max_depth=9,
                             random_state=42)
DTC = DTC.fit(X_train, y_train)

# Predictions on the held-out rows and on the rows the tree was fitted on.
y_pred_DTC = DTC.predict(x_test)
y_pred_test_DTC = DTC.predict(X_train)  # despite the name: predictions for X_train

print("score is {:.3f}".format(score(y_test, y_pred_DTC, average='micro') * 100))
print("Accuracy for the test data is {:.3f} ".format(accuracy_score(y_test, y_pred_DTC) * 100))
print("Accuracy for the train data is {:.3f} ".format(accuracy_score(y_train, y_pred_test_DTC) * 100))
# This is the decision tree's accuracy; the previous label called it the voting classifier.
print("acc decision tree:", accuracy_score(y_test, y_pred_DTC))



score is 26.354
Accuracy for the test data is 26.354 
Accuracy for the train data is 26.590 
acc voting classifier: 0.26354247812202464


In [23]:
from sklearn.metrics import f1_score
from sklearn.kernel_approximation import Nystroem

# Approximate RBF-kernel classifier: Nystroem feature map followed by a
# linear SGD classifier. Both steps are randomised, so both are seeded to
# make the run reproducible.
# NOTE(review): X_train is passed in unscaled; an RBF map is usually
# sensitive to feature scale, which may explain the low accuracy - confirm.
nystroemSVC = Nystroem(kernel='rbf', random_state=42)
sgd = SGDClassifier(random_state=42)

# Fit the feature map on the training rows only, then apply it to the held-out rows.
X_train_svc = nystroemSVC.fit_transform(X_train)
X_test_svc = nystroemSVC.transform(x_test)

linSVC = sgd.fit(X_train_svc, y_train)
y_pred_svc = linSVC.predict(X_test_svc)
y_pred_test_svc = linSVC.predict(X_train_svc)  # despite the name: predictions for the training rows

print("score is {:.3f}".format(score(y_test, y_pred_svc, average='micro') * 100))
print("Accuracy for the test data is {:.3f}".format(accuracy_score(y_test, y_pred_svc) * 100))
print("Accuracy for the train data is {:.3f}".format(accuracy_score(y_train, y_pred_test_svc) * 100))




score is 7.177
Accuracy for the test data is 7.177
Accuracy for the train data is 7.264


In [24]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline on the raw feature columns.
# random_state is fixed so solvers that shuffle the data give repeatable results.
lr = LogisticRegression(dual=False, tol=0.1, random_state=42)
lr.fit(X_train, y_train)

y_pred_test_lr = lr.predict(X_train)  # despite the name: predictions for X_train
y_pred_lr = lr.predict(x_test)

print("score is {:.3f}".format(score(y_test, y_pred_lr, average='micro') * 100))
print("Accuracy for the test data is {:.3f}".format(accuracy_score(y_test, y_pred_lr) * 100))
print("Accuracy for the train data is {:.3f} ".format(accuracy_score(y_train, y_pred_test_lr) * 100))
# This is the logistic regression's accuracy; the previous label called it the voting classifier.
print("acc logistic regression:", accuracy_score(y_test, y_pred_lr))

score is 19.919
Accuracy for the test data is 19.919
Accuracy for the train data is 19.919 
acc voting classifier: 0.19919093629990023


In [25]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the hinge loss.
# SGD shuffles the training data, so random_state is fixed for reproducibility.
svm = SGDClassifier(max_iter=13, loss="hinge", random_state=42)
svm.fit(X_train, y_train)

y_pred_test_svm = svm.predict(X_train)  # despite the name: predictions for X_train
y_pred_svm = svm.predict(x_test)

print("score is {:.3f}".format(score(y_test, y_pred_svm, average='micro') * 100))
print("Accuracy for the test data is {:.3f}".format(accuracy_score(y_test, y_pred_svm) * 100))
print("Accuracy for the train data is {:.3f} ".format(accuracy_score(y_train, y_pred_test_svm) * 100))
# This is the SGD model's accuracy; the previous label called it the voting classifier.
print("acc SGD hinge classifier:", accuracy_score(y_test, y_pred_svm))

score is 19.691
Accuracy for the test data is 19.691
Accuracy for the train data is 19.607 
acc voting classifier: 0.1969131668739437


In [26]:
 from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(X_train, y_train)
y_pred_test_knn = knn.predict(X_train)
y_pred_knn = knn.predict(x_test)
print("acc voting classifier:", accuracy_score(y_test,y_pred_knn))

acc voting classifier: 0.23616824516087886


In [27]:
from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble over the four models defined in the earlier cells.
voting_members = [
    ('lr', lr),
    ('knn', knn),
    ('DTC', DTC),
    ('svm', svm),
]
vc = VotingClassifier(estimators=voting_members, voting='hard', n_jobs=-1)

# Despite its name, y_pred_vc receives the return value of fit(), not
# predictions; the binding is kept unchanged from the original cell.
y_pred_vc = vc.fit(X_train, y_train)

# Ensemble predictions for the held-out rows.
y_pred_test_vc = vc.predict(x_test)
print("acc voting classifier:", accuracy_score(y_test, y_pred_test_vc))


acc voting classifier: 0.2412613375973177


  if diff:


In [28]:
# Same held-out accuracy as above, expressed as a percentage.
vc_test_accuracy = accuracy_score(y_test, y_pred_test_vc)
print("acc voting classifier:", vc_test_accuracy * 100)


acc voting classifier: 24.12613375973177


In [29]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the voting ensemble on the held-out rows
# (rows: true class, columns: predicted class).
cmat = confusion_matrix(y_true=y_test, y_pred=y_pred_test_vc)
cmat

array([[   0,   49,    0, ...,    7,    2,    0],
       [   0, 1498,    0, ...,  238,   12,    0],
       [   0,    2,    0, ...,    1,    0,    0],
       ...,
       [   0,  305,    0, ..., 1061,    3,    0],
       [   0,  679,    0, ...,   20,   27,    0],
       [   0,  180,    0, ...,   20,    2,    0]], dtype=int64)

In [32]:
# Build the feature matrix for the unlabeled test set. Columns are selected
# in exactly the order the model was trained on, so a missing or reordered
# column fails loudly instead of silently feeding the wrong feature.
X_test = test_data.drop(['Id'], axis=1)
X_test = X_test[list(X_train.columns)]

# 'Category' was replaced by integer codes with `le` before training, so the
# tree predicts codes. Translate them back to the original category names;
# otherwise the submission file would contain integers.
my_prediction = le.inverse_transform(DTC.predict(X_test))

SFCC_submission_final = pd.DataFrame({'Id': test_data.Id, 'Category': my_prediction})
print(SFCC_submission_final.shape)
SFCC_submission_final.to_csv("C:\\Users\\mohamedelhedi\\Desktop\\python san fansico\\SFCC_prediction.csv", index = False)



(884262, 2)
