In [17]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder

train_df = pd.read_csv("data/train.csv").drop(["Descript", "Resolution", "Address"], axis=1)
test_df = pd.read_csv("data/test.csv").drop(["Id", "Address"], axis=1)
train_df.head()

Unnamed: 0,Dates,Category,DayOfWeek,PdDistrict,X,Y
0,2015-05-13 23:53:00,WARRANTS,Wednesday,NORTHERN,-122.425892,37.774599
1,2015-05-13 23:53:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.425892,37.774599
2,2015-05-13 23:33:00,OTHER OFFENSES,Wednesday,NORTHERN,-122.424363,37.800414
3,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,NORTHERN,-122.426995,37.800873
4,2015-05-13 23:30:00,LARCENY/THEFT,Wednesday,PARK,-122.438738,37.771541


In [18]:
class Preprocessor:
    def __init__(self, train_df, test_df):
        self.train_df = train_df
        self.test_df = test_df
            
        self.le_target = LabelEncoder()
        self.le_district = LabelEncoder()
        self.dow_dict = {
            "Monday": 0,
            "Tuesday": 1,
            "Wednesday": 2,
            "Thursday": 3,
            "Friday": 4,
            "Saturday": 5,
            "Sunday": 6
        }
        
    
    def run(self):
        print("preprocessing step: 1/3", end="\r")
        self.fill_na()
        print("preprocessing step: 2/3", end="\r")
        self.encode_label()
        print("preprocessing step: 3/3")
        self.encode_time()
        
        return self.train_df, self.test_df, self.le_target
    
    
    def fill_na(self):
        self.train_df.DayOfWeek = self.train_df.DayOfWeek.fillna("Friday")
        self.train_df.PdDistrict = self.train_df.PdDistrict.fillna("SOUTHERN")
        self.train_df.X = self.train_df.X.fillna(self.train_df.X.mean())
        self.train_df.Y = self.train_df.Y.fillna(self.train_df.Y.mean())
        
        self.test_df.DayOfWeek = self.test_df.DayOfWeek.fillna("Friday")
        self.test_df.PdDistrict = self.test_df.PdDistrict.fillna("SOUTHERN")
        self.test_df.X = self.test_df.X.fillna(self.train_df.X.mean())
        self.test_df.Y = self.test_df.X.fillna(self.train_df.Y.mean())

    
    def encode_label(self):
        self.train_df.Category = self.le_target.fit_transform(self.train_df.Category)
        self.train_df.PdDistrict = self.le_district.fit_transform(self.train_df.PdDistrict)
        self.test_df.PdDistrict = self.le_district.transform(self.test_df.PdDistrict)
    
    
    def encode_time(self):
        def split_time(df):
            df.Dates = pd.to_datetime(df.Dates)
            df.loc[:, "year"] = df.Dates.dt.year
            df.loc[:, "month"] = df.Dates.dt.month
            df.loc[:, "day"] = df.Dates.dt.day
            df.loc[:, "hour"] = df.Dates.dt.hour
            df.loc[:, "min"] = df.Dates.dt.minute
            
            df = df.drop("Dates", axis=1)
            return df
        
        self.train_df = split_time(self.train_df)
        self.test_df = split_time(self.test_df)
        
        self.train_df.DayOfWeek = self.train_df.DayOfWeek.map(lambda x: self.dow_dict[x])
        self.test_df.DayOfWeek = self.test_df.DayOfWeek.map(lambda x: self.dow_dict[x])
        
        
preprocessor = Preprocessor(train_df, test_df)
train_df, test_df, le_target = preprocessor.run()
train_df.head()

preprocessing step: 3/3


Unnamed: 0,Category,DayOfWeek,PdDistrict,X,Y,year,month,day,hour,min
0,37,2,4,-122.425892,37.774599,2015,5,13,23,53
1,21,2,4,-122.425892,37.774599,2015,5,13,23,53
2,21,2,4,-122.424363,37.800414,2015,5,13,23,33
3,16,2,4,-122.426995,37.800873,2015,5,13,23,30
4,16,2,5,-122.438738,37.771541,2015,5,13,23,30


In [14]:
from pycaret.classification import *

clf = setup(data=train_df, fold_strategy="stratifiedkfold",
            target="Category", fold=3)
lgbm = create_model("lightgbm")
tuned_model = tune_model(lgbm)
final_model = finalize_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2948,0.7262,0.0928,0.2602,0.2177,0.1688,0.1834
1,0.2931,0.7232,0.0916,0.2496,0.2153,0.1665,0.1811
2,0.2888,0.7097,0.0908,0.2325,0.2098,0.1608,0.1759
Mean,0.2922,0.7197,0.0917,0.2475,0.2143,0.1654,0.1801
Std,0.0025,0.0072,0.0008,0.0114,0.0033,0.0034,0.0032




In [19]:
predictions = predict_model(final_model, test_df, raw_score=True)
predictions = predictions.iloc[:, 10:].reset_index()
header = ["Id"] + le_target.inverse_transform(range(39)).tolist()
predictions.columns = header
predictions.to_csv("submission.csv", index=False)
predictions.head()

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0.0056,0.1166,0.0,0.0002,0.017,0.002,0.0051,0.0327,0.0046,...,0.0,0.0051,0.0002,0.044,0.0,0.0041,0.0678,0.1695,0.0476,0.0338
1,1,0.0122,0.1455,0.0,0.0004,0.0148,0.0037,0.0183,0.056,0.0063,...,0.0,0.0063,0.0002,0.0416,0.0,0.0053,0.0638,0.0259,0.1068,0.0425
2,2,0.0048,0.1815,0.0,0.0007,0.0191,0.0055,0.0077,0.0301,0.0046,...,0.0,0.0069,0.0002,0.0351,0.0,0.0067,0.067,0.0325,0.0415,0.0241
3,3,0.0033,0.1715,0.0,0.0014,0.0234,0.0047,0.0066,0.0227,0.0043,...,0.0,0.0058,0.0005,0.0365,0.0,0.0044,0.0703,0.0767,0.0293,0.0164
4,4,0.0033,0.1715,0.0,0.0014,0.0234,0.0047,0.0066,0.0227,0.0043,...,0.0,0.0058,0.0005,0.0365,0.0,0.0044,0.0703,0.0767,0.0293,0.0164
