In [None]:
%matplotlib inline

In [None]:
import time
import pandas as pd
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18, and the
# old sklearn.cross_validation module was removed in 0.20. Try the current location
# first and fall back so that older installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

# Turns off pandas' chained-assignment warnings. If there are ever serious data
# errors, try removing this line.
pd.options.mode.chained_assignment = None

In [None]:
def make_Kaggle_file(predict_probabilities, columns, output_file_name="auto", decimal_limit=3):
    """
    Writes a CSV file of class probabilities that can be submitted to Kaggle.

    This takes a long time to run, so you shouldn't run it that often. Instead, just have
    good internal validation techniques so you don't have to check the public leaderboard.

    Required imports:
    import time
    import pandas as pd

    predict_probabilities: array-like of shape = [n_samples, n_classes]. Is the output of a
        predict_proba method of a sklearn classifier.

    columns: array or list of column names that are in the same order as the columns of
        predict_probabilities. If LabelEncoder was used, is accessed via the classes_
        attribute. Don't include an "Id" column; the row index is written as "Id".

    output_file_name: If "auto", names the file
        sf_crime_test_predictions_<YearMonthDay-HourMinuteSecond>.csv,
        else uses the string entered as the file name.

    decimal_limit: If None uses full precision, else the number of digits written after the
        decimal point (0 is valid and writes whole numbers). Can significantly reduce the
        file size and make writing the file faster.
        i.e. actual prediction = .2352452435, decimal_limit=2 --> .24, decimal_limit=3 --> .235, etc.

    Returns None; the file is written as a side effect and its name is printed.
    """
    predictions = pd.DataFrame(predict_probabilities, columns=columns)
    predictions.index.name = "Id"

    if output_file_name == "auto":
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output_file_name = "sf_crime_test_predictions_" + timestr + ".csv"

    # Compare against None explicitly: a plain truthiness test would treat
    # decimal_limit=0 as "no limit" and silently write full precision.
    if decimal_limit is None:
        float_format = None
    else:
        float_format = '%%.%df' % decimal_limit  # e.g. 3 -> '%.3f'

    predictions.to_csv(output_file_name, float_format=float_format)
    print("Finished writing file: ", output_file_name)

In [None]:
from sklearn.base import TransformerMixin

class FeatureEngineering(TransformerMixin):
    """
    Pipeline step that derives extra columns from the raw crime data.

    Each boolean constructor flag switches one group of derived columns on or off:

    date_features: Year, Month, Hour and Minute sliced out of the "Dates" strings.
    DayOfWeek_features: DayOfWeekNum (integer code of "DayOfWeek") and IsWeekend.
    PdDistrict_features: PdDistrictNum (integer code of "PdDistrict").
    Address_features: Intersection (1 when "Address" contains a "/").

    transform returns a new DataFrame; the frame passed in is left untouched.
    """

    def __init__(self, date_features=True, DayOfWeek_features=True, PdDistrict_features=True, Address_features=True):
        self.date_features = date_features
        self.DayOfWeek_features = DayOfWeek_features
        self.PdDistrict_features = PdDistrict_features
        self.Address_features = Address_features

    def fit(self, X, y=None):
        """
        Learns the PdDistrict -> integer encoding from X (when PdDistrict_features is on)
        so that every later transform call uses the same codes. Returns self.
        """
        if self.PdDistrict_features:
            self.pd_district_encoder_ = LabelEncoder().fit(X.PdDistrict)
        return self

    def transform(self, X, y=None):
        """
        Returns a copy of X with the enabled feature columns added.

        If PdDistrict_features is on and the data contains a district that was not present
        when fit was called, LabelEncoder.transform raises a ValueError.
        """
        # Work on a copy so the caller's DataFrame does not silently gain columns.
        X = X.copy()

        # Features from dates (string positions assume a "YYYY-MM-DD HH:MM..." layout)
        if self.date_features:
            X['Year'] = X.Dates.str[:4].astype(int) # Hypothesis: The distribution of crimes changed over time
            X['Month'] = X.Dates.str[5:7].astype(int) # H: Certain crimes occur during some months more than others
            X['Hour'] = X.Dates.str[11:13].astype(int) # H: Certain crimes occur at day, others at night
            X['Minute'] = X.Dates.str[14:16].astype(int) # H: Certain crimes are rounded to the nearest hour
            # Idea: Is holiday feature. H: Holidays --> Tourists --> Different types of crimes

        # Features from DayOfWeek
        if self.DayOfWeek_features:
            # Note: the numbering starts at Tuesday=0, so Saturday=4 and Sunday=5.
            X['DayOfWeekNum'] = X["DayOfWeek"].map({"Tuesday":0, "Wednesday":1,
                                                 "Thursday":2, "Friday":3,
                                                 "Saturday":4, "Sunday":5,
                                                 "Monday":6}) # H: Different days have different crime distributions
            X['IsWeekend'] = X["DayOfWeekNum"].isin([4, 5]).astype(int) # H: Weekends are special

        # Features from PdDistrict
        if self.PdDistrict_features:
            encoder = getattr(self, "pd_district_encoder_", None)
            if encoder is None:
                # transform called without a prior fit: fall back to encoding from this
                # data alone (codes are then only consistent within this one frame).
                encoder = LabelEncoder().fit(X.PdDistrict)
            X['PdDistrictNum'] = encoder.transform(X.PdDistrict) # H: Different districts have different crimes

        # Features from Address
        if self.Address_features:
            X['Intersection'] = X.Address.str.contains("/", regex=False).astype(int) # H: Intersections have unique crimes

        # Idea: Make categorical feature of all addresses based on number of crimes at the address
        # Idea: Make categorical feature of certain popular streets

        # Features from X & Y
        # Idea: Make a feature that corresponds to whether the crime was near the ocean.

        # Other ideas:
        # Certain crimes result in multiple observations at the same location and time (for example,
        # the first and second observations in the dataset). The crimes, warrant arrest
        # and traffic violation arrest, seem to go with each other.
        #    Specific feature ideas: Number of observations associated with crime. In this case, the value would be 2.
        #    Specific feature ideas: If these crimes are split between the training and test datasets, perhaps the crimes
        #        in the training data set would inform the crime in the test data set.
        return X
    

class JustNumerics(TransformerMixin):
    """Pipeline step that keeps only the columns whose dtype is not "object"."""

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Records the labels of the non-object columns of X and returns self."""
        column_dtypes = X.dtypes
        keep_mask = (column_dtypes != "object").values
        self.numeric_columns = column_dtypes.index[keep_mask]
        return self

    def transform(self, X, y=None):
        """Returns X restricted to the columns recorded by fit."""
        selected_columns = self.numeric_columns
        return X[selected_columns]

In [None]:
# Load the labelled training set; the features stay in X
X = pd.read_csv("../../data/train.csv")

# Pull the 'Category' target out of X and encode it as integers. The fitted
# encoder is kept in `labels` so labels.classes_ is available afterwards.
labels = LabelEncoder()
y = labels.fit_transform(X.pop('Category'))

In [None]:
# Hold out 20% of the labelled rows to evaluate the baseline model
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, random_state=42
)

# Baseline pipeline: engineered features -> non-object columns only -> Random Forest
clf = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
pipe = Pipeline([
    ("feature_engineering", FeatureEngineering()),
    ("just_numerics", JustNumerics()),
    ("RF", clf),
])
pipe.fit(X_train, y_train)
y_pred = pipe.predict_proba(X_test)

# Log loss on the held-out rows (shown as the cell output)
log_loss(y_test, y_pred)

In [None]:
%%time
# Read in the data to predict on (this cell only loads it; predict_proba is called separately)
X_predict = pd.read_csv("../../data/test.csv")

In [None]:
# Class probabilities for every row of X_predict, from the pipeline fitted on X_train
final_predictions = pipe.predict_proba(X_predict)

In [None]:
%%time
# Export predictions to file to be submitted to Kaggle (Kaggle score = 3.66356)
# NOTE(review): assumes the probability columns line up one-to-one with labels.classes_,
# i.e. every category is present in X_train - TODO confirm.
make_Kaggle_file(final_predictions, labels.classes_, decimal_limit=3)