In [1]:
import time
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import log_loss
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
def make_Kaggle_file(predict_probabilities, columns, output_file_name="auto", decimal_limit=3, compress=True):
    """
    Write class-probability predictions to a CSV file, optionally gzipped.

    The predictions are placed in a DataFrame whose default 0..n_samples-1 index
    is named "Id", so the index is written as the leading "Id" column.

    Required imports: 
    import time
    import pandas as pd
    
    predict_probabilities: array-like of shape = [n_samples, n_classes]. Is the output of a 
        predict_proba method of a sklearn classifier
        
    columns: array or list of column names that are in the same order as the columns of the 
        predict_probabilities method. If LabelEncoder was used, is accessed via the classes_ 
        attribute. Don't include an "Id" column.
        
    output_file_name: If "auto" names it sf_crime_test_predictions_<YearMonthDay-HourMinuteSecond>.csv, 
        else uses the string entered as the file name. When compress is truthy, ".gz" is 
        appended to whichever name is used.
        
    decimal_limit: If None uses full precision, else formats predictions based on that precision. 
        Can significantly reduce the filesize and make writing the file faster.
        i.e. actual prediction = .2352452435, decimal_limit=2 --> .24, decimal_limit=3 --> .235, etc.
        
    compress: If True, gzips the resulting file which can be uploaded to Kaggle. Reduces upload size. 
        Set argument to None (or False) to write a plain, uncompressed CSV.

    Returns None. Prints the name of the file that was written.
    """
    predictions = pd.DataFrame(predict_probabilities, columns=columns)
    predictions.index.name = "Id"

    if output_file_name == "auto":
        timestr = time.strftime("%Y%m%d-%H%M%S")
        output_file_name = "sf_crime_test_predictions_" + timestr + ".csv"

    # Build a printf-style format such as '%.3f'; a falsy limit keeps full precision.
    float_format = ('%%.%df' % decimal_limit) if decimal_limit else None

    # Bind compression unconditionally: previously it was only assigned when
    # compress was truthy, so compress=None/False crashed on an unassigned local.
    compression = None
    if compress:
        compression = "gzip"
        output_file_name += ".gz"

    predictions.to_csv(output_file_name, float_format=float_format, compression=compression)
    print("Finished writing file: ", output_file_name)

In [3]:
# Load just the target column and the two coordinate features from the training CSV
X = pd.read_csv("../../data/train.csv", usecols=["Category", "X", "Y"])
# pop() removes "Category" from X in place and returns it as the target
y = X.pop('Category')

# Encode the category labels as integers; the fitted encoder is kept so the
# class names can be read back later from labels.classes_
labels = LabelEncoder().fit(y)
y = labels.transform(y)

# Split into train and test parts to evaluate the baseline model
# (train_size=0.2 puts 20% of the rows in the training part; seed fixed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.2,
    random_state=42,
)

# Baseline: a dummy model that predicts all classes as equally likely
clf = DummyClassifier(strategy="uniform", random_state=42).fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)

# Expected log loss on the test part (displayed as the cell output)
log_loss(y_test, y_pred)

3.6635616461296463

In [4]:
# Load the coordinate features of the rows to predict and score them with the baseline model
X_predict = pd.read_csv("../../data/test.csv", usecols=["X", "Y"])
final_predictions = clf.predict_proba(X_predict)

# Write the predictions to a file to be submitted to Kaggle (Kaggle score = 3.66356);
# labels.classes_ supplies the column names in the encoder's class order
make_Kaggle_file(
    predict_probabilities=final_predictions,
    columns=labels.classes_,
    decimal_limit=2,
)

Finished writing file:  sf_crime_test_predictions_20160327-205911.csv.gz
