In [37]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
from sklearn import preprocessing

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn import metrics

In [2]:
# Read the raw training and test user tables from CSV with pandas.
train_data = pd.read_csv('data/train_users_2.csv')
test_data = pd.read_csv('data/test_users.csv')

# Report the size of the training table, then preview its first rows
# between two separator lines.
print("Train data shape: {0}".format(train_data.shape))
print("\n")
print("--------------Top 5 Rows----------------------")
print(train_data.head())
print("----------------------------------------------")

Train data shape: (213451, 16)


--------------Top 5 Rows----------------------
           id date_account_created  timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28          20090319043255                NaN   
1  820tgsjxq7           2011-05-25          20090523174809                NaN   
2  4ft3gnwmtx           2010-09-28          20090609231247         2010-08-02   
3  bjjt8pjhuk           2011-12-05          20091031060129         2012-09-08   
4  87mebub9p4           2010-09-14          20091208061105         2010-02-18   

      gender  age signup_method  signup_flow language affiliate_channel  \
0  -unknown-  NaN      facebook            0       en            direct   
1       MALE   38      facebook            0       en               seo   
2     FEMALE   56         basic            3       en            direct   
3     FEMALE   42      facebook            0       en            direct   
4  -unknown-   41         basic            0       en     

In [3]:
# Helper function to extract features.
def ExtractFeatures(data):
    """Build an integer feature matrix from a users DataFrame.

    Five categorical columns (gender, signup_method, language,
    first_device_type, first_browser) are each label-encoded into one
    integer column, and the age column (missing values replaced by 0) is
    appended as the last column.

    Args:
        data: pandas DataFrame holding the five categorical columns named
            above plus an "age" column. It is not modified.

    Returns:
        A 2-D numpy array of ints with one row per row of `data` and six
        columns: the encoded categorical columns in the order listed above,
        followed by age.

    NOTE(review): a fresh LabelEncoder is fitted for every column on every
    call, so the integer codes depend on the distinct values present in
    `data`. Frames encoded by separate calls only share codes for a column
    when that column holds the same set of distinct values in both.
    """
    # Choose a subset of categorical columns.
    categorical_columns = ["gender", "signup_method", "language", "first_device_type", "first_browser"]
    categorical_values = np.array(data[categorical_columns])

    # Label-encode every categorical column independently.
    encoded_columns = [
        preprocessing.LabelEncoder().fit_transform(categorical_values[:, i])
        for i in range(categorical_values.shape[1])
    ]

    # Clean up age data: missing ages become 0. fillna returns a new Series,
    # so the caller's DataFrame keeps its original age values.
    ages = np.array(data["age"].fillna(0))

    # Place the encoded columns and age side by side, then convert the
    # whole matrix to ints.
    features_data = np.column_stack(encoded_columns + [ages])
    return features_data.astype(int)

# Build the feature matrices for the training and test tables.
train_data_x = ExtractFeatures(train_data)
test_data_x = ExtractFeatures(test_data)
print("Train data shape: {0}".format(train_data_x.shape))
print("Test data shape: {0}".format(test_data_x.shape))

# Pull the target column out of the training table as a numpy array.
train_data_y = np.array(train_data["country_destination"])
print("Train labels shape: {0}".format(train_data_y.shape))

Train data shape: (213451, 6)
Test data shape: (62096, 6)
Train labels shape: (213451,)


In [4]:
# Split train data into training and dev sets: the first four fifths of the
# rows (in file order) are used for training, the remainder for dev.
# Floor division keeps the split point an int even under true division
# (Python 3 or `from __future__ import division`), where `/` would yield a
# float that cannot be used as a slice bound.
train_data_size = len(train_data_x) // 5 * 4
train_x, train_y = train_data_x[:train_data_size], train_data_y[:train_data_size]
dev_x, dev_y = train_data_x[train_data_size:], train_data_y[train_data_size:]
print("{0} {1}".format(train_x.shape, train_y.shape))
print("{0} {1}".format(dev_x.shape, dev_y.shape))

(170760, 6) (170760,)
(42691, 6) (42691,)


In [13]:
# Fit a multinomial Naive Bayes classifier on the training split.
mnb = MultinomialNB().fit(train_x, train_y)

# Show the class labels the model learned and how many training rows fell
# into each, then score the model on the dev split.
print("Classes: {0}".format(mnb.classes_))
print("Class count: {0}".format(mnb.class_count_))
print("Accuracy: {0}".format(mnb.score(dev_x, dev_y)))
predictions = mnb.predict(dev_x)
print("F1 score: {0}".format(metrics.f1_score(dev_y, predictions, average="weighted")))

Classes: ['AU' 'CA' 'DE' 'ES' 'FR' 'GB' 'IT' 'NDF' 'NL' 'PT' 'US' 'other']
Class count: [   462.   1128.    911.   1831.   4175.   1889.   2271.  98192.    602.
    163.  51033.   8103.]
Accuracy: 0.591506406503
F1 score: 0.561747196972


In [9]:
# Write a submission with one predicted country per test user, using the
# multinomial model.
test_predictions = mnb.predict(test_data_x)
output = pd.concat(
    [test_data["id"], pd.DataFrame(test_predictions, columns=["country"])],
    axis=1
)
# Update version everytime!
version = 1
output.to_csv("submissions/shared_submission_{0}.csv".format(version), index=False)

In [34]:
# Predict the top 5 countries for each test example using the multinomial
# model, and write them as a submission with one row per (id, country) pair,
# most likely country first.
prob_est = mnb.predict_proba(test_data_x)
# Column indices of the (up to) 5 highest probabilities in every row, best
# first. Ranking by position instead of looking each score up by value keeps
# tied probabilities from naming the same country twice; the stable sort
# breaks ties in favour of the class that comes first in mnb.classes_.
top_5 = np.argsort(-prob_est, axis=1, kind="mergesort")[:, :5]
# Repeat every id once per ranked country so the ids line up with top_5
# flattened row by row.
ids = np.repeat(np.array(test_data["id"]), top_5.shape[1])
predictions = list(zip(ids, mnb.classes_[top_5.ravel()]))
output2 = pd.DataFrame(predictions, columns=["id", "country"])
version = 2
output2.to_csv("submissions/shared_submission_{0}.csv".format(version), index=False)
# Ranks 1142 with score of 0.79584.

In [40]:
# Fit an L2-penalised logistic regression on the training split and report
# its weighted F1 score on the dev split.
lr = LogisticRegression(penalty='l2').fit(train_x, train_y)
lr_predictions = lr.predict(dev_x)
print("F1 score: {0}".format(metrics.f1_score(dev_y, lr_predictions, average="weighted")))

F1 score: 0.538665051038


In [41]:
# Predict the top 5 countries for each test example using the logistic
# regression model, and write them as a submission with one row per
# (id, country) pair, most likely country first.
prob_est = lr.predict_proba(test_data_x)
# Column indices of the (up to) 5 highest probabilities in every row, best
# first. Ranking by position instead of looking each score up by value keeps
# tied probabilities from naming the same country twice; the stable sort
# breaks ties in favour of the class that comes first in lr.classes_.
top_5 = np.argsort(-prob_est, axis=1, kind="mergesort")[:, :5]
# Repeat every id once per ranked country so the ids line up with top_5
# flattened row by row.
ids = np.repeat(np.array(test_data["id"]), top_5.shape[1])
predictions = list(zip(ids, lr.classes_[top_5.ravel()]))
output3 = pd.DataFrame(predictions, columns=["id", "country"])
version = 3
output3.to_csv("submissions/shared_submission_{0}.csv".format(version), index=False)
# Ranks 1047 with score of 0.85894.