In [1]:
%matplotlib inline

# General libraries.
import datetime
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn import metrics

In [2]:
# Load the raw train and test user tables.
train = pd.read_csv("train_users_2.csv")
test = pd.read_csv("test_users.csv")

# Tag every row with its origin so the combined frame can be split back
# apart after the shared pre-processing (1 = train, 0 = test).
train["train"] = 1
test["train"] = 0

# Stack both tables for joint pre-processing. ignore_index=True gives the
# combined frame a unique 0..N-1 index; without it the default row labels of
# `train` and `test` (both start at 0) would appear twice in `data`.
data = pd.concat([train, test], axis=0, ignore_index=True)

In [3]:
# Parse both date columns into datetime values
# (missing booking dates show up as NaT in the preview).
for date_col in ("date_account_created", "date_first_booking"):
    data[date_col] = pd.to_datetime(data[date_col])

# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,affiliate_channel,affiliate_provider,age,country_destination,date_account_created,date_first_booking,first_affiliate_tracked,first_browser,first_device_type,gender,id,language,signup_app,signup_flow,signup_method,timestamp_first_active,train
0,direct,direct,,NDF,2010-06-28,NaT,untracked,Chrome,Mac Desktop,-unknown-,gxn3p5htnn,en,Web,0,facebook,20090319043255,1
1,seo,google,38.0,NDF,2011-05-25,NaT,untracked,Chrome,Mac Desktop,MALE,820tgsjxq7,en,Web,0,facebook,20090523174809,1
2,direct,direct,56.0,US,2010-09-28,2010-08-02,untracked,IE,Windows Desktop,FEMALE,4ft3gnwmtx,en,Web,3,basic,20090609231247,1
3,direct,direct,42.0,other,2011-12-05,2012-09-08,untracked,Firefox,Mac Desktop,FEMALE,bjjt8pjhuk,en,Web,0,facebook,20091031060129,1
4,direct,direct,41.0,US,2010-09-14,2010-02-18,untracked,Chrome,Mac Desktop,-unknown-,87mebub9p4,en,Web,0,basic,20091208061105,1


In [4]:
# Assemble the feature matrix used for modelling, one row per row of `data`.
gender = data['gender']
features = pd.DataFrame()
features["male"] = (gender == "MALE").astype(int)      # 1 when gender is MALE
features["female"] = (gender == "FEMALE").astype(int)  # 1 when gender is FEMALE

# One-hot encode the signup method ("sm_*") and the first device type
# ("dev_*"), appending the indicator columns in that order.
signup_dummies = pd.get_dummies(data["signup_method"], prefix="sm")
device_dummies = pd.get_dummies(data["first_device_type"], prefix="dev")
features = pd.concat([features, signup_dummies, device_dummies], axis=1)

# Age: keep an indicator for missing values, then fill the gaps with 0.
age = data['age']
features['age_null'] = age.isnull().astype(int)
features['age'] = age.fillna(0)

# Calendar month taken from the account-creation date.
features['create_month'] = data['date_account_created'].map(lambda created: created.month)

features.head()
#features.dtypes

Unnamed: 0,male,female,sm_basic,sm_facebook,sm_google,sm_weibo,dev_Android Phone,dev_Android Tablet,dev_Desktop (Other),dev_Mac Desktop,dev_Other/Unknown,dev_SmartPhone (Other),dev_Windows Desktop,dev_iPad,dev_iPhone,age_null,age,create_month
0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,6
1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,38,5
2,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,56,9
3,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,42,12
4,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,41,9


In [5]:
# Undo the train/test stacking: pick feature rows by the origin flag
# stored in data["train"] (1 = train, 0 = test).
is_train_row = data["train"] == 1
is_test_row = data["train"] == 0
train_features = features[is_train_row]
test_features = features[is_test_row]

In [6]:
# Model inputs: X holds the training feature rows (independent variables),
# Y the country_destination column of the training table (dependent variable).
X, Y = train_features, train['country_destination']

In [7]:
# Despite its name, num_test is the number of rows in the training features.
num_test = len(train_features)

# Hold out the first quarter of the training rows as a development set and
# keep the remaining rows for fitting. Floor division (//) keeps the slice
# bound an integer on Python 3 as well as Python 2; with plain `/` the bound
# becomes a float on Python 3 and the slices fail.
dev_size = num_test // 4
dev_data, dev_labels = X[:dev_size], Y[:dev_size]
train_data, train_labels = X[dev_size:], Y[dev_size:]

# Report total / development / training row counts.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(num_test)
print(len(dev_data))
print(len(train_data))

213451
53362
160089


In [8]:
# Fit a logistic regression with default settings on the training split.
lm = LogisticRegression()
lm.fit(train_data, train_labels)

# Predict destinations for the held-out development split.
preds = lm.predict(dev_data)

# Weighted F1 score of the model on the development split.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("Logistic Regression. F1: %f" % (metrics.f1_score(dev_labels, preds, average="weighted")))

# Baseline: predict "NDF" (treated here as the most common class) for every
# development row, built in one call instead of an append loop.
preds2 = np.repeat("NDF", preds.shape[0])
print("Most common class.   F1: %f" % (metrics.f1_score(dev_labels, preds2, average="weighted")))

Logistic Regression. F1: 0.516574
Most common class.   F1: 0.357742


  'precision', 'predicted', average, warn_for)


In [9]:
# Predict a destination for every test-set feature row using `lm`, the
# logistic regression model fitted in In [8].
test_preds = lm.predict(test_features)

In [10]:
# Build the submission table: test user ids next to their predicted country.
predicted_country = pd.DataFrame(test_preds, columns=["country"])
output = pd.concat([test["id"], predicted_country], axis=1)

# Write the table to disk without the index column.
output.to_csv("Submission.csv", index=False)

In [None]:
#data[["country_destination","id"]].groupby("country_destination").count().sort_values("id",ascending=False)

#Development-set results recorded from In [8] (weighted F1):
#Logistic Regression. F1: 0.516574
#Most common class.   F1: 0.357742