In [57]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import binarize

%matplotlib inline

In [58]:
train_data = pd.read_csv('final_project_data/train_users_2.csv')
test_data = pd.read_csv('final_project_data/test_users.csv')
train_data.head()
#test_data.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination
0,gxn3p5htnn,2010-06-28,20090319043255,,-unknown-,,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,NDF
1,820tgsjxq7,2011-05-25,20090523174809,,MALE,38.0,facebook,0,en,seo,google,untracked,Web,Mac Desktop,Chrome,NDF
2,4ft3gnwmtx,2010-09-28,20090609231247,2010-08-02,FEMALE,56.0,basic,3,en,direct,direct,untracked,Web,Windows Desktop,IE,US
3,bjjt8pjhuk,2011-12-05,20091031060129,2012-09-08,FEMALE,42.0,facebook,0,en,direct,direct,untracked,Web,Mac Desktop,Firefox,other
4,87mebub9p4,2010-09-14,20091208061105,2010-02-18,-unknown-,41.0,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Chrome,US


In [71]:
#Data Cleaning
train_data.gender.replace('-unknown-', np.nan, inplace=True) #replace -unknown- with NaN
train_data.first_browser.replace('-unknown-', np.nan, inplace=True) #replace -unknown- with NaN

test_data.gender.replace('-unknown-', np.nan, inplace=True) #replace -unknown- with NaN
test_data.first_browser.replace('-unknown-', np.nan, inplace=True) #replace -unknown- with NaN
#ticks = [0,20,40,60,80,100,2000,2500]

#Visualize age
train_data.age.describe() #summary
#train_data.age.plot.hist()
#plt.show()

#Clean age
train_data.loc[train_data.age > 90, 'age'] = np.nan
train_data.loc[train_data.age < 16, 'age'] = np.nan

test_data.loc[train_data.age > 90, 'age'] = np.nan
test_data.loc[train_data.age < 16, 'age'] = np.nan

#Format categorical data
categorical_features = [
    'affiliate_channel',
    'affiliate_provider',
    'country_destination',
    'first_affiliate_tracked',
    'first_browser',
    'first_device_type',
    'gender',
    'language',
    'signup_app',
    'signup_method'
]

for categorical_feature in categorical_features:
    train_data[categorical_feature] = train_data[categorical_feature].astype('category')
    if categorical_feature == 'country_destination':
        pass
    else:
        test_data[categorical_feature] = test_data[categorical_feature].astype('category')
    
#Set Date data into date format
train_data['date_account_created'] = pd.to_datetime(train_data['date_account_created'])
train_data['date_first_booking'] = pd.to_datetime(train_data['date_first_booking'])
train_data['date_first_active'] = pd.to_datetime((train_data.timestamp_first_active // 1000000), format='%Y%m%d')

test_data['date_account_created'] = pd.to_datetime(test_data['date_account_created'])
test_data['date_first_booking'] = pd.to_datetime(test_data['date_first_booking'])
test_data['date_first_active'] = pd.to_datetime((test_data.timestamp_first_active // 1000000), format='%Y%m%d')

test_data.head()

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,date_first_active
0,5uwns89zht,2014-07-01,20140701000006,NaT,FEMALE,35.0,facebook,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari,2014-07-01
1,jtl0dijy2j,2014-07-01,20140701000051,NaT,,,basic,0,en,direct,direct,untracked,Moweb,iPhone,Mobile Safari,2014-07-01
2,xx0ulgorjt,2014-07-01,20140701000148,NaT,,,basic,0,en,direct,direct,linked,Web,Windows Desktop,Chrome,2014-07-01
3,6c6puo6ix0,2014-07-01,20140701000215,NaT,,,basic,0,en,direct,direct,linked,Web,Windows Desktop,IE,2014-07-01
4,czqhjk3yfe,2014-07-01,20140701000305,NaT,,,basic,0,en,direct,direct,untracked,Web,Mac Desktop,Safari,2014-07-01


In [61]:
#print train_data.country_destination.head()

#Setting up df for training - 'age','language','signup_app','first_browser','gender'
data= pd.DataFrame()
data = pd.concat([pd.get_dummies(train_data['gender']), pd.get_dummies(train_data["signup_method"]), pd.get_dummies(train_data["first_device_type"])], axis=1)
#Need to add weibo, because it isn't in the training data
data['weibo'] = 0

#Setting up test data df
testdata= pd.DataFrame()
testdata = pd.concat([pd.get_dummies(test_data['gender']), pd.get_dummies(test_data["signup_method"]), pd.get_dummies(test_data["first_device_type"])], axis=1)

print testdata.shape, data.shape
#print list(testdata.columns.values), list(data.columns.values)

(62096, 16) (213451, 16)


In [76]:
clf = MultinomialNB()
clf.fit(data, train_data.country_destination)  
y_preds = clf.predict(testdata)

output = pd.concat([test_data.id, pd.DataFrame(y_preds, columns=["country"])], axis=1)
output.to_csv("Submission.csv",index=False)

In [None]:
#Upon submission, I get a score of 0.59294...