In [1]:
import numpy as np
import pandas as pd
import datetime

In [2]:
# Load the pandas dataframe
df_train = pd.read_csv('../Data/Data/train_users_2.csv')
df_test = pd.read_csv('../Data/Data/test_users.csv')

In [3]:
#Sanity Check and Property Check
print(df_train.head())
print('-----------')
print(df_train.info())
print(df_test.info())

           id date_account_created  timestamp_first_active date_first_booking  \
0  gxn3p5htnn           2010-06-28          20090319043255                NaN   
1  820tgsjxq7           2011-05-25          20090523174809                NaN   
2  4ft3gnwmtx           2010-09-28          20090609231247         2010-08-02   
3  bjjt8pjhuk           2011-12-05          20091031060129         2012-09-08   
4  87mebub9p4           2010-09-14          20091208061105         2010-02-18   

      gender   age signup_method  signup_flow language affiliate_channel  \
0  -unknown-   NaN      facebook            0       en            direct   
1       MALE  38.0      facebook            0       en               seo   
2     FEMALE  56.0         basic            3       en            direct   
3     FEMALE  42.0      facebook            0       en            direct   
4  -unknown-  41.0         basic            0       en            direct   

  affiliate_provider first_affiliate_tracked signup_app 

In [4]:
# Labels of the data
labels = df_train['country_destination']
print(labels.head())

# Dropping columns that are not required
df_train = df_train.drop(columns=['country_destination'])
df_combined = pd.concat([df_train,df_test])
df_combined = df_combined.drop(columns=['date_first_booking','id'])
print(df_combined.info())

0      NDF
1      NDF
2       US
3    other
4       US
Name: country_destination, dtype: object
<class 'pandas.core.frame.DataFrame'>
Int64Index: 275547 entries, 0 to 62095
Data columns (total 13 columns):
date_account_created       275547 non-null object
timestamp_first_active     275547 non-null int64
gender                     275547 non-null object
age                        158681 non-null float64
signup_method              275547 non-null object
signup_flow                275547 non-null int64
language                   275547 non-null object
affiliate_channel          275547 non-null object
affiliate_provider         275547 non-null object
first_affiliate_tracked    269462 non-null object
signup_app                 275547 non-null object
first_device_type          275547 non-null object
first_browser              275547 non-null object
dtypes: float64(1), int64(2), object(10)
memory usage: 29.4+ MB
None


In [5]:
# Feature Engineering for each column

# First: Date Account Created
df_combined['date_account_created'] = pd.to_datetime(df_combined['date_account_created'], format= "%Y-%m-%d")
df_combined['year_account_created'] = df_combined['date_account_created'].dt.year
df_combined['month_account_created'] = df_combined['date_account_created'].dt.month
df_combined['week_account_created'] = df_combined['date_account_created'].dt.week
df_combined['weekday_account_created'] = df_combined['date_account_created'].dt.weekday
df_combined['day_account_created'] = df_combined['date_account_created'].dt.day
df_combined = df_combined.drop(columns=['date_account_created'])

In [6]:
# Second: Timestamp_first_active
def preprocess_timestamp(timestamp):
    timestamp = str(timestamp)
    timestamp = timestamp[0:8]
    return pd.to_datetime(timestamp,format='%Y%m%d')

df_combined['timestamp_first_active'] = df_combined['timestamp_first_active'].apply(preprocess_timestamp)
df_combined['year_first_used'] = df_combined['timestamp_first_active'].dt.year
df_combined['month_first_used'] = df_combined['timestamp_first_active'].dt.month
df_combined['week_first_used'] = df_combined['timestamp_first_active'].dt.week
df_combined['weekday_first_used'] = df_combined['timestamp_first_active'].dt.weekday
df_combined['day_first_used'] = df_combined['timestamp_first_active'].dt.day
df_combined = df_combined.drop(columns=['timestamp_first_active'])

In [7]:
# Third: Age: Remove outliers
age_values = df_combined['age'].values
mean_age = 0
count_age = 0
for i in range(np.shape(age_values)[0]):
    if age_values[i]<16 or age_values[i] >80 or np.isnan(age_values[i]):
        continue;
    mean_age+=age_values[i]
    count_age+=1
mean_age = mean_age/count_age
for i in range(np.shape(age_values)[0]):
    if age_values[i]<16 or age_values[i] >80 or np.isnan(age_values[i]):
        age_values[i] = mean_age

df_combined['age'] = age_values

In [8]:
# Label Encoding of other columns
categorical_features = [
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'first_browser',
    'first_device_type',
    'gender',
    'language',
    'signup_app',
    'signup_method',
    'signup_flow'
]
df_combined['first_affiliate_tracked'] = df_combined['first_affiliate_tracked'].fillna(value='untracked')

for categorical_feature in categorical_features:
    label_encoder = LabelEncoder()
    df_combined[categorical_feature] = label_encoder.fit_transform(df_combined[categorical_feature].values)

NameError: name 'LabelEncoder' is not defined

In [None]:
print(categorical_feature)

In [None]:
#Preparing Data for Classifiers

from sklearn.preprocessing import LabelEncoder
from xgboost.sklearn import XGBClassifier

train_size = len(labels)
X_train = df_combined.values[:train_size]
X_test = df_combined.values[train_size:]

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(labels)



In [None]:
#Classifiers 

xgb = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=42,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0) 

xgb.fit(X_train, y_train)

In [None]:
def write_submission(classifier,testdata,label_encoder,filename):
    
    y_pred = classifier.predict_proba(testdata)
    id_test = df_test['id']
    ids = []  #list of ids
    cts = []  #list of countries
    
    for i in range(len(id_test)):
        idx = id_test[i]
        for i in range(5): ids.append(idx)
        cts += (label_encoder.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist())

        #Generate submission
    sub = pd.DataFrame(np.column_stack((ids, cts)), columns=['id', 'country'])
    sub.to_csv('./Submissions/'+filename,index=False)
    return 


In [None]:
write_submission(xgb,X_test,label_encoder,'Submission1')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=42,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)

clf4 = KNeighborsClassifier(n_neighbors=1)


voting_classifiers = VotingClassifier(estimators=[ ('lr', clf1), ('rf', clf2), ('xgb', clf3)], voting='soft')
voting_classifiers.fit(X_train,y_train)
np.shape(X_train)

In [None]:
write_submission(voting_classifiers,X_test,label_encoder,'Submission_Voting.csv')

In [None]:
from mlxtend.classifier import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = XGBClassifier(max_depth=6, learning_rate=0.25, n_estimators=42,
                    objective='multi:softprob', subsample=0.6, colsample_bytree=0.6, seed=0)

clf4 = KNeighborsClassifier(n_neighbors=1)

stacking_classifier = StackingClassifier(classifiers=[clf1, clf2, clf3,clf4], meta_classifier=clf3,use_probas=True,
                          average_probas=False)
stacking_classifier.fit(X_train,y_train)

In [None]:
write_submission(stacking_classifier,X_test,label_encoder,'Submission_Stacking_Probas.csv')