In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
import lightgbm as lgb

In [32]:
# Read every raw CSV into its own DataFrame (paths are relative to the
# notebook's working directory).
train_users, test_users, sessions, agegender, countries = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train_users_2.csv',
        'test_users.csv',
        'sessions.csv',
        'age_gender_bkts.csv',
        'countries.csv',
    )
)

In [41]:
# Feature frames: the raw user tables without the `id` and
# `date_first_booking` columns.
dropped_columns = ['id', 'date_first_booking']
X_train = train_users.drop(dropped_columns, axis=1)
X_test = test_users.drop(dropped_columns, axis=1)

In [42]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Label-encode every string-typed column of X_test, applying the same
# mapping to the column of the same name in X_train.
for col in X_test.columns.values:
    print(col)
    # Encoding only categorical variables; numeric columns are left untouched.
    if X_test[col].dtypes == 'object':
        # Missing values become their own 'NA' level so the encoder never
        # has to compare NaN with strings.
        X_test[col] = X_test[col].fillna(value='NA')
        X_train[col] = X_train[col].fillna(value='NA')
        # Fit on train + test together so that a level present in only one
        # of the two frames still receives a code.
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        data = pd.concat([X_train[col], X_test[col]])
        le.fit(data.values)
        X_train[col] = le.transform(X_train[col])
        X_test[col] = le.transform(X_test[col])

date_account_created
timestamp_first_active
gender
age
signup_method
signup_flow
language
affiliate_channel
affiliate_provider
first_affiliate_tracked
signup_app
first_device_type
first_browser


In [52]:
# Encode the target column as integer class codes. `le` is kept because
# write_submission uses it to map codes back to country labels.
#
# The encoder is fitted on the untouched labels in `train_users` rather than
# on X_train, so re-running this cell cannot refit it on already-encoded
# integers (which would make inverse_transform return numbers instead of
# country names).
# NOTE(review): assumes X_train still has exactly the rows of train_users, in
# the same order; a length mismatch raises ValueError on assignment.
le = LabelEncoder()
X_train['country_destination'] = le.fit_transform(
    train_users['country_destination'].values
)
  

In [53]:
# Third: Age: Remove outliers
def replace_age_outliers(ages, min_age=10, max_age=80):
    """Return a float copy of ``ages`` with implausible entries replaced.

    An entry is implausible when it is NaN, below ``min_age`` or above
    ``max_age``. Every implausible entry is replaced by the mean of the
    plausible ones. If there is no plausible entry at all, the values are
    returned unchanged (there is nothing to compute a mean from).

    Parameters
    ----------
    ages : array-like of numbers
        Raw ages; the input is not modified.
    min_age, max_age : number
        Inclusive bounds of the plausible range.

    Returns
    -------
    numpy.ndarray
        1-D float array with the same length as ``ages``.
    """
    # np.array(...) copies, so the caller's column (which may be a read-only
    # view under pandas Copy-on-Write) is never written through.
    cleaned = np.array(ages, dtype=float)
    # Comparisons with NaN are False, so NaN is automatically "not plausible".
    with np.errstate(invalid='ignore'):
        plausible = (cleaned >= min_age) & (cleaned <= max_age)
    if plausible.any():
        cleaned[~plausible] = cleaned[plausible].mean()
    return cleaned


# Each frame is imputed with the mean of its own plausible ages.
X_train['age'] = replace_age_outliers(X_train['age'])
X_test['age'] = replace_age_outliers(X_test['age'])

In [54]:
# Split the labelled data into target and features, hold out 10% of the rows,
# and score a random forest on that hold-out set.
y = X_train['country_destination']
X = X_train.drop('country_destination', axis=1)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.10, random_state=42)
# Seed the forest as well as the split: without random_state the accuracy
# (and any submission produced from `clf`) changes on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_tr, y_tr)
accuracy_score(y_te, clf.predict(X_te))

0.5667103907055187

In [55]:
def write_submission(classifier, testdata, label_encoder, filename,
                     ids=None, output_dir='./Submissions/', top_k=5):
    """Write a CSV listing the ``top_k`` most probable countries per user.

    For every row of ``testdata`` the classes are ranked by predicted
    probability, highest first, and the best ``top_k`` are written as
    consecutive ``id,country`` rows.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict_proba``
    testdata : feature matrix passed to ``classifier.predict_proba``
    label_encoder : object exposing ``inverse_transform``
        Maps probability-column indices back to country labels.
    filename : str
        Name of the CSV file created inside ``output_dir``.
    ids : sequence, optional
        One identifier per row of ``testdata``. Defaults to the global
        ``test_users['id']``.
    output_dir : str, optional
        Target directory; created if it does not exist.
    top_k : int, optional
        Maximum number of countries per user. Fewer are written when the
        classifier knows fewer classes.

    Raises
    ------
    ValueError
        If ``predict_proba`` does not return a 2-D array or the number of
        ids differs from the number of predicted rows.
    """
    import os
    import warnings

    if ids is None:
        ids = test_users['id']
    ids = np.asarray(ids)

    # Silence DeprecationWarnings only while predicting/decoding instead of
    # changing the warning filters for the rest of the session.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=DeprecationWarning)

        y_pred = np.asarray(classifier.predict_proba(testdata))
        if y_pred.ndim != 2:
            raise ValueError("predict_proba must return a 2-D array")
        if len(ids) != y_pred.shape[0]:
            raise ValueError(
                "got %d ids for %d predicted rows" % (len(ids), y_pred.shape[0])
            )

        n_best = min(top_k, y_pred.shape[1])
        # Rank the classes of each row by descending probability. The previous
        # implementation reused `i` for an inner loop, so every user was
        # ranked from the probabilities of row 4.
        ranked = np.argsort(y_pred, axis=1)[:, ::-1][:, :n_best]
        cts = label_encoder.inverse_transform(ranked.ravel())

    # Generate submission: each id is repeated once per country it receives.
    sub = pd.DataFrame({'id': np.repeat(ids, n_best), 'country': cts})
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    sub.to_csv(os.path.join(output_dir, filename), index=False)
    return

In [56]:
# Export the predictions of the classifier currently bound to `clf`.
write_submission(
    classifier=clf,
    testdata=X_test,
    label_encoder=le,
    filename='Submission_first_RF.csv',
)

In [57]:
# XGBoost model fitted on the full X / y, then exported as a submission.
xgb_settings = {
    'max_depth': 8,
    'objective': 'multi:softprob',
    'learning_rate': 0.3333,
}
clf = XGBClassifier(**xgb_settings)
clf.fit(X, y)
write_submission(clf, X_test, le, 'Submission_first_XGB.csv')

In [58]:
# LightGBM model fitted on the full X / y, then exported as a submission.
lgbm_settings = {
    'objective': 'multiclass',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'n_estimators': 20,
}
clf = lgb.LGBMClassifier(**lgbm_settings)
clf.fit(X, y)
write_submission(clf, X_test, le, 'Submission_first_LGBM.csv')