In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from datetime import timedelta 

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
#import data from CSVs
train = pd.read_csv('/content/drive/My Drive/DS310/ProjectCsv/train_users.csv')
test = pd.read_csv('/content/drive/My Drive/DS310/ProjectCsv/test_users.csv')
sessions = pd.read_csv('/content/drive/My Drive/DS310/ProjectCsv/sessions.csv')

In [4]:
#We want the test ids for the end prediction
test_id = test['id']

In [5]:
#We want the labels as they are what we are predicting so we will test with them
labels = train['country_destination']
#We dont want the country destinations in the set
train = train.drop_duplicates()
test = test.drop_duplicates()
train = train.drop(columns = ['country_destination'])
#We want to combine the train and test sets so we need keep track of which is which
train['tt'] = 0
test['tt'] = 1
userdata = train.append(test)
#we want to drop the ids since the test ones were logged already and likewise, the date_first_booking holds minimal signifigance since none of the test set has values
userdata = userdata.drop(columns = ['date_first_booking'])



In [6]:
#date_account created needs to be separated
userdata['date_account_created'] = pd.to_datetime(userdata['date_account_created']) 
userdata['dac_year'] = userdata['date_account_created'].dt.year
userdata['dac_day'] = userdata['date_account_created'].dt.day
userdata['dac_month'] = userdata['date_account_created'].dt.month
userdata = userdata.drop(columns= ['date_account_created'])

#create dummy variable for the month into season
userdata['spring'] = userdata.dac_month.apply(lambda x: 1 if 2<x<6 else 0)
userdata['summer'] = userdata.dac_month.apply(lambda x: 1 if 5<x<9 else 0)
userdata['fall'] = userdata.dac_month.apply(lambda x: 1 if 8<x<12 else 0)
userdata['winter'] = userdata.dac_month.apply(lambda x: 1 if x<3 or x>11 else 0)

#clean the age data because there are skewed values and the unknown value needs to be set in manor that matters
userdata['age'] = userdata['age'].apply(lambda x: x if x < 105 else np.random.randint(28, 43))
userdata['age'] = userdata['age'].apply(lambda x: x if x > 10 else np.random.randint(10, 18))
userdata['age'] = userdata['age'].fillna(np.random.randint(28, 43))

#bucket age a bit
userdata['young_u18'] = userdata['age'].apply(lambda x: 1 if x<18 else 0)
userdata['young_u25'] = userdata['age'].apply(lambda x: 1 if 17<x<25 else 0)
userdata['young_u28'] = userdata['age'].apply(lambda x: 1 if 24<x<28 else 0)
userdata['young_u33'] = userdata['age'].apply(lambda x: 1 if 27<x<33 else 0)
userdata['young_u38'] = userdata['age'].apply(lambda x: 1 if 32<x<38 else 0)
userdata['young_u45'] = userdata['age'].apply(lambda x: 1 if 37<x<45 else 0)
userdata['young_u55'] = userdata['age'].apply(lambda x: 1 if 44<x<55 else 0)
userdata['young_u70'] = userdata['age'].apply(lambda x: 1 if 54<x<71 else 0)
userdata['young_g70'] = userdata['age'].apply(lambda x: 1 if 70<x<106 else 0)

In [7]:
#Manipulate timestamp data
userdata['timestamp_first_active'] = userdata['timestamp_first_active'].astype(str).str[0:10]

userdata['active_time_year'] = userdata['timestamp_first_active'].astype(str).str[:4]
userdata['active_time_year'] = userdata['active_time_year'].astype('int64')

userdata['active_time_month'] = userdata['timestamp_first_active'].str[4:6]
userdata['active_time_month'] = userdata['active_time_month'].astype('int64')

userdata['spring_act'] = userdata.active_time_month.apply(lambda x: 1 if 2<x<6 else 0)
userdata['summer_act'] = userdata.active_time_month.apply(lambda x: 1 if 5<x<9 else 0)
userdata['fall_act'] = userdata.active_time_month.apply(lambda x: 1 if 8<x<12 else 0)
userdata['winter_act'] = userdata.active_time_month.apply(lambda x: 1 if x<3 or x>11 else 0)

userdata['active_time_day'] = userdata['timestamp_first_active'].str[6:8]
userdata['active_time_day'] = userdata['active_time_day'].astype('int64')

userdata['active_time_hour'] = userdata['timestamp_first_active'].str[8:10]
userdata['active_time_hour'] = userdata['active_time_hour'].astype('int64')

userdata = userdata.drop(columns = 'timestamp_first_active')

In [8]:
#create dummy variables for langauge and normalize
userdata['language'] = userdata['language'].fillna('')
userdata['orig_lang_eng'] = userdata.language.apply(lambda x: 1 if "en" in x else 0)
userdata['orig_lang_deu'] = userdata.language.apply(lambda x: 1 if "de" in x else 0)
userdata['orig_lang_sp'] = userdata.language.apply(lambda x: 1 if "es" in x else 0)
userdata['orig_lang_fr'] = userdata.language.apply(lambda x: 1 if "fr" in x else 0)
userdata['orig_lang_it'] = userdata.language.apply(lambda x: 1 if "it" in x else 0)
userdata['orig_lang_nld'] = userdata.language.apply(lambda x: 1 if "nl" in x else 0)
userdata['orig_lang_pt'] = userdata.language.apply(lambda x: 1 if "pt" in x else 0)
#language is accounted for and can be removed
userdata = userdata.drop(columns = ['language'])

In [9]:
#Creat values for gender that are numbers
userdata['female'] = userdata.gender.apply(lambda x: 1 if "FEMALE" in x else 0)
userdata['male'] = userdata.gender.apply(lambda x: 1 if "MALE" in x else 0)
userdata['unkownn_gen'] = userdata.gender.apply(lambda x: 1 if "-unknown-" in x else 0)
userdata['other_gen'] = userdata.gender.apply(lambda x: 1 if "other" in x else 0)
userdata = userdata.drop(columns = ['gender'])

In [10]:
#The following code blocks are to remove all categorical varibles from the userdata
# signup_app..... this hurt the model for some reason so we removed it
userdata = userdata.drop(columns= ['signup_app'])

In [11]:
#signup_method
signup_m = {'facebook' : 1,
                     'google' : 2,
                     'basic' : 3,
                     'weibo' : 4}
userdata['signup_method'] = userdata.signup_method.apply(lambda x: signup_m[x])
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['signup_method'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['signup_method'] = devscaled

In [12]:
#Brower used
browser = {'Chrome':1,
'Safari':2,
'Firefox':3,
'-unknown-':4,
'IE':5,
'Mobile Safari':6,
'Chrome Mobile':7,
'Android Browser':8,
'AOL Explorer':9,
'Opera':10,
'Silk':11,
'Chromium':12,
'BlackBerry Browser':13,
'Maxthon':14,
'IE Mobile':15,
'Apple Mail':16,
'Sogou Explorer':17,
'Mobile Firefox':18,
'RockMelt':19,
'SiteKiosk':20,
'Iron':21,
'IceWeasel':22,
'Pale Moon':23,
'SeaMonkey':24,
'Yandex.Browser':25,
'CometBird':26,
'Camino':27,
'TenFourFox':28,
'wOSBrowser':29,
'CoolNovo':30,
'Avant Browser':31,
'Opera Mini':32,
'Mozilla':33,
'Comodo Dragon':34,
'TheWorld Browser':35,
'Crazy Browser':36,
'Flock':37,
'OmniWeb':38,
'SlimBrowser':39,
'Opera Mobile':40,
'Conkeror':41,
'Outlook 2007':42,
'Palm Pre web browser':43,
'Stainless':44,
'NetNewsWire':45,
'Kindle Browser':46,
'Epic':47,
'Googlebot':48,
'Arora':49,
'Google Earth':50,
'IceDragon':51,
'PS Vita browser':52,
'IBrowse' : 53,
'UC Browser' : 54,
'IBrowse': 55,
'Nintendo Browser' : 56}
userdata['first_browser'] = userdata.first_browser.apply(lambda x: browser[x])
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['first_browser'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['first_browser'] = devscaled

In [13]:
#provider
aff_provide = {'direct':1,
'google':2,
'other':3,
'craigslist':4,
'bing':5,
'facebook':6,
'vast':7,
'padmapper':8,
'facebook-open-graph':9,
'yahoo':10,
'gsp':11,
'meetup':12,
'email-marketing':13,
'naver':14,
'baidu':15,
'yandex':16,
'wayn':17,
'daum':18}
userdata['affiliate_provider'] = userdata.affiliate_provider.apply(lambda x: aff_provide[x])
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['affiliate_provider'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['affiliate_provider'] = devscaled

In [14]:
#device type
device_types = {'Android Phone' : 1,
                     'Android Tablet' : 2,
                     'Desktop (Other)' : 3,
                     'iPad' : 4,
                     'iPhone': 5,
                     'Mac Desktop': 6,
                     'Other/Unknown': 7,
                     'SmartPhone (Other)': 8,
                     'Windows Desktop': 9}
userdata['first_device_type'] = userdata.first_device_type.apply(lambda x: device_types[x])
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['first_device_type'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['first_device_type'] = devscaled

In [15]:
#aff tracked
aff_track = {'untracked' : 1,
                                   'linked' : 2,
                                   'omg' : 3,
                                   'tracked-other' : 4,
                                   'product' : 5,
                                   'marketing' : 6,
                                   'local ops' : 7,
                                    0: 0}
userdata['first_affiliate_tracked'] = userdata['first_affiliate_tracked'].fillna(0)
userdata['first_affiliate_tracked'] = userdata.first_affiliate_tracked.apply(lambda x: aff_track[x])
devscaled = pd.DataFrame(userdata['first_affiliate_tracked'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['first_affiliate_tracked'] = devscaled

In [16]:
#channel
aff_channel = {'direct' : 1,
               'sem-brand' : 2,
               'sem-non-brand' : 3,
               'other' : 4,
               'api' : 5,
               'seo' : 6,
               'content' : 7,
               'remarketing' : 8}
userdata['affiliate_channel'] = userdata.affiliate_channel.apply(lambda x: aff_channel[x])
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['affiliate_channel'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['affiliate_channel'] = devscaled

In [17]:
#normalize flow
min_max_scaler = preprocessing.MinMaxScaler()
devscaled = pd.DataFrame(userdata['signup_flow'])
devscaled = min_max_scaler.fit_transform(devscaled)
userdata['signup_flow'] = devscaled

In [18]:
#Check what data looks like
userdata

Unnamed: 0,id,age,signup_method,signup_flow,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,tt,dac_year,dac_day,dac_month,spring,summer,fall,winter,young_u18,young_u25,young_u28,young_u33,young_u38,young_u45,young_u55,young_u70,young_g70,active_time_year,active_time_month,spring_act,summer_act,fall_act,winter_act,active_time_day,active_time_hour,orig_lang_eng,orig_lang_deu,orig_lang_sp,orig_lang_fr,orig_lang_it,orig_lang_nld,orig_lang_pt,female,male,unkownn_gen,other_gen
0,gxn3p5htnn,40.0,0.000000,0.00,0.000000,0.000000,0.142857,0.625,0.000000,0,2010,28,6,0,1,0,0,0,0,0,0,0,1,0,0,0,2009,3,1,0,0,0,19,4,1,0,0,0,0,0,0,0,0,1,0
1,820tgsjxq7,38.0,0.000000,0.00,0.714286,0.058824,0.142857,0.625,0.000000,0,2011,25,5,1,0,0,0,0,0,0,0,0,1,0,0,0,2009,5,1,0,0,0,23,17,1,0,0,0,0,0,0,0,1,0,0
2,4ft3gnwmtx,56.0,0.666667,0.12,0.000000,0.000000,0.142857,1.000,0.072727,0,2010,28,9,0,0,1,0,0,0,0,0,0,0,0,1,0,2009,6,0,1,0,0,9,23,1,0,0,0,0,0,0,1,1,0,0
3,bjjt8pjhuk,42.0,0.000000,0.00,0.000000,0.000000,0.142857,0.625,0.036364,0,2011,5,12,0,0,0,1,0,0,0,0,0,1,0,0,0,2009,10,0,0,1,0,31,6,1,0,0,0,0,0,0,1,1,0,0
4,87mebub9p4,41.0,0.666667,0.00,0.000000,0.000000,0.142857,0.625,0.000000,0,2010,14,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2009,12,0,0,0,1,8,6,1,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62091,cv0na2lf5a,31.0,0.666667,0.00,0.000000,0.000000,0.142857,1.000,0.072727,1,2014,30,9,0,0,1,0,0,0,0,1,0,0,0,0,0,2014,9,0,0,1,0,30,23,1,0,0,0,0,0,0,0,0,1,0
62092,zp8xfonng8,41.0,0.666667,0.92,0.000000,0.000000,0.142857,0.000,0.054545,1,2014,30,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2014,9,0,0,1,0,30,23,0,0,0,0,0,0,0,0,0,1,0
62093,fa6260ziny,38.0,0.666667,0.00,0.000000,0.000000,0.285714,1.000,0.036364,1,2014,30,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2014,9,0,0,1,0,30,23,0,1,0,0,0,0,0,0,0,1,0
62094,87k0fy4ugm,32.0,0.666667,0.00,0.142857,0.058824,0.428571,0.625,0.018182,1,2014,30,9,0,0,1,0,0,0,0,1,0,0,0,0,0,2014,9,0,0,1,0,30,23,1,0,0,0,0,0,0,0,0,1,0


In [19]:
#What does sessions look like....
sessions

Unnamed: 0,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,lookup,,,Windows Desktop,319.0
1,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,67753.0
2,d1mm9tcy42,lookup,,,Windows Desktop,301.0
3,d1mm9tcy42,search_results,click,view_search_results,Windows Desktop,22141.0
4,d1mm9tcy42,lookup,,,Windows Desktop,435.0
...,...,...,...,...,...,...
10567732,9uqfg8txu3,dashboard,view,dashboard,Windows Desktop,556.0
10567733,9uqfg8txu3,edit,view,edit_profile,Windows Desktop,6624.0
10567734,9uqfg8txu3,webcam_upload,-unknown-,-unknown-,Windows Desktop,200125.0
10567735,9uqfg8txu3,active,-unknown-,-unknown-,-unknown-,17624.0


In [20]:
#Use action type because it seems like everything else will crash my computer due to its size...which it did many times when I tried
sessions2 = sessions[['user_id', 'action_type']]
sessions2 = sessions2.pivot_table(index='user_id', 
               columns='action_type',
               aggfunc='size')

In [21]:
#Remove all NaN values from the set
sessions2 = sessions2.fillna(0)

In [22]:
#Make the id column a regular column because it makes joining easier
sessions2.index.names = ["id"]
sessions2.reset_index(inplace=True)

In [23]:
#join sessions data to userdata table 
userdata = userdata.merge(sessions2, how='left', on=['id']).fillna(0)

In [24]:
#check userdata table with sessions data
userdata

Unnamed: 0,id,age,signup_method,signup_flow,affiliate_channel,affiliate_provider,first_affiliate_tracked,first_device_type,first_browser,tt,dac_year,dac_day,dac_month,spring,summer,fall,winter,young_u18,young_u25,young_u28,young_u33,young_u38,young_u45,young_u55,young_u70,young_g70,active_time_year,active_time_month,spring_act,summer_act,fall_act,winter_act,active_time_day,active_time_hour,orig_lang_eng,orig_lang_deu,orig_lang_sp,orig_lang_fr,orig_lang_it,orig_lang_nld,orig_lang_pt,female,male,unkownn_gen,other_gen,-unknown-,booking_request,booking_response,click,data,message_post,modify,partner_callback,submit,view
0,gxn3p5htnn,40.0,0.000000,0.00,0.000000,0.000000,0.142857,0.625,0.000000,0,2010,28,6,0,1,0,0,0,0,0,0,0,1,0,0,0,2009,3,1,0,0,0,19,4,1,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,820tgsjxq7,38.0,0.000000,0.00,0.714286,0.058824,0.142857,0.625,0.000000,0,2011,25,5,1,0,0,0,0,0,0,0,0,1,0,0,0,2009,5,1,0,0,0,23,17,1,0,0,0,0,0,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4ft3gnwmtx,56.0,0.666667,0.12,0.000000,0.000000,0.142857,1.000,0.072727,0,2010,28,9,0,0,1,0,0,0,0,0,0,0,0,1,0,2009,6,0,1,0,0,9,23,1,0,0,0,0,0,0,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bjjt8pjhuk,42.0,0.000000,0.00,0.000000,0.000000,0.142857,0.625,0.036364,0,2011,5,12,0,0,0,1,0,0,0,0,0,1,0,0,0,2009,10,0,0,1,0,31,6,1,0,0,0,0,0,0,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,87mebub9p4,41.0,0.666667,0.00,0.000000,0.000000,0.142857,0.625,0.000000,0,2010,14,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2009,12,0,0,0,1,8,6,1,0,0,0,0,0,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275542,cv0na2lf5a,31.0,0.666667,0.00,0.000000,0.000000,0.142857,1.000,0.072727,1,2014,30,9,0,0,1,0,0,0,0,1,0,0,0,0,0,2014,9,0,0,1,0,30,23,1,0,0,0,0,0,0,0,0,1,0,6.0,0.0,0.0,1.0,8.0,0.0,0.0,0.0,33.0,36.0
275543,zp8xfonng8,41.0,0.666667,0.92,0.000000,0.000000,0.142857,0.000,0.054545,1,2014,30,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2014,9,0,0,1,0,30,23,0,0,0,0,0,0,0,0,0,1,0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,11.0
275544,fa6260ziny,38.0,0.666667,0.00,0.000000,0.000000,0.285714,1.000,0.036364,1,2014,30,9,0,0,1,0,0,0,0,0,0,1,0,0,0,2014,9,0,0,1,0,30,23,0,1,0,0,0,0,0,0,0,1,0,4.0,0.0,0.0,20.0,20.0,0.0,0.0,1.0,2.0,18.0
275545,87k0fy4ugm,32.0,0.666667,0.00,0.142857,0.058824,0.428571,0.625,0.018182,1,2014,30,9,0,0,1,0,0,0,0,1,0,0,0,0,0,2014,9,0,0,1,0,30,23,1,0,0,0,0,0,0,0,0,1,0,1.0,0.0,0.0,2.0,5.0,0.0,0.0,0.0,3.0,3.0


In [25]:
#return data to training and testing sets, remove the testing/training ids
train = userdata.loc[userdata['tt'] == 0]
train = train.drop(columns = ['id'])

test = userdata.loc[userdata['tt'] == 1]
test = test.drop(columns = ['id'])

In [26]:
#check if the sets have null values
train.isna().sum()

age                        0
signup_method              0
signup_flow                0
affiliate_channel          0
affiliate_provider         0
first_affiliate_tracked    0
first_device_type          0
first_browser              0
tt                         0
dac_year                   0
dac_day                    0
dac_month                  0
spring                     0
summer                     0
fall                       0
winter                     0
young_u18                  0
young_u25                  0
young_u28                  0
young_u33                  0
young_u38                  0
young_u45                  0
young_u55                  0
young_u70                  0
young_g70                  0
active_time_year           0
active_time_month          0
spring_act                 0
summer_act                 0
fall_act                   0
winter_act                 0
active_time_day            0
active_time_hour           0
orig_lang_eng              0
orig_lang_deu 

In [27]:
test.isna().sum()

age                        0
signup_method              0
signup_flow                0
affiliate_channel          0
affiliate_provider         0
first_affiliate_tracked    0
first_device_type          0
first_browser              0
tt                         0
dac_year                   0
dac_day                    0
dac_month                  0
spring                     0
summer                     0
fall                       0
winter                     0
young_u18                  0
young_u25                  0
young_u28                  0
young_u33                  0
young_u38                  0
young_u45                  0
young_u55                  0
young_u70                  0
young_g70                  0
active_time_year           0
active_time_month          0
spring_act                 0
summer_act                 0
fall_act                   0
winter_act                 0
active_time_day            0
active_time_hour           0
orig_lang_eng              0
orig_lang_deu 

In [28]:
#drop the train vs test classifier 
train = train.drop(columns = 'tt')
test = test.drop(columns = 'tt')

In [29]:
#for evaluation reasons the sets need to be arrays
trainarr = train.values
len(trainarr)

213451

In [30]:
#See above
testarr = test.values
len(testarr)

62096

In [31]:
#This is a classification problem so using regressors wont provide the right type of modeling
#Tried using linear, logistic, regular decision tree, and adaboost but they didn't provide the highest ncdg score
#Using Randomforestclassifier worked the best
#Tried using CV for hyperparameter tuning but it crashed due to memory load of fitting so they were picked by testing them individually
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
labelenc = LabelEncoder()
y_train = labelenc.fit_transform(labels) 
model = RandomForestClassifier(n_estimators=150, criterion = 'entropy')                
model.fit(trainarr, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=150,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
#Since we want the top five predictions we need to use predict.proba and then manipulate the output to assign them to the test_ids
y_pred = model.predict_proba(testarr)

testid_list = []
countries = []
for i in range(len(test_id)):
    curr_id = test_id[i]
    testid_list += [curr_id] * 5
    countries += labelenc.inverse_transform(np.argsort(y_pred[i])[::-1])[:5].tolist()

id_df = pd.DataFrame(testid_list)
country_df = pd.DataFrame(countries)
sub = pd.DataFrame(list(zip(testid_list, countries)))
sub.columns = ['ID', 'Country']

In [33]:
sub

Unnamed: 0,ID,Country
0,5uwns89zht,NDF
1,5uwns89zht,US
2,5uwns89zht,other
3,5uwns89zht,FR
4,5uwns89zht,CA
...,...,...
310475,9uqfg8txu3,NDF
310476,9uqfg8txu3,US
310477,9uqfg8txu3,other
310478,9uqfg8txu3,IT


In [34]:
#.67 .... .68... .69... .74... .75... .87
sub.to_csv('Submission.csv', index=False)
!cp Submission.csv "drive/My Drive/"