In [106]:
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame
import csv

# Read the three challenge inputs from the working directory: the training
# set, the airport-code -> country lookup table, and the test set.
df_train = pd.read_csv('prediction_challenge_train.csv')
cc = pd.read_csv('airport_country_code_mapping.csv')
df_test = pd.read_csv('prediction_challenge_test.csv')

# Baseline count: training rows whose label is 'Yes'.
print(int(df_train['Eligible_For_Discount'].eq('Yes').sum()))

26794


In [107]:
#Converting the airport/country mapping table (loaded from
#'airport_country_code_mapping.csv' into `cc`) to a dict:
#  airport country code -> country name
#set_index() returns a new frame, so `cc` itself is left untouched. Assigning
#`.index` directly on a column pulled out of `cc` can, on pandas versions
#without copy-on-write, change the column object the frame hands back later.
cc_values = cc.set_index('Airport Country Code')['Country Name']
cc_dict = cc_values.to_dict()

In [108]:
#Making new columns for convenience
# 'Airport Country': country name looked up from the airport country code
# (codes missing from cc_dict map to NaN).
df_train['Airport Country'] = df_train['Airport Country Code'].map(cc_dict)

# Split every 'month/day/year' string once and reuse the pieces. The parts are
# kept as strings exactly as they appear in the file (no padding is added or
# removed).
stored_dates = df_train['Departure Date'].values #year is always 2022
date_parts = [date.split('/') for date in stored_dates]
df_train['D Month'] = [parts[0] for parts in date_parts]
df_train['D Day'] = [parts[1] for parts in date_parts]
df_train.head()

Unnamed: 0,ID,First Name,Last Name,Gender,Age,Nationality,Airport Country Code,Departure Date,Pilot Name,Ticket Price,Eligible_For_Discount,Airport Country,D Month,D Day
0,22554,Tally,Unworth,Female,13,Indonesia,PG,12/10/2022,Gussie Ridding,1218.0,Yes,Papua New Guinea,12,10
1,42928,Bobette,Turfitt,Female,38,Thailand,CN,09/06/2022,Ursola Faudrie,688.0,Yes,China,9,6
2,26198,Karalee,Gross,Female,75,Philippines,US,03/11/2022,Kellia Bunney,824.0,Yes,United States,3,11
3,56569,Laurene,Shilton,Female,32,Poland,ID,1/19/2022,Kial McCaighey,702.0,Yes,Indonesia,1,19
4,65769,Nancy,McGuigan,Female,17,Albania,MG,3/24/2022,Pincas Lorenzetto,1120.0,Yes,Madagascar,3,24


In [109]:
#Looking at Age
# For each age extreme print two numbers: how many passengers fall in it, and
# how many of those are labelled eligible.
labelled_yes = df_train['Eligible_For_Discount'] == 'Yes'
for age_extreme in (df_train['Age'] < 5, df_train['Age'] > 85):
    print(int(age_extreme.sum()))
    print(int((age_extreme & labelled_yes).sum()))
#all extremely young (< 5) or extremely old (> 85) passengers are eligible for discount

3260
3260
4056
4056


In [110]:
#Looking at Departure Date (day & month)
# Print the Yes/No label counts per day-of-month first, then per month.
for date_part in ('D Day', 'D Month'):
    print(df_train.groupby([date_part, 'Eligible_For_Discount']).size())
#day table: length = 62, so each day has 2 subgroups (none are completely 'Yes' or 'No')
#month table: all flights with departure dates in Jan ('D Month' is 1 or 01) or Dec ('D Month' is 12) are eligible for discount

D Day  Eligible_For_Discount
01     No                       828
       Yes                      855
02     No                       757
       Yes                      879
03     No                       843
                               ... 
29     Yes                      779
30     No                       731
       Yes                      813
31     No                       393
       Yes                      440
Length: 62, dtype: int64
D Month  Eligible_For_Discount
01       Yes                      2409
02       No                        958
         Yes                       548
03       No                        945
         Yes                       564
04       No                       1010
         Yes                       595
05       No                        985
         Yes                       590
06       No                       1009
         Yes                       551
07       No                       1005
         Yes                       549
08       No 

In [111]:
#Looking at Airport Country (derived from Airport Country Code)
# Count passengers whose nationality equals the airport's country, then how
# many of those carry the 'Yes' label.
same_country = df_train['Airport Country'] == df_train['Nationality']
same_country_yes = same_country & (df_train['Eligible_For_Discount'] == 'Yes')
print(int(same_country.sum()))
print(int(same_country_yes.sum()))
#all passengers whose nationality matches the airport country are eligible for discount

1500
1500


In [112]:
#Drop rows whose eligibility reasons are already accounted for
# Every subset below is taken from what is left after the previous removals,
# so a row lands in at most one of young / old / jan / dec / acn_match.
young = df_train[df_train['Age'] < 5]
df_train = df_train.drop(index=young.index)

old = df_train[df_train['Age'] > 85]
df_train = df_train.drop(index=old.index)

# January appears both as '1' and as '01' in the raw date strings.
jan = df_train[df_train['D Month'].isin(['1', '01'])]
df_train = df_train.drop(index=jan.index)

dec = df_train[df_train['D Month'] == '12']
df_train = df_train.drop(index=dec.index)

acn_match = df_train[df_train['Airport Country'] == df_train['Nationality']]
df_train = df_train.drop(index=acn_match.index)

# Eligible rows that none of the rules above explain.
print(int(df_train['Eligible_For_Discount'].eq('Yes').sum())) #7290 left

7290


In [113]:
#Looking at Gender & Ticket Price
# Split the remaining training rows by gender.
df_train_f = df_train.loc[df_train['Gender'].eq('Female')]
df_train_m = df_train.loc[df_train['Gender'].eq('Male')]
# Number of remaining male passengers labelled 'Yes'.
print(int(df_train_m['Eligible_For_Discount'].eq('Yes').sum())) #= 0, only need to look at female passengers
# Display the remaining eligible female passengers for inspection.
df_train_f.loc[df_train_f['Eligible_For_Discount'].eq('Yes')] #ticket prices are all even

0


Unnamed: 0,ID,First Name,Last Name,Gender,Age,Nationality,Airport Country Code,Departure Date,Pilot Name,Ticket Price,Eligible_For_Discount,Airport Country,D Month,D Day
1,42928,Bobette,Turfitt,Female,38,Thailand,CN,09/06/2022,Ursola Faudrie,688.0,Yes,China,09,06
2,26198,Karalee,Gross,Female,75,Philippines,US,03/11/2022,Kellia Bunney,824.0,Yes,United States,03,11
4,65769,Nancy,McGuigan,Female,17,Albania,MG,3/24/2022,Pincas Lorenzetto,1120.0,Yes,Madagascar,3,24
7,84796,Teena,Ciccottini,Female,65,Philippines,BR,4/21/2022,Read Widdocks,540.0,Yes,Brazil,4,21
17,14666,Euphemia,Boase,Female,70,Czech Republic,PG,7/25/2022,Novelia Harris,1500.0,Yes,Papua New Guinea,7,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51436,56146,Adelle,Raiker,Female,36,Indonesia,US,10/18/2022,Salli Cruft,1104.0,Yes,United States,10,18
51441,90113,Rene,Maffi,Female,53,New Zealand,PK,10/14/2022,Gayle Ide,740.0,Yes,Pakistan,10,14
51443,87270,Tina,Paiton,Female,17,Indonesia,CR,10/01/2022,Francesco Gremane,808.0,Yes,Costa Rica,10,01
51446,9723,Binny,Ticic,Female,17,Indonesia,CN,11/03/2022,Meggie Colquitt,688.0,Yes,China,11,03


In [114]:
#Gender & Ticket Price, cont.
# Compare how many remaining female passengers have a ticket price divisible
# by 2 and by 4 against how many of them are labelled eligible.
price_f = df_train_f['Ticket Price']
divisible_by_2 = price_f.mod(2).eq(0)
divisible_by_4 = price_f.mod(4).eq(0)
labelled_yes_f = df_train_f['Eligible_For_Discount'].eq('Yes')
print(int(divisible_by_2.sum())) #greater amount than eligible rows, condition is not strict enough
print(int(divisible_by_4.sum())) #same amount as eligible rows
print(int((divisible_by_4 & labelled_yes_f).sum()))
#all female passengers with a ticket price divisible by 4 are eligible for discount

10848
7290
7290


In [115]:
#Applying model to test data

#making new columns for convenience
#(same derivation as for the training frame; listed here so they can be
# removed by name before export)
helper_cols = ['Airport Country', 'D Month', 'D Day']
df_test['Airport Country'] = df_test['Airport Country Code'].map(cc_dict)
stored_dates_2 = df_test['Departure Date'].values
date_parts_2 = [date.split('/') for date in stored_dates_2]
df_test['D Month'] = [parts[0] for parts in date_parts_2]
df_test['D Day'] = [parts[1] for parts in date_parts_2]

#filling in Eligible_For_Discount
#a row is predicted 'Yes' when ANY rule found in the training data applies:
#  - age below 5 or above 85
#  - departure month is January ('1' or '01') or December ('12')
#  - airport country equals the passenger's nationality
#  - female passenger with a ticket price divisible by 4
is_eligible = (
    (df_test['Age'] < 5)
    | (df_test['Age'] > 85)
    | df_test['D Month'].isin(['1', '01', '12'])
    | (df_test['Airport Country'] == df_test['Nationality'])
    | ((df_test['Gender'] == 'Female') & (df_test['Ticket Price'] % 4 == 0))
)
#object dtype keeps np.where below from coercing missing values into strings
predicted = np.where(is_eligible, 'Yes', 'No').astype(object)
#only missing labels are filled; any value already present is left as it is
existing = df_test['Eligible_For_Discount']
df_test['Eligible_For_Discount'] = np.where(existing.isna(), predicted, existing)

#removing created cols
#dropped by name rather than by position, so the result does not depend on
#how many columns the input CSV happens to have
df_test = df_test.drop(columns=helper_cols)

#export
#NOTE(review): the index is written as an extra unnamed first column, as
#before - confirm that this is the expected submission format
df_test.to_csv('jqy2_prediction_challenge_test.csv')