In [1]:
import pandas as pd 
import numpy as np 
from scipy import stats
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import csr_matrix

import xgboost as xgb
from xgboost import XGBClassifier 
from xgboost import XGBRegressor
from xgboost import plot_importance
import matplotlib.pylab as plt
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train_dataset = pd.read_csv("HackRiceForeFlightData_Train.csv")

In [72]:
train_dataset.head()

Unnamed: 0,user_id,airport,number_of_flights,airport_type,number_fbos,number_restaurants,number_hotels,number_maintenance,number_flight_schools,number_flying_clubs,name,latitude,longitude
0,user_31441,KSAT,6,AIRPORT,6,15,10,9,7,0,San Antonio International,29.533958,-98.469057
1,user_31441,KROW,3,AIRPORT,2,15,10,1,1,0,Roswell International Air Center,33.29987,-104.529398
2,user_31441,KMAF,2,AIRPORT,1,12,8,1,1,0,Midland International Air and Space Port,31.942528,-102.201917
3,user_31441,KCRP,1,AIRPORT,0,15,13,1,1,1,Corpus Christi International,27.772194,-97.502417
4,user_31441,KLBB,1,AIRPORT,0,16,10,3,5,1,Lubbock Preston Smith International,33.663667,-101.820556


In [3]:
test_dataset = pd.read_csv("HackRiceForeFlightData_Test.csv")

In [38]:
df_airport_features_pivot = train_dataset.pivot_table(
    index='user_id',
    columns='airport',
    values='number_of_flights'
).fillna(0)
#df_airport_features_pivot = df_airport_features_pivot.drop(columns=['00AL'])

In [91]:
X_train = df_airport_features_pivot

In [92]:
X_train.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [93]:
df_airport_features_pivot.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
df_airport_features_pivot.index.size

39750

In [42]:
df_airport_features_pivot.loc[['user_0']]

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [135]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1)
model_knn.fit(X_train)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=50, p=2, radius=1.0)

In [136]:
KNearestNeighbors = model_knn.kneighbors([df_airport_features_pivot.loc['user_31411']])

ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 5397 while Y.shape[1] == 5398

In [141]:
def findAverages(KNearestNeighbors, X_train):
    listOfAverages = []
    for airport in X_train.columns:
        counter = 1
        totalRating = 0
        for index in KNearestNeighbors:
            neighbor = X_train.iloc[index] 
            if(neighbor[airport] > 0.1):
                totalRating += neighbor[airport]
                counter += 1
        averageRating = totalRating//counter
        listOfAverages.append([averageRating,airport])
    return sorted(listOfAverages, reverse=True)

In [113]:
X_train = train_dataset.pivot_table(
    index='user_id',
    columns='airport',
    values='number_of_flights'
).fillna(0)

test_dataset_pivot = test_dataset.pivot_table(
    index='user_id',
    columns='airport',
    values='number_of_flights'
).fillna(0)
test_dataset_pivot.head()



col_list = (test_dataset_pivot.append([X_train])).columns.tolist()
test_dataset_pivot = test_dataset_pivot.loc[:, col_list].fillna(0)
X_train = X_train.loc[:, col_list].fillna(0)
X_train.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
test_dataset_pivot.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_10883,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_11380,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_11965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=50, n_jobs=-1)
model_knn.fit(X_train)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=50, p=2, radius=1.0)

In [152]:
KNearestNeighbors = model_knn.kneighbors(test_dataset_pivot)
for element in KNearestNeighbors:
    print(element)

[[0.21299037 0.21414758 0.22021908 ... 0.65095508 0.65095508 0.65162918]
 [0.04358805 0.0866134  0.09125519 ... 0.29276882 0.29394277 0.29699275]
 [0.06862962 0.07277808 0.09589477 ... 0.44089386 0.4515915  0.45399457]
 ...
 [0.43208073 0.43636364 0.43639507 ... 0.5049178  0.50643606 0.50787277]
 [0.20059193 0.23513384 0.27942331 ... 0.43386148 0.44198813 0.44250533]
 [0.01680444 0.02785322 0.02794844 ... 0.14719713 0.15040921 0.163017  ]]
[[16523 14965 30074 ... 11607 15913 25366]
 [ 9789 21563 15851 ... 25186 35266 16950]
 [  796 16017 24089 ... 21534 13352 12093]
 ...
 [26211 38146 30734 ... 27735  8537   681]
 [33585  8350 22490 ... 18758 13143 37608]
 [   44 32143  1704 ... 17904  9224 26404]]


In [153]:
i = 0
returnList = []
for user_row in test_dataset_pivot.iterrows():
    listOfAverages = findAverages(KNearestNeighbors[1][i], X_train)
    for entry in listOfAverages:
        if(user_row[1][entry[1]] < 1):
            returnList.append([user_row[0], entry[1]])
            break
#     if !returnFlag:
#         #append something from Sakib's list
    i += 1
print(returnList)

[['user_10883', 'KINT'], ['user_10972', 'S39'], ['user_11380', 'KTIF'], ['user_1153', 'KBMQ'], ['user_11965', 'KWHP'], ['user_12112', 'KLVK'], ['user_12358', 'KHDO'], ['user_12522', 'KABE'], ['user_13088', '6P3'], ['user_13142', 'KTOB'], ['user_13304', 'KAYS'], ['user_13327', 'KLUA'], ['user_13633', 'I17'], ['user_13677', 'KBFI'], ['user_13969', 'KHTO'], ['user_1452', 'KHII'], ['user_14667', 'KMGN'], ['user_14768', 'KTTN'], ['user_14897', 'F69'], ['user_15260', 'KCCB'], ['user_15573', 'KRZR'], ['user_16010', '4M9'], ['user_16208', 'KPGD'], ['user_16833', 'PAFA'], ['user_17231', '0S9'], ['user_17335', '3F3'], ['user_17459', 'KPVU'], ['user_17616', 'KLUM'], ['user_18326', 'KHSV'], ['user_18680', '06WN'], ['user_19242', '2J9'], ['user_19306', 'K24'], ['user_20792', 'KSVH'], ['user_20887', 'KISM'], ['user_2120', 'KCVX'], ['user_21447', 'KDTL'], ['user_21650', 'W96'], ['user_21696', 'KPWK'], ['user_2211', 'KPVC'], ['user_24040', 'KPNS'], ['user_24720', '1A5'], ['user_25088', 'KCRW'], ['user

In [154]:
#if it makes it here lol
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=200, n_jobs=-1)
model_knn.fit(X_train)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=200, p=2, radius=1.0)

In [155]:
KNearestNeighbors = model_knn.kneighbors(test_dataset_pivot)
i = 0
returnList = []
for user_row in test_dataset_pivot.iterrows():
    listOfAverages = findAverages(KNearestNeighbors[1][i], X_train)
    for entry in listOfAverages:
        if(user_row[1][entry[1]] < 1):
            returnList.append([user_row[0], entry[1]])
            break
#     if !returnFlag:
#         #append something from Sakib's list
    i += 1
print(returnList)

[['user_10883', 'KLNL'], ['user_10972', 'KOTH'], ['user_11380', 'KTIF'], ['user_1153', 'KLVJ'], ['user_11965', 'KFTG'], ['user_12112', 'CA32'], ['user_12358', 'KRIL'], ['user_12522', 'KPGV'], ['user_13088', 'MO00'], ['user_13142', 'Y19'], ['user_13304', 'KHQU'], ['user_13327', 'KGRD'], ['user_13633', 'KPTK'], ['user_13677', '1V6'], ['user_13969', 'KMGJ'], ['user_1452', 'KIYK'], ['user_14667', 'KBJC'], ['user_14768', 'KTTN'], ['user_14897', 'F69'], ['user_15260', 'KCMH'], ['user_15573', 'KSBM'], ['user_16010', '4M9'], ['user_16208', 'KCNI'], ['user_16833', 'KIWS'], ['user_17231', 'PHMK'], ['user_17335', 'KBQP'], ['user_17459', 'KTRK'], ['user_17616', 'KMML'], ['user_18326', 'KTBR'], ['user_18680', '06WN'], ['user_19242', 'FD38'], ['user_19306', 'KAFJ'], ['user_20792', '0R7'], ['user_20887', 'KOCF'], ['user_2120', 'KMMU'], ['user_21447', 'KLHZ'], ['user_21650', 'S64'], ['user_21696', 'KATY'], ['user_2211', 'KRUT'], ['user_24040', 'KMJX'], ['user_24720', 'PHNL'], ['user_25088', 'KCRW'], [

In [162]:
import csv

with open('output.csv', 'w') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(returnList)
