In [1]:
import pandas as pd 
import numpy as np 
from scipy import stats
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from scipy.sparse import csr_matrix

import xgboost as xgb
from xgboost import XGBClassifier 
from xgboost import XGBRegressor
from xgboost import plot_importance
import matplotlib.pylab as plt
from matplotlib import pyplot
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [2]:
train_dataset = pd.read_csv("HackRiceForeFlightData_Train.csv")

In [72]:
train_dataset.head()

Unnamed: 0,user_id,airport,number_of_flights,airport_type,number_fbos,number_restaurants,number_hotels,number_maintenance,number_flight_schools,number_flying_clubs,name,latitude,longitude
0,user_31441,KSAT,6,AIRPORT,6,15,10,9,7,0,San Antonio International,29.533958,-98.469057
1,user_31441,KROW,3,AIRPORT,2,15,10,1,1,0,Roswell International Air Center,33.29987,-104.529398
2,user_31441,KMAF,2,AIRPORT,1,12,8,1,1,0,Midland International Air and Space Port,31.942528,-102.201917
3,user_31441,KCRP,1,AIRPORT,0,15,13,1,1,1,Corpus Christi International,27.772194,-97.502417
4,user_31441,KLBB,1,AIRPORT,0,16,10,3,5,1,Lubbock Preston Smith International,33.663667,-101.820556


In [3]:
test_dataset = pd.read_csv("HackRiceForeFlightData_Test.csv")

In [38]:
df_airport_features_pivot = train_dataset.pivot_table(
    index='user_id',
    columns='airport',
    values='number_of_flights'
).fillna(0)
#df_airport_features_pivot = df_airport_features_pivot.drop(columns=['00AL'])

In [39]:
X_train = df_airport_features_pivot[list(df_airport_features_pivot.columns)[0:30000]]
X_test = df_airport_features_pivot[list(df_airport_features_pivot.columns)[30001:-1]]

In [40]:
X_train.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df_airport_features_pivot.head()

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user_1000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [42]:
df_airport_features_pivot.loc[['user_0']]

airport,00AL,00C,00F,00FL,00M,00MN,00N,00NC,00R,00S,...,Y87,Y88,Y89,Y91,Y93,Y94,Y95,Y96,Y99,Z98
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
user_0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model_knn.fit(X_train)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [73]:
KNearestNeighbors = model_knn.kneighbors([df_airport_features_pivot.loc['user_31411']])

In [74]:
print(KNearestNeighbors[1])

[[23731   318  7613  5950 37479 10943 31237 20761 30026 36139 15376 25516
  17479 16387 37693 34359  8948 20868 26167 34260]]


In [64]:
neighbor = X_train.iloc[0]
neighbor['00AL']

0.0

In [75]:
listOfAverages = []
for airport in X_train.columns:
    counter = 0
    totalRating = 0
    for index in KNearestNeighbors[1][0]:
        neighbor = X_train.iloc[index] 
        totalRating += neighbor[airport]
        counter += 1
    averageRating = totalRating//counter
    listOfAverages.append([averageRating,airport])
    

In [76]:
sorted(listOfAverages)

[[0.0, '00AL'],
 [0.0, '00C'],
 [0.0, '00F'],
 [0.0, '00FL'],
 [0.0, '00M'],
 [0.0, '00MN'],
 [0.0, '00N'],
 [0.0, '00NC'],
 [0.0, '00R'],
 [0.0, '00S'],
 [0.0, '00SC'],
 [0.0, '00U'],
 [0.0, '00WA'],
 [0.0, '01CL'],
 [0.0, '01G'],
 [0.0, '01GE'],
 [0.0, '01J'],
 [0.0, '01M'],
 [0.0, '01MD'],
 [0.0, '01MT'],
 [0.0, '01NC'],
 [0.0, '01WY'],
 [0.0, '02A'],
 [0.0, '02AR'],
 [0.0, '02C'],
 [0.0, '02FA'],
 [0.0, '02G'],
 [0.0, '02GA'],
 [0.0, '02GE'],
 [0.0, '02KS'],
 [0.0, '02MO'],
 [0.0, '02T'],
 [0.0, '02WA'],
 [0.0, '02WN'],
 [0.0, '03B'],
 [0.0, '03D'],
 [0.0, '03ME'],
 [0.0, '03NC'],
 [0.0, '03S'],
 [0.0, '04A'],
 [0.0, '04CA'],
 [0.0, '04G'],
 [0.0, '04I'],
 [0.0, '04M'],
 [0.0, '04TN'],
 [0.0, '04V'],
 [0.0, '04VG'],
 [0.0, '04W'],
 [0.0, '04Y'],
 [0.0, '05C'],
 [0.0, '05D'],
 [0.0, '05KY'],
 [0.0, '05N'],
 [0.0, '05S'],
 [0.0, '05U'],
 [0.0, '05V'],
 [0.0, '05XS'],
 [0.0, '06A'],
 [0.0, '06B'],
 [0.0, '06C'],
 [0.0, '06D'],
 [0.0, '06FA'],
 [0.0, '06FD'],
 [0.0, '06ID'],
 [0.0, '06

In [None]:
feature_cols = ['number_fbos','number_of_flights', 'number_restaurants', 'number_hotels', 'number_maintenance', 'number_flight_schools','number_flying_clubs']
target = 'airport'
ID_col = 'user_id'
train_X = train_dataset[feature_cols]
train_Y = train_dataset[target]
test_dataset_X = test_dataset[feature_cols]
test__dataset_Y = test_dataset[target]
test_Id = test_dataset[ID_col]
predictors = [x for x in train_features_X.columns if x not in [target,ID_col]]

In [None]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))