In [18]:
import numpy as np
import pandas as pd

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Report Data

In [21]:
ufo_df = pd.read_csv('data/ufo_df.csv', index_col=0)
ufo_df.head()

Unnamed: 0_level_0,datetime,geolocation,season,month,day,time_of_day,region,shape,duration,report_text
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
073/S73909,2010-01-01 00:00:00,"Park City, KY",Winter,January,Friday,Night,East South Central,Light,3.0,"Lights orbiting the moon,I am 10,and i was wit..."
073/S73915,2010-01-01 00:00:00,"La Mesa, CA",Winter,January,Friday,Night,Pacific,Light,600.0,Three red lights over southern California that...
078/S78231,2010-01-01 00:00:00,"Benton, AR",Winter,January,Friday,Night,West South Central,Circle,300.0,4 bright green circles high in the sky going i...
073/S73918,2010-01-01 00:00:00,"El Cajon, CA",Winter,January,Friday,Night,Pacific,Triangle,720.0,"3 Red objects hovering over El Cajon CA ,Exit..."
073/S73916,2010-01-01 00:00:00,"Lemon Grove, CA",Winter,January,Friday,Night,Pacific,Light,900.0,3 Red lights in line pattern above El Cajon/ E...


In [22]:
ufo_df.loc[y[y.isna()].index, :]

Unnamed: 0_level_0,datetime,geolocation,season,month,day,time_of_day,region,shape,duration,report_text
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
085/S85858,2011-11-21 02:19:00,"Woodland, CA",Fall,November,Monday,Night,Pacific,Unknown,600.0,"Re: ""Abduction"" at Woodland, CA the descriptio..."
087/S87719,2012-03-15 21:30:00,"Pearl, MS",Winter,March,Thursday,Evening,East South Central,Unknown,7.0,isawabluestarmovingslowwthadstrobelightaroundi...
106/S106746,2014-01-29 05:30:00,"Fort Lauderdale, FL",Winter,January,Wednesday,Night,South Atlantic,Unknown,2.0,Extremely loud boom!
132/S132902,2017-02-24 21:00:00,"Bridgeport, CT",Winter,February,Friday,Evening,New England,Circle,787.613198,Yesterday I made a claim apparently they relea...
135/S135404,2017-07-28 22:30:00,"Chicago, IL",Summer,July,Friday,Evening,East North Central,Fireball,787.613198,Three ufos. Fire type over Humboldt Park.


In [23]:
# report topic as target variable
y = pd.read_csv('data/ufo_tops.csv', index_col=0, header=None, names=['topic'])['topic']
y.head()

073/S73909          Observation Terms
073/S73915                   Lights 1
078/S78231                   Lights 2
073/S73918    Observation Description
073/S73916                     Colors
Name: topic, dtype: object

In [24]:
y[y.isna()]

Series([], Name: topic, dtype: object)

In [25]:
# number of target classes
num_cls = len(y.unique())

## Categorical Data

In [26]:
# encode categorical variables
X_cat = pd.get_dummies(ufo_df[['season', 'month', 'day', 'time_of_day', 'region', 'shape']])
X_cat.head()

Unnamed: 0_level_0,season_Fall,season_Spring,season_Summer,season_Winter,month_April,month_August,month_December,month_February,month_January,month_July,...,shape_Flash,shape_Formation,shape_Light,shape_Other,shape_Oval,shape_Rectangle,shape_Sphere,shape_Teardrop,shape_Triangle,shape_Unknown
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
073/S73909,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
073/S73915,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
078/S78231,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
073/S73918,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
073/S73916,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


## Feature Dataframe

In [27]:
# combine numerical and encoded categorical variables
X = pd.concat([ufo_df['duration'], X_cat], axis=1)
X.head()

Unnamed: 0_level_0,duration,season_Fall,season_Spring,season_Summer,season_Winter,month_April,month_August,month_December,month_February,month_January,...,shape_Flash,shape_Formation,shape_Light,shape_Other,shape_Oval,shape_Rectangle,shape_Sphere,shape_Teardrop,shape_Triangle,shape_Unknown
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
073/S73909,3.0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
073/S73915,600.0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
078/S78231,300.0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
073/S73918,720.0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
073/S73916,900.0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


## Test/Train Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Modeling

In [9]:
xgb_clf = XGBClassifier(objective='multi:softmax', num_class=num_cls, random_state=0)
xgb_clf.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'float'

In [47]:
gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'float'