In [1]:
import numpy as np
import pandas as pd

In [2]:
import random

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

In [4]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

  from numpy.core.umath_tests import inner1d


# Report Data

In [5]:
def sample_skip_idx(file, pct, rand=False):
    '''
    Generate row index for sampling data
    
    Parameters
    ----------
    file : data file
    pct : percent of data to sample
    rand : randomly sample
    '''
    
    # total number of lines in file
    n_lines = sum(1 for line in open(file))
    
    # sample randomly
    if rand: 
        # sample size
        size = int(n_lines * pct/100)
        # row indicies to skip
        skip_idx = random.sample(range(1, n_lines), n_lines - size)
    # sample every nth row    
    else: 
        # number of lines to skip
        n = 1/pct*100
        # row indicies to skip
        skip_idx = [x for x in range(1, n_lines) if x % n != 0]
        
    # return row index to skip
    return(skip_idx)

In [7]:
# sample every 10th row
skiprows = sample_skip_idx('data/ufo_df.csv', 10)
ufo_df = pd.read_csv('data/ufo_df.csv', index_col=0, skiprows=skiprows)
ufo_df.head()

Unnamed: 0_level_0,datetime,geolocation,season,month,day,time_of_day,region,shape,duration,report_text
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
073/S73908,2010-01-01 00:00:00,"St. Louis, MO",Winter,January,Friday,Night,West North Central,Fireball,60.0,7 floating orbs or fireballs moving from south...
097/S97247,2010-01-01 00:20:00,"Anchorage, AK",Winter,January,Friday,Night,Pacific,Light,787.613198,"Upper atmosphere object moving south, becoming..."
073/S73917,2010-01-01 12:02:00,"San Diego, CA",Winter,January,Friday,Afternoon,Pacific,Triangle,180.0,"It happen nEw years eve 2010,Triangle red ligh..."
085/S85087,2010-01-02 18:45:00,"Middleburg, FL",Winter,January,Saturday,Evening,South Atlantic,Sphere,600.0,"I was on my way out the door of my house , whe..."
073/S73980,2010-01-04 19:10:00,"Cedar Rapids, IA",Winter,January,Monday,Evening,West North Central,Light,900.0,Two very bright white lights come close togeth...


In [14]:
len(ufo_df)

5151

In [8]:
# report topic as target variable
y = pd.read_csv('data/ufo_tops.csv', index_col=0, header=None, names=['topic'])['topic']
y.head()

073/S73909          Observation Terms
073/S73915                   Lights 1
078/S78231                   Lights 2
073/S73918    Observation Description
073/S73916                     Colors
Name: topic, dtype: object

In [9]:
# sample target data
y = y[y.index.isin(ufo_df.index)]
y.head()

073/S73908    Observation Terms
097/S97247            Direction
073/S73917               Colors
085/S85087    Observation Terms
073/S73980             Lights 2
Name: topic, dtype: object

In [15]:
len(y)

6315

In [18]:
y[y.index.duplicated()]

095/S95670     Observation Terms
095/S95821          Fiery Lights
095/S95714          Fiery Lights
095/S95667          NUFORC Notes
095/S95661     Observation Terms
                     ...        
105/S105812         NUFORC Notes
105/S105874         Fiery Lights
105/S105964    Observation Terms
105/S105815         Fiery Lights
105/S105850         Fiery Lights
Name: topic, Length: 1164, dtype: object

In [10]:
# number of target classes
num_cls = len(y.unique())

## Categorical Data

In [11]:
# encode categorical variables
X_cat = pd.get_dummies(ufo_df[['season', 'month', 'day', 'time_of_day', 'region', 'shape']])
X_cat.head()

Unnamed: 0_level_0,season_Fall,season_Spring,season_Summer,season_Winter,month_April,month_August,month_December,month_February,month_January,month_July,...,shape_Flash,shape_Formation,shape_Light,shape_Other,shape_Oval,shape_Rectangle,shape_Sphere,shape_Teardrop,shape_Triangle,shape_Unknown
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
073/S73908,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
097/S97247,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
073/S73917,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
085/S85087,0,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
073/S73980,0,0,0,1,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


## Feature Dataframe

In [12]:
# combine numerical and encoded categorical variables
X = pd.concat([ufo_df['duration'], X_cat], axis=1)
X.head()

Unnamed: 0_level_0,duration,season_Fall,season_Spring,season_Summer,season_Winter,month_April,month_August,month_December,month_February,month_January,...,shape_Flash,shape_Formation,shape_Light,shape_Other,shape_Oval,shape_Rectangle,shape_Sphere,shape_Teardrop,shape_Triangle,shape_Unknown
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
073/S73908,60.0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
097/S97247,787.613198,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
073/S73917,180.0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
085/S85087,600.0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
073/S73980,900.0,0,0,0,1,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0


## Test/Train Split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [5151, 6315]

# Modeling

In [11]:
xgb_clf = XGBClassifier(objective='multi:softmax', num_class=num_cls, random_state=0)
xgb_clf.fit(X_train, y_train)

KeyboardInterrupt: 

In [47]:
gb_clf = GradientBoostingClassifier(random_state=0)
gb_clf.fit(X_train, y_train)

TypeError: '<' not supported between instances of 'str' and 'float'