In [2]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as  np
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [3]:
import datetime, time

def dateToInt(x):    
    _date = x.split('/')
    
    year, time = _date[2].split()
    hour, minute = time.split(':')

    if int(hour) > 23:
        hour = 23
    if int(hour) < 0:
        hour = 0
        
    t = datetime.datetime(int(year), int(_date[0]), int(_date[1]), int(hour), int(minute), 0, 0).timestamp()
    return t

def getLabelMap(data):
    # Map shape to an integer
    labels = data['shape']
    labels = labels.unique()
    nums = [i for i in range(len(labels))]
    return dict(zip(labels, nums))
    

# Vectorize strings

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(text):    
    # create the transform
    vectorizer = CountVectorizer()

    # tokenize and build vocab
    vectorizer.fit(text)

    # encode document
    vector = vectorizer.transform(text)

    return vector.toarray()

def vectorize_dm(text):
    n_features = 5000
    n_top_words = 20

    count_vectorizor = CountVectorizer(
            max_df=0.95, 
            min_df=2,
            max_features=n_features,
            stop_words='english'
    )
    count = count_vectorizor.fit_transform(text)
#     count_feature_names = count_vectorizor.get_feature_names()
#     print(count_feature_names)
    return count.toarray()

# TFIDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf(text):
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

    vectorizer = TfidfVectorizer(
    #     min_df=1,
        max_df=0.7, 
        analyzer='word',
        ngram_range=(1, 1),
#         stop_words=None,
        stop_words='english')
    vector = vectorizer.fit_transform(text)
    # vectorizer.get_feature_names()
    return vector.toarray() 

# Loading and Cleaning up data

In [12]:
names=['datetime', 'city', 'state', 'country',
                              'shape', 'duration(seconds)', 'duration(hours/min)', 'comments',
                              'date posted', 'longitude', 'latitude']
data = pd.read_csv('../ufo-scrubbed-geocoded-time-standardized.csv',
                   names=names,
                   dtype=object,
                   error_bad_lines=False, warn_bad_lines=False)

data = data[data['country'] == 'us']

# Columns currently not using
names.remove('date posted')
names.remove('duration(hours/min)')
names.remove('country')
names.remove('state')
names.remove('city')

# Make date into ints
data['datetime'] = data.datetime.apply(lambda x: dateToInt(x))

# Filter rows that have strings in the duration column
data = data[data['duration(seconds)'].apply(lambda x : str.isdigit(x))]

# Add keys to be able to merge
data['key'] = [i for i in range(data.shape[0])]
names.append('key')


# Creating Comment Features

In [15]:
# Get rid of unicode characters
text = data["comments"].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in str(x)]))

# Create dataframe out of the comment vector
# commentDf = pd.DataFrame(vectorize_dm(text))
commentDf = pd.DataFrame(tfidf(text))
commentDf['key'] = [i for i in range(commentDf.shape[0])]
print(commentDf.shape)
print(data.shape)

(64896, 24420)
(64896, 12)


# Apply Principle Component Anaysis (PCA) to only extract the 100 most relevant features

In [None]:
pca = PCA(n_components=100)
dt = pca.fit_transform(commentDf)

In [40]:
dt.shape

(64896, 100)

# Merge the comment vectors with data
 Takes a  long time

In [30]:
# Since we already vectorized the comments we can get rid of them now
if 'comments' in names:
    names.remove('comments')
_dt = pd.DataFrame(dt)    
_dt['key'] = [i for i in range(_dt.shape[0])]
dt_combined = data[names].merge(_dt, on='key', how='left')
dt_combined.head()

Unnamed: 0,datetime,shape,duration(seconds),longitude,latitude,key,0,1,2,3,...,90,91,92,93,94,95,96,97,98,99
0,-638224200.0,cylinder,2700,29.8830556,-97.9411111,0,32447.5,0.022872,-0.048544,-0.093274,...,-0.018625,0.019019,0.018338,-0.012718,-0.03104,-0.007422,0.028385,-0.027876,-0.026148,-0.008617
1,-417297600.0,circle,20,28.9783333,-96.6458333,1,32446.5,-0.031388,-0.128362,-0.021686,...,-0.02491,-0.018309,-0.020805,0.02478,-0.02168,0.029465,0.013871,-0.00799,0.022723,0.018363
2,-291070800.0,light,900,21.4180556,-157.8036111,2,32445.5,-0.064271,-0.045229,-0.035193,...,0.007032,0.026768,-0.015443,0.056996,-0.013207,-0.009036,-0.002535,0.001083,-0.049151,0.096894
3,-259538400.0,sphere,300,36.595,-82.1888889,3,32444.5,-0.024537,-0.085541,-0.077092,...,0.013916,0.007839,-0.010955,-0.015469,0.00307,0.006732,-0.018667,-0.003456,-0.007307,0.022287
4,-133294500.0,disk,1200,41.1175,-73.4083333,4,32443.5,-0.037341,-0.011301,-0.013076,...,-0.014662,0.219529,0.052545,0.027624,-0.007185,0.098949,-0.182941,0.04991,0.077155,0.111064


## Test-Train Split

For testing the classifiers we will split the data into a 'test' and 'train' set. This will be done with a 40-60% (test-train) split.

In [33]:
label_dict = getLabelMap(dt_combined)

x_cols = list(dt_combined)

if 'shape' in x_cols:
    x_cols.remove('shape')

Y = np.array([label_dict[i] for i in dt_combined['shape']])    
X = dt_combined[x_cols]

In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2, 
    random_state=42)


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# rf = RandomForestClassifier(n_estimators=25)
rf = RandomForestClassifier(n_estimators=25, max_features='auto', max_depth=None)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [36]:
# pred = rf.predict(X_test)
y_pred_probs = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)

s = y_test
count = 0

for i in range(len(y_pred)):
    if y_pred[i] == s[i]:
        count += 1
print(count)


4682


# Accuracy and F Score

In [37]:
count/len(y_pred)

0.36070878274268103

In [38]:

# scores = f1_score(y_test, y_guess, average=None)
scores = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))


In [39]:
print(scores)
print(np.mean(scores))

0.322256593672
0.322256593672
