In [4]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as  np
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

In [5]:
import datetime, time

def dateToInt(x):    
    _date = x.split('/')
    
    year, time = _date[2].split()
    hour, minute = time.split(':')

    if int(hour) > 23:
        hour = 23
    if int(hour) < 0:
        hour = 0
        
    t = datetime.datetime(int(year), int(_date[0]), int(_date[1]), int(hour), int(minute), 0, 0).timestamp()
    return t

def getLabelMap(data):
    # Map shape to an integer
    labels = data['shape']
    labels = labels.unique()
    nums = [i for i in range(len(labels))]
    return dict(zip(labels, nums))
    

# Vectorize strings

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(text):    
    # create the transform
    vectorizer = CountVectorizer()

    # tokenize and build vocab
    vectorizer.fit(text)

    # encode document
    vector = vectorizer.transform(text)

    return vector.toarray()

def vectorize_dm(text):
    n_features = 5000
    n_top_words = 20

    count_vectorizor = CountVectorizer(
            max_df=0.95, 
            min_df=2,
            max_features=n_features,
            stop_words='english'
    )
    count = count_vectorizor.fit_transform(text)
#     count_feature_names = count_vectorizor.get_feature_names()
#     print(count_feature_names)
    return count.toarray()

# TFIDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog.",
        "The dog.",
        "The fox"]
# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab
vector = vectorizer.fit_transform(text)

# # summarize
# print(vectorizer.vocabulary_)
# print(vectorizer.idf_)

# # encode document
# vector = vectorizer.transform(text)

# summarize encoded vector
# print(vector.shape)
print(vector.toarray())
print('\n')
adf = pd.DataFrame(vector.todense())
adf

In [19]:
names=['datetime', 'city', 'state', 'country',
                              'shape', 'duration(seconds)', 'duration(hours/min)', 'comments',
                              'date posted', 'longitude', 'latitude']
data = pd.read_csv('ufo-scrubbed-geocoded-time-standardized.csv',
                   names=names,
                   dtype=object,
                   error_bad_lines=False, warn_bad_lines=False)

data = data[data['country'] == 'us']

# Columns currently not using
# names.remove('shape')
names.remove('date posted')
names.remove('duration(hours/min)')
names.remove('country')
names.remove('state')
names.remove('city')

# Make date into ints
data['datetime'] = data.datetime.apply(lambda x: dateToInt(x))

# Filter rows that have strings in the duration column
data = data[data['duration(seconds)'].apply(lambda x : str.isdigit(x))]

# Add keys to be able to merge
data['key'] = [i for i in range(data.shape[0])]
names.append('key')

# Get rid of unicode characters
text = data["comments"].apply(lambda x: ''.join([" " if ord(i) < 32 or ord(i) > 126 else i for i in str(x)]))

# Create dataframe out of the comment vector
commentDf = pd.DataFrame(vectorize_dm(text))
commentDf['key'] = [i for i in range(commentDf.shape[0])]
print(commentDf.shape)
print(data.shape)

# Merge comment dataframe and original dataframe
# data.merge(commentDf, on='key', how='left')

# data.shape[0]

#

(64896, 5001)
(64896, 12)


# Merge the comment vectors with data
#### Takes a  long time

In [20]:
# Since we already vectorized the comments we can get rid of them now
if 'comments' in names:
    names.remove('comments')
    
dt = data[names].merge(commentDf, on='key', how='left')
dt.head()

Unnamed: 0,datetime,shape,duration(seconds),longitude,latitude,key,0,1,2,3,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
0,-638224200.0,cylinder,2700,29.8830556,-97.9411111,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-417297600.0,circle,20,28.9783333,-96.6458333,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-291070800.0,light,900,21.4180556,-157.8036111,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-259538400.0,sphere,300,36.595,-82.1888889,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-133294500.0,disk,1200,41.1175,-73.4083333,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Get x and y

In [21]:
label_dict = getLabelMap(dt)

Y = np.array([label_dict[i] for i in dt['shape']])

x_cols = list(dt)

if 'shape' in x_cols:
    x_cols.remove('shape')

X = dt[x_cols]

# Apply Principle Component Anaysis (PCA) to only extract the 100 most relevant features

In [22]:
pca = PCA(n_components=100)
XX = pca.fit_transform(X)

In [26]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2, 
    random_state=42)
# print(X_train.shape, y_train.shape)
# print(X_test.shape, y_test.shape)
# print(X.shape)
# print(X.head(10))

In [47]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# rf = RandomForestClassifier(n_estimators=25)
rf = RandomForestClassifier(n_estimators=25, max_features='auto', max_depth=None)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [48]:
# pred = rf.predict(X_test)
y_pred_probs = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)


s = y_test
count = 0

for i in range(len(y_pred)):
    if y_pred[i] == s[i]:
        count += 1
print(count)


5961


# Accuracy and F Score

In [53]:
count/len(y_pred)

0.45924499229583976

In [52]:

# scores = f1_score(y_test, y_guess, average=None)
scores = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))


In [51]:
print(scores)
print(np.mean(scores))

0.437853538339
0.437853538339
