In [1]:
from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as  np
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.decomposition import PCA

import sys
import os

path = os.path.abspath(os.getcwd()) + '/../data_load'
sys.path.insert(0, path)

In [2]:
import datetime, time

def dateToInt(x):    
    _date = x.split('/')
    
    year, time = _date[2].split()
    hour, minute = time.split(':')

    if int(hour) > 23:
        hour = 23
    if int(hour) < 0:
        hour = 0
        
    t = datetime.datetime(int(year), int(_date[0]), int(_date[1]), int(hour), int(minute), 0, 0).timestamp()
    return t

def getLabelMap(data):
    # Map shape to an integer
    labels = data['shape']
    labels = labels.unique()
    nums = [i for i in range(len(labels))]
    return dict(zip(labels, nums))
    

# Vectorize strings

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

def vectorize(text):    
    # create the transform
    vectorizer = CountVectorizer()

    # tokenize and build vocab
    vectorizer.fit(text)

    # encode document
    vector = vectorizer.transform(text)

    return vector.toarray()

def vectorize_dm(text):
    n_features = 5000
    n_top_words = 20

    count_vectorizor = CountVectorizer(
            max_df=0.95, 
            min_df=2,
            max_features=n_features,
            stop_words='english'
    )
    count = count_vectorizor.fit_transform(text)
#     count_feature_names = count_vectorizor.get_feature_names()
#     print(count_feature_names)
    return count.toarray()

In [4]:
import os
import sys
path = os.path.abspath(os.getcwd()) + '/../data_load'
sys.path.insert(0, path)
from ufo_data import UFOData

names=['datetime', 'shape', 'duration(seconds)', 'comments','longitude', 'latitude']

ufo_data = UFOData(cols=names, country='us')
data = ufo_data.encoded()

# Create dataframe out of the comment vector
commentDf = pd.DataFrame(vectorize_dm(data['comments']))
# commentDf['key'] = [i for i in range(commentDf.shape[0])]
print(commentDf.shape)
print(data.shape)

(64896, 5000)
(64896, 6)


# Apply Principle Component Anaysis (PCA) to only extract the 100 most relevant features

In [5]:
pca = PCA(n_components=20)
# XX = pca.fit_transform(X)

commentDf = pca.fit_transform(commentDf)
commentDf = pd.DataFrame(commentDf)

## Merge the comment vectors with data
#### Takes a  long time

In [6]:

# Since we already vectorized the comments we can get rid of them now
if 'comments' in names:
    names.remove('comments')
# names.remove('key')
# Add key to merge the dataframes    
if 'key' not in names:
    names.append('key')
    data['key'] = [i for i in range(data.shape[0])]
    commentDf['key'] = [i for i in range(commentDf.shape[0])]        

dt = data[names].merge(commentDf, on='key', how='left')

dt.head()

Unnamed: 0,datetime,shape,duration(seconds),longitude,latitude,key,0,1,2,3,...,10,11,12,13,14,15,16,17,18,19
0,-638224200.0,0,2700,29.8830556,-97.9411111,0,-0.593106,-0.190754,-0.28232,-0.246347,...,-0.013473,0.033756,-0.069167,-0.150652,-0.021241,-0.052523,-0.055023,-0.090842,0.078578,-0.000604
1,-417297600.0,1,20,28.9783333,-96.6458333,1,0.409239,-0.201193,-0.328218,-0.263124,...,-0.02934,-0.010125,-0.110545,-0.139359,-0.010865,-0.089516,-0.05837,-0.06404,0.072943,-0.002115
2,-291070800.0,2,900,21.4180556,-157.8036111,2,0.471579,-0.226775,-0.324664,-0.179353,...,-0.199383,-0.038169,0.072736,-0.134776,-0.007711,-0.176942,0.684728,-0.177663,-0.271457,-0.471136
3,-259538400.0,3,300,36.595,-82.1888889,3,-0.594173,-0.198979,-0.289476,-0.26095,...,-0.01167,0.040052,-0.067081,-0.167316,-0.030233,-0.069421,-0.096605,-0.081779,0.007832,-0.006804
4,-133294500.0,4,1200,41.1175,-73.4083333,4,-0.542606,0.145994,0.040089,-0.161465,...,0.193471,0.065173,0.212618,0.179415,0.139464,0.027179,-0.222321,-0.120377,0.115508,-0.106054


# Get x and y

In [7]:
# label_dict = getLabelMap(dt)

Y = np.array(dt['shape']) #np.array([label_dict[i] for i in dt['shape']])

x_cols = list(dt)

if 'shape' in x_cols:
    x_cols.remove('shape')
    x_cols.remove('datetime')

X = dt[x_cols]

In [44]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2, 
    random_state=42)

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# rf = RandomForestClassifier(n_estimators=25)
rf = RandomForestClassifier(n_estimators=25, max_features='auto', max_depth=None)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=25, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [None]:
# pred = rf.predict(X_test)
y_pred_probs = rf.predict_proba(X_test)
y_pred = rf.predict(X_test)

print(rf.score(X_test, y_test))
s = y_test
count = 0

for i in range(len(y_pred)):
    if y_pred[i] == s[i]:
        count += 1
print(count)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

def RandomForest(X_train, y_train, X_test, y_test):
    rf = RandomForestClassifier(n_estimators=25, max_features='auto', max_depth=None)
    rf.fit(X_train, y_train)
    
    y_pred_probs = rf.predict_proba(X_test)
    y_pred = rf.predict(X_test)

    print(rf.score(X_test, y_test))

# Accuracy and F Score

In [None]:
count/len(y_pred)

In [None]:

# scores = f1_score(y_test, y_guess, average=None)
scores = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))


In [None]:
print(scores)
print(np.mean(scores))

In [None]:
print(count/len(y_pred))

scores = f1_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

print(scores)
print(np.mean(scores))



In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cm

In [8]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split



def prepare_data(X, Y):


    X_train, X_test, y_train, y_test = train_test_split(
        X,
        Y,
        test_size=0.2, 
        random_state=42)
    
    scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
    X_train = scaling.transform(X_train)
    X_test = scaling.transform(X_test)
    
    return X_train, X_test, y_train, y_test

In [9]:
# training a linear SVM classifier
from sklearn.svm import SVC
def svm(X_train, y_train, X_test, y_test):
   
    svm_model_linear = SVC(kernel = 'linear', C = 1, cache_size=7000).fit(X_train, y_train)
    svm_predictions = svm_model_linear.predict(X_test)

    # model accuracy for X_test
    accuracy = svm_model_linear.score(X_test, y_test)

    # creating a confusion matrix
#     cm = confusion_matrix(y_test, svm_predictions)
    print(accuracy)
#     return cm

In [12]:
X_train, X_test, y_train, y_test = prepare_data(X, Y)
svm(X_train, y_train, X_test, y_test)

0.269568567026


In [16]:
import sys
import os

path = os.path.abspath(os.getcwd()) + '/../credibility_score'
sys.path.insert(0, path)

from scoring import Scoring
from ufo_data import UFOData


names=['datetime', 'city','state', 'shape', 'duration(seconds)', 'comments','date posted', 'longitude', 'latitude']
loader = UFOData(names, 'us')
data = loader.encoded()

#calculate credibility scores on data
s = Scoring(data)
scored_data = s.calc_scores()
    



KeyError: "['timestamp'] not in index"

In [None]:
names=['state', 'duration(seconds)', 'longitude', 'latitude', 'posted_ts', 'dbscan_cluster','credibility']    
X = scored_data[names]   
Y = scored_data['shape']

X_train, X_test, y_train, y_test = prepare_data(X, Y)

print('SVM')
svm(X_train, y_train, X_test, y_test)

print('Random Forest')
RandomForest(X_train, y_train, X_test, y_test)

In [27]:
names=['state', 'duration(seconds)', 'longitude', 'latitude']

X = data[names]   
Y = data['shape']

X_train, X_test, y_train, y_test = prepare_data(X, Y)
print('SVM')
svm(X_train, y_train, X_test, y_test)

print('Random Forest')
RandomForest(X_train, y_train, X_test, y_test)

SVM
0.208397534669
Random Forest
0.130277349769


KeyError: 0