In [1]:
import numpy as np
import pandas as pd

In [2]:
import random
import pickle

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Data

In [4]:
# Load the UFO reports table; the first CSV column is used as the row index.
ufo_df = pd.read_csv('data/ufo_df.csv', index_col=0)

In [5]:
# Preview the first rows to confirm which columns were loaded.
ufo_df.head()

Unnamed: 0_level_0,datetime,geolocation,season,month,day,time_of_day,region,shape,duration,report_text
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
073/S73909,2010-01-01 00:00:00,"Park City, KY",Winter,January,Friday,Night,East South Central,Light,3.0,"Lights orbiting the moon,I am 10,and i was wit..."
073/S73915,2010-01-01 00:00:00,"La Mesa, CA",Winter,January,Friday,Night,Pacific,Light,600.0,Three red lights over southern California that...
078/S78231,2010-01-01 00:00:00,"Benton, AR",Winter,January,Friday,Night,West South Central,Circle,300.0,4 bright green circles high in the sky going i...
073/S73918,2010-01-01 00:00:00,"El Cajon, CA",Winter,January,Friday,Night,Pacific,Triangle,720.0,"3 Red objects hovering over El Cajon CA ,Exit..."
073/S73916,2010-01-01 00:00:00,"Lemon Grove, CA",Winter,January,Friday,Night,Pacific,Light,900.0,3 Red lights in line pattern above El Cajon/ E...


# Vectorize Report Text
Create a sparse matrix of TF-IDF term weights for each document (report).

In [6]:
# text data: the free-text `report_text` column, one entry per report;
# this is the input passed to the vectorizer's fit_transform in the fitting cell
documents = ufo_df['report_text']

In [10]:
# NOTE: fitting step, currently disabled. It writes model_output/tfidf.pkl and
# model_output/v_array.pkl, which the loading cell reads back; uncomment and
# run it once if those files do not exist yet.
# # create instance of TFIDF vectorizer
# tf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.99, min_df=0.01, ngram_range=(1, 3))
# # transform documents into document-term matrix
# X = tf_vectorizer.fit_transform(documents)
# # save vectorizer
# with open('model_output/tfidf.pkl', 'wb') as outfile:
#     pickle.dump(tf_vectorizer, outfile)
# # save array
# with open('model_output/v_array.pkl', 'wb') as outfile:
#     pickle.dump(X, outfile)

In [12]:
# Restore the fitted TF-IDF vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that were produced by this project.
with open('model_output/tfidf.pkl', 'rb') as vectorizer_file:
    tf_vectorizer = pickle.load(vectorizer_file)

# Restore the vectorized array that was saved next to the vectorizer.
with open('model_output/v_array.pkl', 'rb') as matrix_file:
    X = pickle.load(matrix_file)

In [11]:
# create dataframe of document-term matrix (dense copy, one column per term)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the loaded vectorizer provides it.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    term_names = tf_vectorizer.get_feature_names_out()
else:
    term_names = tf_vectorizer.get_feature_names()
# Pass the names as a flat sequence: wrapping them in an extra list makes
# pandas build MultiIndex columns instead of a plain Index of term strings.
tf_df = pd.DataFrame(X.toarray(), columns=term_names)
tf_df.head()

Unnamed: 0,00,00 pm,000,000 feet,10,10 15,10 minutes,10 seconds,100,1000,...,yards,year,year old,years,yellow,yellowish,youtube,zig,zoom,zoomed
0,0.0,0.0,0.0,0.0,0.096782,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.08439,0.0,0.137015,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.073019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.081632,0.0,0.132537,0.0,0.0,0.0,...,0.0,0.128908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Topic Modeling
Generate topics

In [13]:
# function to print top words of topic model
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row is treated as one
        topic's term weights and must support ``argsort()``.
    feature_names : sequence
        Term labels, indexed by the positions that ``argsort()`` returns.
    n_top_words : int
        How many of the largest-weight terms to list for each topic.

    Returns
    -------
    None
        Output goes to stdout: one "Topic #i: ..." line per topic, each
        followed by a 70-character "=" rule.
    """
    divider = "=" * 70
    for topic_no, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries
        strongest = weights.argsort()[::-1][:n_top_words]
        terms = "; ".join(feature_names[pos] for pos in strongest)
        print("\nTopic #{}: ".format(topic_no) + terms)
        print(divider)

In [14]:
# NOTE: model-fitting step, currently disabled. It writes model_output/nmf.pkl,
# which the loading cell reads back; uncomment and run it once if that file
# does not exist yet.
# # create instance of model, input number of topics to output
# nmf = NMF(n_components=10, random_state=0)
# # run model
# nmf.fit(tf_df)
# # save model
# with open('model_output/nmf.pkl', 'wb') as outfile:
#     pickle.dump(nmf, outfile)

In [12]:
# Restore the fitted NMF topic model from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files that were produced by this project.
with open('model_output/nmf.pkl', 'rb') as model_file:
    nmf = pickle.load(model_file)

In [15]:
# create array of topic weights per document by projecting the document-term
# frame onto the loaded NMF model (rows are later labelled with ufo_df.index)
topic_array = nmf.transform(tf_df)

In [16]:
# display top words for each topic
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older versions.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    top_word_labels = tf_vectorizer.get_feature_names_out()
else:
    top_word_labels = tf_vectorizer.get_feature_names()
print_top_words(nmf, top_word_labels, 20)


Topic #0: object; shaped; shaped object; appeared; object appeared; sky; shape; object sky; object moving; white; observed; large; object moved; circular; approximately; saw object; object seen; stationary; noticed; did

Topic #1: provides; information; anonymous; elects; contact information; elects remain; anonymous provides; remain; totally anonymous; remain totally; remain totally anonymous; elects remain totally; totally anonymous provides; witness elects; contact; witness elects remain; information pd; note witness elects; provides contact; provides contact information

Topic #2: lights; formation; triangle; white lights; lights sky; orange lights; lights moving; bright lights; white; red lights; blinking; line; shape; flashing; lights appeared; triangular; sky; moving; saw lights; flashing lights

Topic #3: light; bright; white; bright light; white light; sky; bright white; star; bright white light; light sky; flash; disappeared; moving; light moving; seconds; saw bright; bright

## Greatest Topic

In [17]:
# Hand-assigned label for each topic, keyed by topic index.
# The list position is the key, so the order here must follow the topic numbers.
topic_dict = dict(enumerate([
    'Object Description',         # 0
    'NUFORC Notes 1',             # 1
    'Light Description 1',        # 2
    'Light Description 2',        # 3
    'Fiery Lights',               # 4
    'General Observation Terms',  # 5
    'NUFORC Notes 2',             # 6
    'Craft Description',          # 7
    'Colors',                     # 8
    'Direction',                  # 9
]))

In [18]:
# Build the topic frame: one row per report, one labelled column per topic,
# with each weight expressed as a whole-number percentage of its row total.
# NOTE(review): a report whose weights sum to 0 divides 0/0 here and ends up
# as an all-NaN row.
topic_df = (
    pd.DataFrame(topic_array, index=ufo_df.index, columns=list(topic_dict.values()))
    .pipe(lambda weights: weights.div(weights.sum(axis=1), axis=0))
    .mul(100)
    .round(0)
)
topic_df.head()

Unnamed: 0_level_0,Object Description,NUFORC Notes 1,Light Description 1,Light Description 2,Fiery Lights,General Observation Terms,NUFORC Notes 2,Craft Description,Colors,Direction
report_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
073/S73909,5.0,0.0,23.0,13.0,6.0,48.0,1.0,0.0,4.0,0.0
073/S73915,0.0,0.0,50.0,0.0,3.0,9.0,0.0,1.0,23.0,14.0
078/S78231,0.0,0.0,0.0,45.0,0.0,22.0,0.0,0.0,33.0,0.0
073/S73918,0.0,0.0,4.0,0.0,0.0,11.0,0.0,0.0,40.0,44.0
073/S73916,0.0,0.0,23.0,16.0,3.0,18.0,0.0,0.0,23.0,16.0


In [20]:
# assign dominant topic to report observation
# Pick the winner from the unrounded topic weights: topic_df holds whole-number
# percentages, so near-equal topics tie after rounding and idxmax then returns
# whichever tied column comes first rather than the heavier topic.
# (Dividing a row by its positive total does not change which column is
# largest, so the raw weights give the same ranking as the percentages.)
raw_weights = pd.DataFrame(topic_array, columns=topic_df.columns, index=topic_df.index)
# Reports with no topic weight at all stay missing (NaN), exactly as they are
# in topic_df, so the missing-value count and fill step below still apply.
ufo_tops = raw_weights.idxmax(axis=1).where(raw_weights.sum(axis=1) != 0)

In [21]:
# number of reports for which no dominant topic was assigned (NaN in ufo_tops)
ufo_tops.isna().sum()

1

In [22]:
# Give reports without a dominant topic the most frequent label overall.
most_common_topic = ufo_tops.mode().iloc[0]
ufo_tops = ufo_tops.fillna(most_common_topic)

In [23]:
# export topic series (disabled; uncomment to write data/ufo_tops.csv)
# ufo_tops.to_csv('data/ufo_tops.csv', header=False)