In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# set plotting params
plt.rcParams['font.serif'] = 'Ubuntu'
plt.rcParams['font.monospace'] = 'Ubuntu Mono'
plt.rcParams['font.size'] = 18
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['legend.fontsize'] = 18
plt.rcParams['figure.titlesize'] = 24 

In [3]:
from src.Clean import CleanUFOs

In [4]:
cufo = CleanUFOs('data/ufodata.json')

In [5]:
df = cufo.to_pandas()

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   occured      98 non-null     datetime64[ns]
 1   reported     98 non-null     datetime64[ns]
 2   city         98 non-null     object        
 3   state        98 non-null     object        
 4   shape        98 non-null     object        
 5   duration     98 non-null     int64         
 6   description  98 non-null     object        
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 5.5+ KB


In [7]:
# df['duration'].apply(lambda x: adjust_duration(x))

In [8]:
df['duration'].unique()

array([  600,     1,     3,    30,     0,    60,    15,     2,   300,
          -1,    20,   180,    10,  1200,   240,     4,     5, 14400,
        1800,     7,  7200,  2700,  3600,    45,   840,   120,   -60,
         420])

In [9]:
df.head(5)

Unnamed: 0,occured,reported,city,state,shape,duration,description
0,2017-05-06 05:00:00,2017-05-06 04:10:00,Camp McGregor,NM,Light,600,Light seen over mountain's east of Camp McGreg...
1,2017-05-06 04:50:00,2017-05-06 05:00:00,Mojave (Canada),BC,Light,1,Light in sky stationary. Not a airplane or an...
2,2017-05-05 11:30:00,2017-05-05 12:18:00,Austin,TX,Disk,3,"Flying saucer descends, possibly lands in Nort..."
3,2017-05-05 03:00:00,2017-05-05 03:49:00,El Mirage,AZ,Circle,30,"While letting my dog out, a very bright white ..."
4,2017-05-04 23:34:00,2017-05-04 22:38:00,York,NE,Fireball,0,A fire ball was moving in the atmosphere while...


In [10]:
lst = df['description'].tolist()

In [11]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

def tokenize(text):
    tokenizer = nltk.RegexpTokenizer(r"\w+")

    tokens = tokenizer.tokenize(text)
#     tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf = TfidfVectorizer(sublinear_tf=True,tokenizer=tokenize, stop_words='english')
p = tfidf.fit_transform(df['description'].tolist())



In [16]:
test_df = pd.DataFrame(p.todense(),columns = features)
test_df.head(5)

Unnamed: 0,00,000,00am,01,03,05,07,09,1,10,...,year,yell,yellow,yellowish,ying,yoga,younger,yoyo,zip,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083118,0.0,0.055828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
response = tfidf.transform([df['description'][0]])
print( response )

  (0, 1614)	0.15127529523335134
  (0, 1607)	0.17968660841966547
  (0, 1593)	0.07966057254861508
  (0, 1574)	0.2679402080200819
  (0, 1401)	0.22381340821987367
  (0, 1382)	0.2096077516267166
  (0, 1309)	0.13966842866655682
  (0, 1280)	0.14877844520513928
  (0, 1256)	0.15941336136217188
  (0, 1218)	0.1336008382022203
  (0, 1082)	0.12135415202630022
  (0, 1006)	0.12135415202630022
  (0, 1003)	0.11675010912683295
  (0, 953)	0.2537345514269248
  (0, 907)	0.3120670078202901
  (0, 884)	0.10224903893981067
  (0, 870)	0.11385590550660521
  (0, 863)	0.14827491525675596
  (0, 761)	0.17582715313385436
  (0, 738)	0.24212768486013025
  (0, 731)	0.17968660841966547
  (0, 728)	0.2537345514269248
  (0, 530)	0.15658349480173225
  (0, 478)	0.14877844520513928
  (0, 383)	0.2679402080200819
  (0, 296)	0.3120670078202901
  (0, 266)	0.1263173018868688
  (0, 161)	0.13966842866655682


In [18]:
# tfidf.get_feature_names()
tfidf.vocabulary_

{'light': 863,
 'seen': 1280,
 'mountain': 953,
 's': 1256,
 'east': 530,
 'camp': 296,
 'mcgregor': 907,
 'hover': 761,
 'spot': 1382,
 'look': 884,
 'like': 870,
 'helicopt': 731,
 'wa': 1593,
 'way': 1607,
 'bright': 266,
 'went': 1614,
 'higher': 738,
 'height': 728,
 'stay': 1401,
 'disappear': 478,
 'nuforc': 1006,
 'note': 1003,
 'report': 1218,
 'appear': 161,
 'consist': 383,
 'sight': 1309,
 'venu': 1574,
 'pd': 1082,
 'sky': 1328,
 'stationari': 1400,
 'airplan': 121,
 'ani': 147,
 'known': 830,
 'star': 1392,
 'flicker': 631,
 'possibl': 1130,
 'sourc': 1361,
 'elect': 541,
 'remain': 1214,
 'anonym': 149,
 'provid': 1154,
 'contact': 384,
 'inform': 791,
 'citi': 337,
 'mojav': 943,
 'bc': 213,
 'canada': 298,
 'fli': 630,
 'saucer': 1263,
 'descend': 451,
 'land': 837,
 'north': 997,
 'austin': 195,
 'natur': 967,
 'trailat': 1509,
 'approxim': 165,
 '11': 13,
 '30': 48,
 'balconi': 206,
 'apart': 159,
 'saw': 1264,
 'object': 1010,
 'veri': 1575,
 'rapidli': 1180,
 'trav

In [19]:
features = tfidf.get_feature_names()

In [20]:
# for a in zip(features[0,:],response):
#     print(a)
print(response[:,])

  (0, 1614)	0.15127529523335134
  (0, 1607)	0.17968660841966547
  (0, 1593)	0.07966057254861508
  (0, 1574)	0.2679402080200819
  (0, 1401)	0.22381340821987367
  (0, 1382)	0.2096077516267166
  (0, 1309)	0.13966842866655682
  (0, 1280)	0.14877844520513928
  (0, 1256)	0.15941336136217188
  (0, 1218)	0.1336008382022203
  (0, 1082)	0.12135415202630022
  (0, 1006)	0.12135415202630022
  (0, 1003)	0.11675010912683295
  (0, 953)	0.2537345514269248
  (0, 907)	0.3120670078202901
  (0, 884)	0.10224903893981067
  (0, 870)	0.11385590550660521
  (0, 863)	0.14827491525675596
  (0, 761)	0.17582715313385436
  (0, 738)	0.24212768486013025
  (0, 731)	0.17968660841966547
  (0, 728)	0.2537345514269248
  (0, 530)	0.15658349480173225
  (0, 478)	0.14877844520513928
  (0, 383)	0.2679402080200819
  (0, 296)	0.3120670078202901
  (0, 266)	0.1263173018868688
  (0, 161)	0.13966842866655682
