In [62]:
import pandas as pd
import re
import numpy as np
from  sklearn.feature_extraction.text import TfidfVectorizer

In [63]:
ufo = pd.read_csv('ufo_sightings_large.csv')
ufo.dtypes

date               object
city               object
state              object
country            object
type               object
seconds           float64
length_of_time     object
desc               object
recorded           object
lat                object
long              float64
dtype: object

In [64]:
# Check how many values are missing in the length_of_time, state, and type columns
print(ufo[["length_of_time", "state", "type"]].isnull().sum())

# Keep only rows where length_of_time, state, and type are not null
ufo_no_missing = ufo[ufo["length_of_time"].notnull() & 
          ufo["state"].notnull() & 
          ufo["type"].notnull()]

# Print out the shape of the new dataset
print(ufo.shape)

length_of_time    143
state             419
type              159
dtype: int64
(4935, 11)


In [65]:
def return_minutes(time_string):

    # Use \d+ to grab digits
    pattern = re.compile(r"\d+")
    
    # Use match on the pattern and column
    num = re.match(pattern, time_string)
    if num is not None:
        return int(num.group(0))
        
ufo.length_of_time = ufo.length_of_time.astype(str)
# Apply the extraction to the length_of_time column
ufo["minutes"] = ufo_no_missing["length_of_time"].apply(lambda row: return_minutes(row))

# Take a look at the head of both of the columns
print(ufo[['minutes','length_of_time']].head())

   minutes   length_of_time
0      2.0          2 weeks
1     30.0           30sec.
2      NaN              nan
3      NaN  about 5 minutes
4      2.0                2


In [66]:
# Check the variance of the seconds and minutes columns
print(ufo[['seconds','minutes']].var())

# Log normalize the seconds column
ufo["seconds_log"] = np.log(ufo['seconds'])

# Print out the variance of just the seconds_log column
print(ufo.seconds_log.var())

seconds    3.156735e+10
minutes    9.470577e+02
dtype: float64
nan


  """


In [67]:
# Use Pandas to encode us values as 1 and others as 0
ufo["country_enc"] = ufo["country"].apply(lambda val: 1 if val == 'us' else 0 )

# Print the number of unique type values
print(len(ufo["type"] .unique()))

# Create a one-hot encoded set of the type values
type_set = pd.get_dummies(ufo["type"])

# Concatenate this set back to the ufo DataFrame
ufo = pd.concat([ufo, type_set], axis=1)

22


In [71]:
# Feature Engineering #
# Change date column
ufo.date = pd.to_datetime(ufo.date)

# Extract the month from the date column
ufo["month"] = ufo["date"].apply(lambda row: row.month)

# Extract the year from the date column
ufo["year"] = ufo["date"].apply(lambda row: row.year)

# Take a look at the head of all three columns
print(ufo[['year','month','date']].head())

   year  month                date
0  2011     11 2011-11-03 19:21:00
1  2004     10 2004-10-03 19:05:00
2  2009      9 2009-09-25 21:00:00
3  2002     11 2002-11-21 05:45:00
4  2010      8 2010-08-19 12:55:00


In [88]:
# Take a look at the head of the desc field
print(ufo.desc.head())

# Create the tfidf vectorizer object
vec = TfidfVectorizer()

#Change type desc
ufo.desc = ufo.desc.astype(str)
# Use vec's fit_transform method on the desc field
desc_tfidf = vec.fit_transform(ufo.desc)
# Look at the number of columns this creates
print(desc_tfidf.shape)

0    Red blinking objects similar to airplanes or s...
1                 Many fighter jets flying towards UFO
2    Green&#44 red&#44 and blue pulses of light tha...
3    It was a large&#44 triangular shaped flying ob...
4       A white spinning disc in the shape of an oval.
Name: desc, dtype: object
(4935, 6434)


In [91]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    filter_list = []
    for i in range(0, vector.shape[0]):
    
        # Here we'll call the function from the previous exercise, and extend the list we're creating
        filtered = return_weights(vocab, original_vocab, vector, i, top_n)
        filter_list.extend(filtered)
    # Return the list in a set, so we don't get duplicate word indices
    return set(filter_list)
# Check the correlation between the seconds, seconds_log, and minutes columns
print(ufo[["seconds", "seconds_log", "minutes"]].corr())

# Make a list of features to drop   
to_drop = ["city", "country", "date", "desc", "lat", "length_of_time", "long", "minutes", "recorded", "seconds", "state"]

# Drop those features
ufo_dropped = ufo.drop(to_drop, axis=1)

              seconds  seconds_log   minutes
seconds      1.000000     0.164613 -0.009932
seconds_log  0.164613     1.000000  0.111460
minutes     -0.009932     0.111460  1.000000
