# Feature Engineering
- Creation of new features from existing ones.

In [3]:
# Load the hiking-trail records from a JSON file into a DataFrame.
# The path is relative, so it is resolved against the notebook's working directory.
import pandas as pd
hiking = pd.read_json('datasets/hiking.json')
# Preview the first rows to see which columns were loaded
hiking.head()

Unnamed: 0,Prop_ID,Name,Location,Park_Name,Length,Difficulty,Other_Details,Accessible,Limited_Access,lat,lon
0,B057,Salt Marsh Nature Trail,"Enter behind the Salt Marsh Nature Center, loc...",Marine Park,0.8 miles,,<p>The first half of this mile-long trail foll...,Y,N,,
1,B073,Lullwater,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,1.0 mile,Easy,Explore the Lullwater to see how nature thrive...,N,N,,
2,B073,Midwood,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.75 miles,Easy,Step back in time with a walk through Brooklyn...,N,N,,
3,B073,Peninsula,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Discover how the Peninsula has changed over th...,N,N,,
4,B073,Waterfall,Enter Park at Lincoln Road and Ocean Avenue en...,Prospect Park,0.5 miles,Easy,Trace the source of the Lake on the Waterfall ...,N,N,,


# Encoding categorical variables

## Encoding categorical variables - binary

In [4]:
from sklearn.preprocessing import LabelEncoder

# Set up the LabelEncoder object (it assigns one integer code per distinct value)
enc = LabelEncoder()

# Fit the encoder on the "Accessible" column and store the integer codes in a
# new column, leaving the original column untouched.
# In the preview printed below, N is encoded as 0 and Y as 1.
hiking['Accessible_enc'] = enc.fit_transform(hiking['Accessible'])

# Compare the original and the encoded column side by side
print(hiking[['Accessible', 'Accessible_enc']].head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


## Encoding categorical variables - one-hot

In [5]:
# List the distinct park names; each one becomes its own column when one-hot encoded
hiking['Park_Name'].unique()

array(['Marine Park', 'Prospect Park', 'Alley Pond Park', 'Forest Park',
       'Cunningham Park', 'La Tourette Parks & Golf Course',
       'William T. Davis Wildlife Refuge', 'Willowbrook Park',
       'High Rock Park', 'Deere Park', 'Van Cortlandt Park',
       'Arden Woods', 'Wolfes Pond Park', 'Long Pond Park',
       'Clove Lakes Park', 'Pelham Bay Park', 'Inwood Hill Park',
       'Bronx Park', 'Conference House Park'], dtype=object)

In [6]:
# Preview the first ten park names before encoding them
hiking['Park_Name'].head(10)

0        Marine Park
1      Prospect Park
2      Prospect Park
3      Prospect Park
4      Prospect Park
5    Alley Pond Park
6        Forest Park
7        Forest Park
8        Forest Park
9    Cunningham Park
Name: Park_Name, dtype: object

In [7]:
# One-hot encode the Park_Name column: one indicator column per distinct park name
category_enc = pd.get_dummies(hiking['Park_Name'])

# Take a look at the encoded columns
category_enc.head(10)

Unnamed: 0,Alley Pond Park,Arden Woods,Bronx Park,Clove Lakes Park,Conference House Park,Cunningham Park,Deere Park,Forest Park,High Rock Park,Inwood Hill Park,La Tourette Parks & Golf Course,Long Pond Park,Marine Park,Pelham Bay Park,Prospect Park,Van Cortlandt Park,William T. Davis Wildlife Refuge,Willowbrook Park,Wolfes Pond Park
0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


# Engineering numerical features

In [8]:
# Load the volunteer-opportunities records from CSV (path is relative to the working directory)
volunteer = pd.read_csv('datasets/volunteer_opportunities.csv')
# Preview the first rows
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,amsl,amsl_unit,org_title,org_content_id,addresses_count,locality,region,postalcode,primary_loc,display_url,recurrence_type,hours,created_date,last_modified_date,start_date_date,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,,,Center For NYC Neighborhoods,4426,1,,NY,,,/opportunities/4996,onetime,0,January 13 2011,June 23 2011,July 30 2011,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,,,Bpeace,37026,1,"5 22nd St\nNew York, NY 10010\n(40.74053152272...",NY,10010.0,,/opportunities/5008,onetime,0,January 14 2011,January 25 2011,February 01 2011,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,,,Street Project,3001,1,,NY,10026.0,,/opportunities/5016,onetime,0,January 19 2011,January 21 2011,January 29 2011,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,,,Oxfam America,2170,1,,NY,2114.0,,/opportunities/5022,ongoing,0,January 21 2011,January 25 2011,February 14 2011,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,,,Office of Recycling Outreach and Education,36773,1,,NY,10455.0,,/opportunities/5055,onetime,0,January 28 2011,February 01 2011,February 05 2011,February 05 2011,approved,,,,,,,,


In [9]:
# Parse the text dates and derive the month number in a single chained step.
# assign() evaluates its keyword arguments in order, so the second lambda can
# read the column produced by the first.
volunteer = volunteer.assign(
    start_date_converted=lambda frame: pd.to_datetime(frame["start_date_date"]),
    start_date_month=lambda frame: frame["start_date_converted"].dt.month,
)

# Show the parsed dates next to the extracted month numbers
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


# Engineering text features

In [10]:
# Using regular expressions: code to identify patterns to extract information from strings
import re

my_string = "temperature: 75.6 F"

# Extract numerical data from the string.
# The pattern is written as a raw string so that \d and \. reach the regex
# engine unchanged; in an ordinary string literal they are invalid escape
# sequences and Python emits a warning for them.
temp = re.search(r"\d+\.\d+", my_string)

# The match object shows where the number was found; group(0) is the matched text
print(temp)
print(float(temp.group(0)))

<re.Match object; span=(13, 17), match='75.6'>
75.6


The pattern:
- `\d+` : matches one or more digits
- `\.` : matches a literal decimal point (the backslash stops `.` from meaning "any character")

## Extracting string patterns

In [11]:
# Count the missing values in each column of the hiking data
hiking.isna().sum()

Prop_ID            0
Name               0
Location           0
Park_Name          0
Length             4
Difficulty         6
Other_Details      2
Accessible         0
Limited_Access     0
lat               33
lon               33
Accessible_enc     0
dtype: int64

In [17]:
# Remove the rows whose Length is missing; the mileage extraction that follows
# hands every Length value to re.search, which needs a string.
# NOTE(review): inplace=True changes `hiking` for every later cell — rerun the
# load cell first if the full data set is needed again.
hiking.dropna(subset=['Length'],inplace=True)

In [18]:
import re

# Write a pattern to extract numbers and decimals
def return_mileage(length):
    """Return the first decimal number found in a trail-length string.

    Parameters
    ----------
    length : str
        Text such as "0.75 miles". Any non-string value (for example the
        float NaN that stands in for a missing entry) is treated as
        "no mileage available".

    Returns
    -------
    float or None
        The first run of digits, a decimal point and more digits, converted
        to float. None when the text contains no such run; note that a whole
        number written without a decimal point (e.g. "2 miles") is not matched.
    """
    # re.search raises TypeError on non-string input, so bail out early
    # instead of forcing callers to drop missing rows beforehand.
    if not isinstance(length, str):
        return None

    # Raw string: \d and \. must reach the regex engine unchanged
    mile = re.search(r'\d+\.\d+', length)

    # No decimal number in the text
    if mile is None:
        return None

    # group(0) is the full matched text
    return float(mile.group(0))
        
# Run return_mileage on every Length value and keep the result in a new
# Length_num column, then print the text and the parsed number side by side
hiking["Length_num"] = hiking['Length'].apply(return_mileage)
print(hiking[["Length", "Length_num"]].head())

       Length  Length_num
0   0.8 miles        0.80
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50


## Text classification using Tf/Idf vectors

In [25]:
# Remove the rows without a category_desc: that column is the prediction target
# in the classification cell further down.
# NOTE(review): inplace=True changes `volunteer` for every later cell.
volunteer.dropna(subset=['category_desc'],inplace=True)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Take the title text
title_text = volunteer['title']

# Create the vectorizer object (default settings)
tfidf_vec = TfidfVectorizer()

# Learn the vocabulary from the titles and transform them into tf-idf vectors;
# the result is a sparse matrix (see its summary in the next cell)
text_tfidf = tfidf_vec.fit_transform(title_text)

In [29]:
# Display the matrix summary: shape, dtype and number of stored entries
text_tfidf

<617x1089 sparse matrix of type '<class 'numpy.float64'>'
	with 3172 stored elements in Compressed Sparse Row format>

Now that you've encoded the volunteer dataset's title column into tf/idf vectors, you'll use those vectors to predict the category_desc column.

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()

# Split the dataset according to the class distribution of category_desc:
# stratify=y keeps the category proportions equal in both parts and
# random_state=42 makes the split repeatable.
y = volunteer["category_desc"]
# toarray() converts the sparse tf-idf matrix to a dense array before it is split.
# NOTE(review): text_tfidf was fitted on all titles before this split, so the
# vocabulary and idf weights have already seen the test rows — consider fitting
# the vectorizer on the training titles only.
X_train, X_test, y_train, y_test = train_test_split(text_tfidf.toarray(), y, stratify=y, random_state=42)

# Fit the model to the training data
nb.fit(X_train, y_train)

# Print out the model's accuracy (mean accuracy on the held-out test part)
print(nb.score(X_test, y_test))

0.5161290322580645
