In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import warnings 

warnings.filterwarnings('ignore')

# Introduction

Data preprocessing takes place after exploratory data analysis and cleaning

We preprocess the data to: 
- transform the dataset so its suitable for modeling
  AND
- to improve model performance

## 1. Importing data

In [22]:
df = pd.read_csv('../data/volunteer_opportunities.csv')

## 2. Inspecting

In [23]:
df.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

In [26]:
df.dtypes

opportunity_id          int64
content_id              int64
vol_requests            int64
event_time              int64
title                  object
hits                    int64
summary                object
is_priority            object
category_id           float64
category_desc          object
amsl                  float64
amsl_unit             float64
org_title              object
org_content_id          int64
addresses_count         int64
locality               object
region                 object
postalcode            float64
primary_loc           float64
display_url            object
recurrence_type        object
hours                   int64
created_date           object
last_modified_date     object
start_date_date        object
end_date_date          object
status                 object
Latitude              float64
Longitude             float64
Community Board       float64
Community Council     float64
Census Tract          float64
BIN                   float64
BBL       

In [27]:
df.describe()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,hits,category_id,amsl,amsl_unit,org_content_id,addresses_count,...,primary_loc,hours,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
count,665.0,665.0,665.0,665.0,665.0,617.0,0.0,0.0,665.0,665.0,...,0.0,665.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,5374.454135,42790.643609,78.778947,0.0,345.409023,2.105348,,,20752.207519,1.046617,...,,0.027068,,,,,,,,
std,234.322154,5491.720274,569.763773,0.0,530.716526,1.412003,,,19143.034346,0.537109,...,,0.519952,,,,,,,,
min,4952.0,36697.0,1.0,0.0,0.0,1.0,,,24.0,1.0,...,,0.0,,,,,,,,
25%,5175.0,38414.0,3.0,0.0,102.0,1.0,,,2138.0,1.0,...,,0.0,,,,,,,,
50%,5377.0,40222.0,12.0,0.0,204.0,2.0,,,4442.0,1.0,...,,0.0,,,,,,,,
75%,5580.0,49308.0,30.0,0.0,374.0,3.0,,,39189.0,1.0,...,,0.0,,,,,,,,
max,5782.0,52894.0,9999.0,0.0,4662.0,6.0,,,52858.0,12.0,...,,12.0,,,,,,,,


## 3. Remove Missing Data

In [33]:
df.isna().sum()

opportunity_id          0
content_id              0
vol_requests            0
event_time              0
title                   0
hits                    0
summary                 0
is_priority           603
category_id            48
category_desc          48
amsl                  665
amsl_unit             665
org_title               0
org_content_id          0
addresses_count         0
locality               70
region                  0
postalcode              6
primary_loc           665
display_url             0
recurrence_type         0
hours                   0
created_date            0
last_modified_date      0
start_date_date         0
end_date_date           0
status                  0
Latitude              665
Longitude             665
Community Board       665
Community Council     665
Census Tract          665
BIN                   665
BBL                   665
NTA                   665
dtype: int64

- *df.dropna()* -> if only a few rows contain missing data
- *df.drop([1,2,3])* -> drops specific rows
- *df.drop('column_name', axis=1)* -> drops columns
- *df.dropna(subset=['column_name'])* -> drops rows where column_name is empty
- *df.dropna(thresh=2)* -> drop columns with 2 or more missing values

## 4. Typing

Pandas infer data types, sometimes incorrectly.

The *.info()* method shows the datatype of each column as well as *.dtypes*

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

In [35]:
df.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [36]:
df_toy = pd.DataFrame({'A': ['1.0', '2.0']})
df_toy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   A       2 non-null      object
dtypes: object(1)
memory usage: 148.0+ bytes


In [37]:
df_toy['A'] = df_toy['A'].astype('float')

In [38]:
df_toy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       2 non-null      float64
dtypes: float64(1)
memory usage: 148.0 bytes


In [39]:
df_toy.dtypes

A    float64
dtype: object

## 5. Training and test split

Splitting the dataset into training and test helps: 
- reducing overfitting
- evaluate performance on a holdout set

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, df.category_desc, test_size=0.2, random_state=42)

**Stratified sampling** helps keeping all the classes represented in the target test dataset when it is very imbalanced.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(df, df.category_desc, test_size=0.2, random_state=42, stratify=df.category_desc)

ValueError: Input contains NaN

## 6. Standardization

**Standardization** is the process to transform **continuous** data to appear **normally distributed**

Many of the sklearn models assume normally distributed data. Using non-normal data could bias the models.

Standardization is required: 
- When we are using a model in linera space (like KNN, linear regression or KMeans)
- When the dataset features have high variance
- Features are on different scales (for instance number of bedrooms vs price)

## 6.1. Log Normalization

Useful for features with high variance

Applies logarithm transformation

Natural log using the constant $e$




In [47]:
c
df['logs'] = np.log(df['values'])

df

Unnamed: 0,values,logs
0,0,-inf
1,1,0.0
2,10,2.302585
3,100,4.60517
4,1000,6.907755
5,10000,9.21034


Captures relative changes, the magnitude of change, and keeps everything positive.

## 6.2. Scaling 

Features on different scales

Model with linear characteristics

Center features around 0 and transform variance to 1

Transforms to approximately normal distribution


In [53]:
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({
    'col1': [0.1, 0.2, 0.3],
    'col2': [10, 5.2, 8.3],
    'col3': [120, 100.2, 89.3]
})

scaler = StandardScaler()

df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

df_scaled

Unnamed: 0,col1,col2,col3
0,-1.224745,1.090322,1.32467
1,-3.39935e-16,-1.32516,-0.233457
2,1.224745,0.234839,-1.091213


In [54]:
np.var(df_scaled)

col1    1.0
col2    1.0
col3    1.0
dtype: float64

## 6. Standardized data and modeling

Its important to split the data before preprocessing the data, otherwise there would be a **data leakage** and the test data could have been showed somehow to the model.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

knn.fit(X_train_scaled, y_train)
knn.score(X_test_scaled, y_test)

## 7. Feature Engineering

**Feature engineering** is the creation of new features from existing ones

It adds information to the dataset that can improve the performance of the model or add insight into relationships between features

Before doing feature engineering we must understand our data first.

Feature engineering is highly dependent on the dataset we have at hand

Typical feature engineering scenarios are extracting features from free text, or parsing strings containing dates.

## 8. Encoding categorical variables

Sklearn models requires numerical input only. If there is any categorical data, it has to be encoded.

### 8.2 Encoding variables with 2 different values

In [59]:
df = pd.DataFrame({
    'user': [1,2,3, 4],
    'subscribed': ['y','y','n', 'y'],
    'fav_color': ['yellow', 'orange', 'orange', 'green']
})

In [56]:
df.subscribed

0    y
1    y
2    n
Name: subscribed, dtype: object

In [57]:
# Pandas way
df.subscribed.apply(lambda x: 1 if x=='y' else 0)

0    1
1    1
2    0
Name: subscribed, dtype: int64

In [58]:
# SciKit Learn way 

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit_transform(df.subscribed)

array([1, 1, 0])

### 8.2 One Hot Encoding

Applies when the variable has more than 2 different values.



In [60]:
pd.get_dummies(df.fav_color)

Unnamed: 0,green,orange,yellow
0,False,False,True
1,False,True,False
2,False,True,False
3,True,False,False


# 9. Engineering Numerical Features

Examples of reducing dimensionality: Means or medians of several variables, extracting month or week from dates


In [63]:
volunteer = pd.read_csv('../data/volunteer_opportunities.csv')
volunteer.head()

Unnamed: 0,opportunity_id,content_id,vol_requests,event_time,title,hits,summary,is_priority,category_id,category_desc,...,end_date_date,status,Latitude,Longitude,Community Board,Community Council,Census Tract,BIN,BBL,NTA
0,4996,37004,50,0,Volunteers Needed For Rise Up & Stay Put! Home...,737,Building on successful events last summer and ...,,,,...,July 30 2011,approved,,,,,,,,
1,5008,37036,2,0,Web designer,22,Build a website for an Afghan business,,1.0,Strengthening Communities,...,February 01 2011,approved,,,,,,,,
2,5016,37143,20,0,Urban Adventures - Ice Skating at Lasker Rink,62,Please join us and the students from Mott Hall...,,1.0,Strengthening Communities,...,January 29 2011,approved,,,,,,,,
3,5022,37237,500,0,Fight global hunger and support women farmers ...,14,The Oxfam Action Corps is a group of dedicated...,,1.0,Strengthening Communities,...,March 31 2012,approved,,,,,,,,
4,5055,37425,15,0,Stop 'N' Swap,31,Stop 'N' Swap reduces NYC's waste by finding n...,,4.0,Environment,...,February 05 2011,approved,,,,,,,,


In [66]:
volunteer.columns

Index(['opportunity_id', 'content_id', 'vol_requests', 'event_time', 'title',
       'hits', 'summary', 'is_priority', 'category_id', 'category_desc',
       'amsl', 'amsl_unit', 'org_title', 'org_content_id', 'addresses_count',
       'locality', 'region', 'postalcode', 'primary_loc', 'display_url',
       'recurrence_type', 'hours', 'created_date', 'last_modified_date',
       'start_date_date', 'end_date_date', 'status', 'Latitude', 'Longitude',
       'Community Board', 'Community Council ', 'Census Tract', 'BIN', 'BBL',
       'NTA'],
      dtype='object')

In [73]:
volunteer["start_date_converted"] = pd.to_datetime(volunteer["start_date_date"])
volunteer["start_date_month"] = volunteer['start_date_converted'].dt.month

In [74]:
print(volunteer[['start_date_converted', 'start_date_month']].head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


# 10. Engineering Text Features

## 10.1. Extraction

Regular expressions is code that identify patterns


In [78]:
import re

my_string = 'temperature:75.6 F'
temp = re.search('\d+\.\d+', my_string)

temp

<re.Match object; span=(12, 16), match='75.6'>

If we are working with text it could be helpful to model it in some way.

**TF/IDF** (Term Frequency/Inverse Document Frequency) Vectorizes words based upon importance



In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

volunteer.summary

0      Building on successful events last summer and ...
1                 Build a website for an Afghan business
2      Please join us and the students from Mott Hall...
3      The Oxfam Action Corps is a group of dedicated...
4      Stop 'N' Swap reduces NYC's waste by finding n...
                             ...                        
660    Volunteers needed to file for fair hearings, d...
661    Come out to the South Bronx to help us hold ou...
662    Volunteer needed to translate written material...
663    World Cares Center is looking for individuals ...
664    Attention all filmmakers, producers, and edito...
Name: summary, Length: 665, dtype: object

In [84]:
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(volunteer.summary)

In [88]:
text_tfidf

<665x3242 sparse matrix of type '<class 'numpy.float64'>'
	with 15600 stored elements in Compressed Sparse Row format>