# Dropping missing data

In [3]:
import pandas as pd
# Load the volunteer-opportunities CSV from the local datasets folder
volunteer = pd.read_csv('datasets/volunteer_opportunities.csv')

# Per-column count of missing values
volunteer.isnull().sum()

opportunity_id          0
content_id              0
vol_requests            0
event_time              0
title                   0
hits                    0
summary                 0
is_priority           603
category_id            48
category_desc          48
amsl                  665
amsl_unit             665
org_title               0
org_content_id          0
addresses_count         0
locality               70
region                  0
postalcode              6
primary_loc           665
display_url             0
recurrence_type         0
hours                   0
created_date            0
last_modified_date      0
start_date_date         0
end_date_date           0
status                  0
Latitude              665
Longitude             665
Community Board       665
Community Council     665
Census Tract          665
BIN                   665
BBL                   665
NTA                   665
dtype: int64

In [4]:
# Remove the two coordinate columns; `volunteer` itself is left untouched
volunteer_cols = volunteer.drop(columns=['Latitude', 'Longitude'])

# Keep only the rows that actually have a category_desc value
volunteer_subset = volunteer_cols[volunteer_cols['category_desc'].notna()]

# Report (rows, columns) of the filtered frame
print(volunteer_subset.shape)

(617, 33)


# Working with data types

In [5]:
# Summarise each column's non-null count and dtype
volunteer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 665 entries, 0 to 664
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   opportunity_id      665 non-null    int64  
 1   content_id          665 non-null    int64  
 2   vol_requests        665 non-null    int64  
 3   event_time          665 non-null    int64  
 4   title               665 non-null    object 
 5   hits                665 non-null    int64  
 6   summary             665 non-null    object 
 7   is_priority         62 non-null     object 
 8   category_id         617 non-null    float64
 9   category_desc       617 non-null    object 
 10  amsl                0 non-null      float64
 11  amsl_unit           0 non-null      float64
 12  org_title           665 non-null    object 
 13  org_content_id      665 non-null    int64  
 14  addresses_count     665 non-null    int64  
 15  locality            595 non-null    object 
 16  region  

# Train Test split

In [17]:
# Drop the columns that are entirely or mostly missing: is_priority has only
# 62 non-null values out of 665, and the other columns listed have none.
#
# `volunteer` is rebound to the result instead of being modified with
# inplace=True, and errors='ignore' skips labels that are already gone, so
# this cell can be re-run without raising a KeyError.
# Trade-off: a misspelled label is skipped silently as well, so check the
# isna() summary below after editing the list.
#
# 'Community Council ' is written with its trailing space on purpose; the
# label is kept exactly as it was originally spelled.
volunteer = volunteer.drop(
    columns=['is_priority', 'amsl', 'amsl_unit', 'primary_loc',
             'Latitude', 'Longitude', 'Community Board', 'Community Council ',
             'Census Tract', 'BIN', 'BBL', 'NTA'],
    errors='ignore',
)

# Confirm which of the remaining columns still contain missing values
volunteer.isna().sum()

opportunity_id         0
content_id             0
vol_requests           0
event_time             0
title                  0
hits                   0
summary                0
category_id           48
category_desc         48
org_title              0
org_content_id         0
addresses_count        0
locality              70
region                 0
postalcode             6
display_url            0
recurrence_type        0
hours                  0
created_date           0
last_modified_date     0
start_date_date        0
end_date_date          0
status                 0
dtype: int64

In [20]:
# Discard every row that is missing a category id, a category description
# or a locality (modifies `volunteer` in place)
volunteer.dropna(
    axis=0,
    subset=['category_id', 'category_desc', 'locality'],
    inplace=True,
)

# Re-check the per-column missing counts
volunteer.isnull().sum()

opportunity_id        0
content_id            0
vol_requests          0
event_time            0
title                 0
hits                  0
summary               0
category_id           0
category_desc         0
org_title             0
org_content_id        0
addresses_count       0
locality              0
region                0
postalcode            4
display_url           0
recurrence_type       0
hours                 0
created_date          0
last_modified_date    0
start_date_date       0
end_date_date         0
status                0
dtype: int64

In [21]:
# Number of rows per category_desc value, most frequent first
volunteer['category_desc'].value_counts()

Strengthening Communities    285
Helping Neighbors in Need    110
Education                     79
Health                        43
Environment                   22
Emergency Preparedness        13
Name: category_desc, dtype: int64

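The counts above show a strong **class imbalance**: the largest category has 285 rows while the smallest has only 13. A purely random split could leave the rare categories under-represented in one of the sets, so the train/test split should be stratified on `category_desc`.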

In [23]:
# Stratified Sampling

from sklearn.model_selection import train_test_split

# Features: every column other than the category_desc label
X = volunteer.drop(columns='category_desc')

# Labels: kept as a one-column DataFrame
y = volunteer[['category_desc']]

# Stratify on the labels so both splits keep the class proportions;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Class counts in the training labels
print(y_train['category_desc'].value_counts())

# Class counts in the test labels
print(y_test['category_desc'].value_counts())

Strengthening Communities    214
Helping Neighbors in Need     83
Education                     59
Health                        32
Environment                   16
Emergency Preparedness        10
Name: category_desc, dtype: int64
Strengthening Communities    71
Helping Neighbors in Need    27
Education                    20
Health                       11
Environment                   6
Emergency Preparedness        3
Name: category_desc, dtype: int64


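Because the split is stratified, the training and test sets both keep roughly the same class proportions as the full dataset. For example, the 285 "Strengthening Communities" rows are divided 214 / 71, and the 13 "Emergency Preparedness" rows 10 / 3.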