# California Grants Dataset EDA

**EDA Using Python**
- understand data
    - many columns are not very useful
    - some that may be useful are self reported, having no consistency in formatting between rows (award period, estimated amounts.)
- clean data
- analyze variables

In [None]:
# import necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw grants data; the CSV path is relative to the working directory
grants_raw = pd.read_csv("ca_grants.csv")

In [None]:
# Preview the first rows of the raw data
grants_raw.head()

In [None]:
# Column names, non-null counts and dtypes of the raw data.
# info() writes its report directly and returns None, so it is not wrapped in
# print() (which would add a stray "None" line after the report).
grants_raw.info()

In [None]:
# Store the grant ID as object dtype so it is handled as a label rather than
# as a number. Note: this is the generic 'object' dtype, not pandas' dedicated
# 'category' dtype.
grants_raw['PortalID'] = grants_raw['PortalID'].astype('object')

Removal of unnecessary columns
- columns with excessive missing values
- redundant information
- information that cannot realistically be useful or analyzed

In [None]:
# Positions (0-based) of the raw columns that are discarded
dropped_positions = [1,4,10,14,15,17,19,24,27,29,31,32,33,34,35]
grants = grants_raw.drop(columns = grants_raw.columns[dropped_positions])
grants.head()

In [None]:
# Columns we are left with.
# info() prints the report itself and returns None, so no print() wrapper.
grants.info()

In [None]:
# Frequency of each agency, then the distinct agency names.
# NOTE(review): if this runs as a notebook cell, only the value of the last
# expression is echoed, so the value_counts() result would not be shown.
grants.AgencyDept.value_counts()
grants.AgencyDept.unique()

In [None]:

# Summary of the object-dtype columns only
grants.describe(include=["object"]) #no duplicate rows

Converting the columns 'EstAwards', 'EstAmounts', and 'EstAvailFunds' into numeric variables. Unique values reveal that the entries for 'EstAwards' and 'EstAmounts' are formatted consistently.
As many entries contain a range of values, these two columns were each split into 2, reflecting their maximum and minimum values. Undeclared entries were replaced with a missing value (NaN).

In [None]:
# Inspect the distinct raw values to see which text formats need parsing
grants['EstAwards'].unique()

In [None]:
def _parse_estimate(entry):
    """Return ``(minimum, maximum)`` parsed from one estimate entry.

    An entry whose first character is 'E' carries a single figure, which is
    used for both bounds. An entry whose first character is 'B' is split
    around its last 'a' (presumably the "and" joining the two figures): the
    digits before it give the minimum, the digits after it the maximum.
    Anything else -- non-string values such as NaN, other text, or an entry
    without digits -- yields ``(nan, nan)``.

    NOTE: only digit characters are kept, so separators such as ',' or '.'
    are discarded before the integer conversion.
    """
    missing = (float('nan'), float('nan'))
    if not isinstance(entry, str) or not entry:
        return missing

    def digits_of(text):
        return ''.join(filter(str.isdigit, text))

    if entry[0] == 'E':
        figure = digits_of(entry)
        return (int(figure), int(figure)) if figure else missing
    if entry[0] == 'B':
        before, _, after = entry.rpartition('a')
        low, high = digits_of(before), digits_of(after)
        return (int(low), int(high)) if low and high else missing
    return missing


# Iterate over the values themselves rather than positional labels, so the
# parsing does not depend on the frame having a default 0..n-1 index.
awards = grants['EstAwards']
award_bounds = [_parse_estimate(entry) for entry in awards]
minaward = [low for low, _ in award_bounds]
maxaward = [high for _, high in award_bounds]

amounts = grants['EstAmounts']
amount_bounds = [_parse_estimate(entry) for entry in amounts]
minamnt = [low for low, _ in amount_bounds]
maxamnt = [high for _, high in amount_bounds]

In [None]:
# Attach the parsed bounds as new columns, then retire the raw text columns
parsed_columns = {
    'MaxAwards': maxaward,
    'MinAwards': minaward,
    'MaxAmounts': maxamnt,
    'MinAmounts': minamnt,
}
for column_name, parsed_values in parsed_columns.items():
    grants[column_name] = parsed_values

grants = grants.drop(columns = ['EstAwards', 'EstAmounts'])

In [None]:
# Convert 'EstAvailFunds' to numbers: keep only the digit characters of each
# string entry. Non-string entries (e.g. missing values) and strings that
# contain no digit at all become NaN instead of raising.
availfunds = []
for entry in grants['EstAvailFunds']:
    digits = ''.join(filter(str.isdigit, entry)) if isinstance(entry, str) else ''
    availfunds.append(int(digits) if digits else float('nan'))

In [None]:
# Overwrite the raw text column with the parsed numeric values and preview
grants['EstAvailFunds'] = availfunds  
grants.head()

In [None]:
# Our new columns are left with mostly missing values as a majority of entries were undeclared.
# info() prints the report itself and returns None, so no print() wrapper.
grants.info()

In [None]:
# Summary of the object-dtype columns after the conversion
grants.describe(include=["object"])

In [None]:
# Summary statistics with describe()'s default column selection
grants.describe()

**Exploring the Numeric Variables**

In [None]:
sns.catplot(data = grants, x = 'MaxAwards', kind = 'box')
# Remove the excessively large outliers; rows whose MaxAwards is missing do
# not satisfy the comparison and are left out as well
below_cap = grants["MaxAwards"].lt(50000)
grants2 = grants.loc[below_cap]

In [None]:
sns.catplot(x = 'MaxAwards', kind = 'box', data = grants2) # still many outliers to potentially remove

In [None]:
# Further subset our data: keep rows whose MaxAwards is below 200
under_200 = grants["MaxAwards"].lt(200)
grants3 = grants.loc[under_200]
sns.catplot(data = grants3, x = 'MaxAwards', kind = 'box')

In [None]:
# Potential relationship: Funding Source and Maximum Awards?
# NOTE(review): the heading mentions Maximum Awards, but the plot below puts
# 'EstAvailFunds' on the y-axis (on the MaxAwards < 50000 subset) -- confirm
# which variable was intended.
sns.boxplot(x = 'FundingSource', y = 'EstAvailFunds', data = grants2) #bulk of outliers are coming from state grants

In [None]:
# Number of grants per funding source, over the full (unfiltered) data
sns.countplot(x = 'FundingSource', data = grants) #to be expected as we are dealing with CA

In [None]:
# Another potentially interesting variable to consider: Funding Method
# Create the 9x7 inch figure in one step and draw the counts on its axes
fig, ax = plt.subplots(figsize = (9, 7))
sns.countplot(data = grants, x = 'FundingMethod', ax = ax)

In [None]:
# Looking further into funding method
# Keep only rows without any missing value. dropna() already defaults to
# row-wise how='any'; spelling out both how= and thresh= is rejected by
# recent pandas releases, which treat the two as mutually exclusive.
grants4 = grants3.dropna()
sns.pairplot(data = grants4.drop('PortalID', axis = 1), hue = 'FundingMethod')

In [None]:
# Most interesting scatter: Maximum Amount vs Estimated Available Funds?
# First plot: points coloured by funding method
sns.relplot(x = 'MaxAmounts', y = 'EstAvailFunds', hue = 'FundingMethod', data = grants4)
# Second plot: the same two variables as a joint distribution
sns.displot(data = grants4, x = 'MaxAmounts', y = 'EstAvailFunds')

Next Steps
- apply transformation
- linear model/analysis
- potential multivariate analysis as well?
- exploring the categories (split them up?)
- which agencies? whats useful?
- distribution of which agencies that are reporting stuff
- contextualize the data (are these values legit)
- are they even available?


# California Grants: Further Analysis

In [None]:
# Recap the current columns, then look at the most frequent deadline values
grants.info()
grants.ApplicationDeadline.value_counts().head()

In [None]:
# Flag each grant as ongoing (1) or not (0) from its application deadline:
#   - text starting with 'O'            -> 1
#   - text starting with '2' (a date)   -> 1 if the deadline is not yet past
#   - anything else (missing value or unrecognised text) -> 0
# Exactly one flag is appended per row, so the list always matches the frame.
deadline = grants.ApplicationDeadline
# Take the reference time once so every row is compared to the same instant.
# pd.Timestamp.now() replaces the pd.datetime alias removed from pandas.
today = pd.Timestamp.now()
ongoing = []
for entry in deadline:
    if isinstance(entry, str) and entry[:1] == 'O':
        ongoing.append(1)
    elif isinstance(entry, str) and entry[:1] == '2':
        closes = pd.to_datetime(entry, format="%Y-%m-%d %H:%M:%S")
        ongoing.append(0 if closes < today else 1)
    else:
        ongoing.append(0)
grants['IsOngoing'] = ongoing

In [None]:
# Distinct agency names present in the data
grants.AgencyDept.unique()

In [None]:
# Grants per agency, most frequent agency first, on a wide 20x7 inch figure
fig, ax = plt.subplots(figsize = (20, 7))
plt.xticks(rotation = 90)
agencies_by_count = grants.AgencyDept.value_counts().index
sns.countplot(data = grants, x = 'AgencyDept', order = agencies_by_count, ax = ax)

In [None]:
# Same per-agency count, restricted to the grants flagged as ongoing
is_ongoing = grants['IsOngoing'].eq(1)
grants_ongoing = grants.loc[is_ongoing]
fig, ax = plt.subplots(figsize = (20, 7))
plt.xticks(rotation = 90)
ongoing_by_count = grants_ongoing.AgencyDept.value_counts().index
sns.countplot(data = grants_ongoing, x = 'AgencyDept', order = ongoing_by_count, ax = ax)

In [None]:
# Does every agency also appear among the ongoing grants? Compare as sets:
# the two unique() arrays can differ in length and order, so an elementwise
# == between them does not answer the question.
set(grants.AgencyDept.dropna()) == set(grants_ongoing.AgencyDept.dropna())

In [None]:
# Initialize the matplotlib figure
f, ax = plt.subplots(figsize=(20,8))

# Use one agency ordering (order of first appearance in the full data) for
# both layers, so each ongoing-grants bar is drawn on the same row as the
# total it is compared with, even when some agencies have no ongoing grants.
agency_order = list(grants.AgencyDept.dropna().unique())

# Plot
sns.set_color_codes("pastel")
sns.countplot(y = "AgencyDept", data = grants, label= "Total Grants by Agency", color="b", orient = 'h', order = agency_order, ax = ax)

# Differentiate between ongoing grants
sns.set_color_codes("muted")
sns.countplot(y = "AgencyDept", data = grants_ongoing, label= "Total Ongoing Grants by Agency", color="b", orient = 'h', order = agency_order, ax = ax)

# Add a legend and informative axis label
ax.legend(ncol=2, loc="lower right", frameon=True)
ax.set(ylabel="", xlabel="Total Grants")
sns.despine(left=True, bottom=True)

# Agencies with no ongoing grants keep their row but show no muted bar