# Wrangle: [testfit.io](https://blog.testfit.io/)

In [4]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import wrangle

import itertools

import preprocessing


In [5]:
pd.set_option("display.max_columns", None)
plt.rc("figure", figsize=(16, 8))

In [6]:
df = wrangle.wrangle_hud()

In [7]:
city_mask = df.project_city.value_counts().nlargest(35)

In [8]:
def in_city_mask(x):
    return x in city_mask

In [9]:
pre_df = df[df.project_city.apply(in_city_mask)]

In [10]:
def preprocessing_for_modeling(df):
    """function to manipulate df into df usable for modeling"""
    
    # city mask identifies 35 cities with most hud loans
    city_mask = df.project_city.value_counts().nlargest(35)
    
    
    # apply city mask to shrink the df
    def in_city_mask(x):
        return x in city_mask
    df = df[df.project_city.apply(in_city_mask)]
    
    #create a df for modelling that groups-by year and city aggregating mortgage amount by count, median, mean, and sum
    df_for_model = (df.groupby(['fiscal_year_of_firm_commitment_activity', 'project_city'])
         .final_mortgage_amount.agg(['count', 'median', 'mean', 'sum']).reset_index())
    
    df_for_model = df_for_model.rename(columns={'fiscal_year_of_firm_commitment_activity': 'year', 'project_city': 'city'})
    
    return df_for_model
    

In [11]:
df = preprocessing_for_modeling(df)

In [12]:
df['city_year'] = df.city + "_" + df.year.astype(str)

In [13]:
chicago =  df[df.city == "Chicago"]

In [14]:
chicago['y/y'] = (chicago['count'].diff(1) / chicago['count'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [15]:
chicago

Unnamed: 0,year,city,count,median,mean,sum,city_year,y/y
7,2006,Chicago,17,6812900.0,8535829.0,145109100,Chicago_2006,
41,2007,Chicago,23,3160700.0,4273548.0,98291600,Chicago_2007,0.26087
72,2008,Chicago,8,10061200.0,9693712.0,77549700,Chicago_2008,-1.875
104,2009,Chicago,17,11868600.0,11839840.0,201277300,Chicago_2009,0.529412
136,2010,Chicago,23,7250000.0,9042609.0,207980000,Chicago_2010,0.26087
171,2011,Chicago,41,8677500.0,11277200.0,462365300,Chicago_2011,0.439024
206,2012,Chicago,44,7710000.0,11592480.0,510069300,Chicago_2012,0.068182
241,2013,Chicago,40,8926350.0,10597390.0,423895600,Chicago_2013,-0.1
276,2014,Chicago,20,3556100.0,10742320.0,214846300,Chicago_2014,-1.0
311,2015,Chicago,20,6769000.0,10442470.0,208849400,Chicago_2015,0.0


In [16]:
refinance_mask = df.activity_description_Refinance == 1

AttributeError: 'DataFrame' object has no attribute 'activity_description_Refinance'

In [None]:
(df[refinance_mask].groupby(['fiscal_year_of_firm_commitment_activity', 'project_city'])
         .final_mortgage_amount.agg(['count', 'median', 'mean', 'sum']).reset_index())

# Explore Object Columns

In [None]:
object_columns = []
for col in df.columns:
    if df[col].dtype == 'object':
        object_columns.append(col)
object_columns

In [None]:
#fha_number
repeat_numbers = df.shape[0] - len(df.fha_number.unique())
repeat_numbers

In [None]:
df_repeat.firm_commitment_activity.value_counts()

In [None]:
#project_name
repeat_name = df.shape[0] - len(df.project_name.unique())
repeat_name

In [None]:
fha_numbers_max = df_repeat.groupby('fha_number').date_of_firm_commitment_activity.idxmax()

- take dataframe and find multiple fha_numbers
- get the most recent row of the repeat fha_numbers

    - make df of only uniques
    - make df of max from repeat fha_numbers
    - concat two dfs

In [None]:
fha_value_counts = df.fha_number.value_counts()
repeat_list = list(fha_value_counts[fha_value_counts != 1].index)

In [None]:
# list of unique fha numbers
unique_fha_numbers = list(df.fha_number.value_counts()[df.fha_number.value_counts() == 1]

In [None]:
df.groupby('fha_number').date_of_firm_commitment_activity.max()

In [None]:
df_repeat.groupby('fha_number').date_of_firm_commitment_activity.max()

In [None]:
repeat_fha_numbers

In [None]:
len(all_unique_fha_numbers)

In [None]:
df.columns

In [None]:
len(all_unique_fha_numbers)

In [None]:
def in_unique_list(x):
    return x in all_unique_fha_numbers

In [None]:
df.fha_number.apply(in_unique_list).mean()

In [None]:
len(df_repeat.final_mortgage_amount.unique())

for fha_number drop if not max date

In [None]:
df.basic_fha_risk_share_or_other.value_counts()

In [None]:
df.groupby('basic_fha_risk_share_or_other').final_mortgage_amount.mean().plot.bar()

In [None]:
for col in object_columns:
    print(col)
    print(df[col].value_counts())
    print()

In [None]:
df[df.project_state == 'TX'].project_city.value_counts()

In [None]:
df.activity_description.value_counts().plot.barh()

In [None]:
df.current_status.value_counts()

In [None]:
df.activity_description.value_counts()

In [None]:
df.groupby('map_or_tap').final_mortgage_amount.count()

In [None]:
df.groupby('facility_type').final_mortgage_amount.count()

In [None]:
df.head()

In [None]:

df[.groupby('project_city').final_mortgage_amount.mean()

In [None]:
city_mask = df.project_city.value_counts().nlargest(50).index

In [None]:
city_mask

In [None]:
df_model = preprocessing.get_model_df()

In [None]:
df_model

In [None]:
df

In [None]:
city_mask = df.project_city.value_counts().nlargest(35)

In [None]:
def in_city_mask(x):
        return x in city_mask
df = df[df.project_city.apply(in_city_mask)]

In [None]:
df.project_city.value_counts()

In [None]:
def preprocessing_for_modeling(df):
    """function to manipulate df into df usable for modeling"""
    
    # city mask identifies 35 cities with most hud loans
    city_mask = df.project_city.value_counts().nlargest(35)
    
    
    # apply city mask to shrink the df
    def in_city_mask(x):
        return x in city_mask
    df = df[df.project_city.apply(in_city_mask)]
    
    #create a df for modelling that groups-by year and city aggregating mortgage amount by count, median, mean, and sum
    df_for_model = (df.groupby(['fiscal_year_of_firm_commitment_activity', 'project_city'])
         .final_mortgage_amount.agg(['count', 'median', 'mean', 'sum']).reset_index())
    
    #  only take cities that have an observation for every year
    observations_mask =  df_for_model.project_city.value_counts()[df_for_model.project_city.value_counts() == 15]
    
     # apply city mask to shrink the df
    def in_observations_mask(x):
        return x in observations_mask
    df_for_model = df_for_model[df_for_model.project_city.apply(in_observations_mask)]
    
    
    
    df_for_model = df_for_model.rename(columns={'fiscal_year_of_firm_commitment_activity': 'year', 'project_city': 'city'})
    
    return df_for_model

In [None]:
df_prep = preprocessing_for_modeling(df)

In [None]:
df_prep[df_prep.should_enter]

In [None]:
city_mask = df.project_city.value_counts().nlargest(35)

In [None]:
city_mask

In [None]:
def in_city_mask(x):
    return x in city_mask
df2 = df[df.project_city.apply(in_city_mask)]

In [None]:
df2

In [None]:
df2.project_city.value_counts()

In [None]:
observations_mask