In [41]:
import json
import logging

import pandas as pd
import numpy as np

# Model imports
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

%matplotlib inline

In [42]:
# Common interface to get clean datasets

def get_clean_dataset_0():
    """
    This gets you a cleans dataset 0.
    APPROVED.
    """
    return pd.read_csv('data/0/data.csv')


def get_clean_dataset_2():
    """
    This gets you a cleans dataset 2.
    APPROVED.
    """
    df = read_dataset_2('data/2/data.csv')
    return clean_dataset_2(df)


def get_clean_dataset_3():
    """
    This gets you a cleans dataset 3.
    APPROVED.
    """
    return clean_dataset_3()


def get_clean_dataset_4():
    """
    This gets you a cleans dataset 4.
    APPROVED.
    """
    df  = read_dataset_4('data/4/data.tsv')
    return clean_dataset_4(df)


def get_clean_dataset_5():
    """
    This gets you a cleans dataset 5.
    """
    return clean_dataset_5()


def get_clean_dataset_6():
    """
    This gets you a cleans dataset 6.
    """
    return clean_dataset_6()

def get_clean_dataset_7():
    """
    This gets you a cleans dataset 7.
    APPROVED.
    """
    df1, df2 = read_dataset_7()
    return clean_dataset_7(df1, df2)


def get_clean_dataset_8():
    """
    This gets you a cleans dataset 8.
    APPROVED.
    """
    df = read_dataset_8()
    return clean_dataset_8(df)


def get_clean_dataset_9():
    """
    This gets you a cleans dataset 9.
    APPROVED.
    """
    df = read_dataset_9('data/9/data.csv')
    return clean_dataset_9(df)



# Helper functions to clean datasets from all the team!

def read_dataset_2(path):
    """
    This reads the dataset 2.
    """
    #In this case path is 'data/2/data.csv' as there is only one csv
    df = pd.read_csv(path)
    return df.copy()

def clean_dataset_2(df):
    """
    This cleans dataset 2.
    """
    df = df.copy()
    df['id'] = df.squirrel_id.str.replace("_", "").astype(int)
    return df.drop('squirrel_id', axis=1)


def clean_dataset_3(filepath='data/3/data.csv'):
    df = pd.read_csv(filepath)
    # print('datase has {} rows and {} cols'.format(df.shape[0], df.shape[1]))
    # df.set_index('id', inplace=True)
    return df

def read_dataset_4(path):
    """
    This reads the dataset 4
    This 
    """
    #In this case path is 'data/4/data.tsv' as there is only one csv
    df = pd.read_json(path)
    return df.copy()

def clean_dataset_4(df):
    """
    This cleans dataset 4.
    """
    df = df.copy()
	# Your cleaning code here!
    df = df.reset_index()
    df['id'] = df['index'].str.split('-').map(lambda p: p[0])
    df = df.drop('index', axis = 1)
    df['id'] = df.id.astype(int)
    df = df[df['id'] <= 1595]
    df = df.set_index('id')
    df = df.sort_index()
    for i in df:
        df[i] = df[i].str.replace('%', '').astype(float)
    df = df.drop_duplicates()
    return df

def clean_dataset_5(filepath='data/5/data.csv'):
    df = pd.read_csv(filepath)
    # print('dataset has {} rows and {} cols'.format(df.shape[0], df.shape[1]))
    #df.set_index('id', inplace=True)
    df.index.name= 'id'
    df = df.drop(['is_this_the_id_col', 'or_maybe_this_is_the_id_col'], axis=1)
    df['violent_crimes_per_pop'] = df['Percentviolent_crimes_per_pop']
    return df.drop('Percentviolent_crimes_per_pop', axis=1)

def clean_dataset_6(filepath='data/6/data.html'):
    # the file suffix is not relevant
    
    # fix decimal symbol; sep = '|'
    df = pd.read_csv('data/6/data.html', sep='|', decimal=',')
    # clean last row
    df = df.iloc[:-1,:]
    # print('dataset has {} rows and {} cols'.format(df.shape[0], df.shape[1]))
    df.set_index('id', inplace=True)
    return df

def read_dataset_7():
    """
    Reads and returns both datasets 7 as a tuple.
    """
    df1 = pd.read_csv('data/7/data-1.csv')
    df2 = pd.read_csv('data/7/data-2.csv')
    
    return df1, df2

def clean_dataset_7(df1, df2):
    clean_df1 = df1.set_index('id')
    clean_df2 = df2.set_index('id')
    
    def _clean_float_columns(some_df):
        """
        This assumes all the columns are floats!
        """
        # Treat single whitespace as null values
        float_cleaned_df = some_df.replace(' ', np.nan)
        
        float_columns = [c for c in float_cleaned_df.columns]

        for c in float_columns:
            float_cleaned_df[c] = float_cleaned_df[c].astype(float)
        
        return float_cleaned_df
    
    clean_df1 = _clean_float_columns(clean_df1)
    clean_df2 = _clean_float_columns(clean_df2)
    
    clean_df1 = clean_df1.dropna(how='all')
    clean_df2 = clean_df2.dropna(how='all')
    
    return pd.concat([clean_df1, clean_df2])


def read_dataset_8():
    #this loads dataset 8
    last = 1595
    dirty8 = load_partial_dataset_8(str(0))
    dirty8['id'] = 0
    for x in range(1, last):
        dirty8_x = load_partial_dataset_8(str(x))
        dirty8_x['id'] = x
        dirty8 = dirty8.append(dirty8_x)
    return dirty8

def clean_dataset_8(df):
    #This cleans dataset 8
    return df

def load_partial_dataset_8(index):
    #this loads dataset 8
    filename = 'data/8/' + index +'.csv'
    with open(filename, 'r') as f:
        data = json.load(f)
    df = pd.DataFrame(data, index=[index])
    return df


def convert_to_float(x):
    try:
        return float(x)
    except ValueError:
        return None

def clean_dataset_9(df):
    cleanDF = df[['pct_kids2_par', 'num_illeg', 'pct_w_inv_inc',
       'black_per_cap', 'race_pct_white', 'racepctblack', 'own_occ_low_quart',
       'hisp_per_cap', 'violent_crimes_per_pop']]
    cleanDF['pct_illeg'] = df['pct_illeg'].apply(convert_to_float)
    cleanDF['id'] = cleanDF.index
    # cleanDF['pct_illeg'] = df['pct_illeg'].astype(float)
    # cleanDF = cleanDF.dropna()
    return cleanDF
    
def read_dataset_9(file):
    return pd.read_csv(file, encoding='iso-8859-1')


In [43]:
class PreparedDataset(object):
    """
    Represents a prepared dataset based on all available cleaning functions.
    The dataset itself is lazily evaluated via the `df` property.
    """

    def __init__(self):
        self._df = None
    
    # Common interface to get clean datasets
    
    def get_clean_dataset_0(self):
        """
        This gets you a cleans dataset 0.
        APPROVED.
        """
        return pd.read_csv('data/0/data.csv')

    def get_clean_dataset_2(self):
        """
        This gets you a cleans dataset 2.
        APPROVED.
        """
        df = read_dataset_2('data/2/data.csv')
        return clean_dataset_2(df)
    
    def get_clean_dataset_3(self):
        """
        This gets you a cleans dataset 3.
        APPROVED.
        """
        return clean_dataset_3()

    def get_clean_dataset_4(self):
        """
        This gets you a cleans dataset 4.
        APPROVED.
        """
        df  = read_dataset_4('data/4/data.tsv')
        return clean_dataset_4(df)

    def get_clean_dataset_5(self):
        """
        This gets you a cleans dataset 5.
        APPROVED.
        """
        return clean_dataset_5()

    def get_clean_dataset_6(self):
        """
        This gets you a cleans dataset 6.
        APPROVED.
        """
        return clean_dataset_6()

    def get_clean_dataset_7(self):
        """
        This gets you a cleans dataset 7.
        APPROVED.
        """
        df1, df2 = read_dataset_7()
        return clean_dataset_7(df1, df2)

    def get_clean_dataset_8(self):
        """
        This gets you a cleans dataset 8.
        APPROVED.
        """
        df = read_dataset_8()
        return clean_dataset_8(df)

    def get_clean_dataset_9(self):
        """
        This gets you a cleans dataset 9.
        APPROVED.
        """
        df = read_dataset_9('data/9/data.csv')
        return clean_dataset_9(df)

    def _pre_merge(self):
        get_clean_datasets = [f for f in dir(self) if f.startswith('get_clean_dataset_')]
        get_clean_datasets_functions = [getattr(self, func_name) for func_name in get_clean_datasets]
        
        logging.info('[serial-predictors] collecting cleaned datasets {}'.format(
            [func_name.split()[-1] for func_name in get_clean_datasets])
        )
        
        datasets = [fn() for fn in get_clean_datasets_functions]
        
        # reset any indexes
        datasets = [
            d.reset_index().set_index('id')
            for d in datasets
        ]
        
        # drop the target column except on the first one
        map(lambda d: d.drop(TARGET, axis=1, inplace=True), datasets[1:])
        
        self.datasets = datasets
    
    def _merge(self):
        from functools import reduce
        self.merged_dataset = self.datasets[0]
        
        for d in self.datasets[1:]:
            self.merged_dataset.join(d)
    
    def _post_merge(self):
        self._df = self.merged_dataset.reset_index().dropna()
    
    def _prepare_dataset(self):
        self._pre_merge()
        self._merge()
        self._post_merge()
    
    @property
    def df(self):
        if not self._df:
            self._prepare_dataset()
        return self._df

In [34]:

        
    # Merges ALL THE DATASETS

df0 = get_clean_dataset_0()
df2 = get_clean_dataset_2()
df3 = get_clean_dataset_3()
df4 = get_clean_dataset_4()
df5 = get_clean_dataset_5()
df6 = get_clean_dataset_6()
df7 = get_clean_dataset_7()
df8 = get_clean_dataset_8()
df9 = get_clean_dataset_9()

df0 = df0.set_index('id')
df2 = df2.set_index('id')
df3 = df3.set_index('id')
# df4 = df4.set_index('id') --> already has index!
# df5 = df5.set_index('id') --> already has index!
# df6 = df6.set_index('id') --> already has index!
# df7 = df7.set_index('id') --> already has index!
df8 = df8.set_index('id')
df9 = df9.set_index('id')

df2 = df2.drop(TARGET, axis=1)
df3 = df3.drop(TARGET, axis=1)
df4 = df4.drop(TARGET, axis=1)
df5 = df5.drop(TARGET, axis=1)
df6 = df6.drop(TARGET, axis=1)
df7 = df7.drop(TARGET, axis=1)
df8 = df8.drop(TARGET, axis=1)
df9 = df9.drop(TARGET, axis=1)


df = df0.join(df2).join(df3).join(df4).join(df5).join(df7).join(df8).join(df9)

df = df.reset_index()
df = df.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [33]:
df = PreparedDataset().df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


ValueError: columns overlap but no suffix specified: Index(['index', 'violent_crimes_per_pop'], dtype='object')

In [44]:
# MODEL FITTING AND SUBMISSION UTILS

def hyper_parameters_tuning_RFRegressor(X, y):
    # Define the parameter space
    parameter_space_dist = {"max_depth": range(1,10), "n_estimators": range(1,5)}
    
    # Choose the classifier
    rf = RandomForestRegressor()
    
    # Select grid search with cross validation
    random_search = RandomizedSearchCV(rf, parameter_space_dist, n_iter=5, n_jobs=-1)

    # Fit the model based on the train datasets to the random search for the hyper parameters tunning
    random_search.fit(X, y)

    # Redefine the model with best parameters found in the hyper parameters tunning
    rf = RandomForestRegressor(max_depth=random_search.best_params_['max_depth'], n_jobs=-1
                             , n_estimators=random_search.best_params_['n_estimators'])

    # Fit the model
    rf.fit(X,y)

    # Cross validation
    cross_val_score = evaluate(rf, X, y, scoring='r2')
    print('Optimized RF score is {}.'.format(cross_val_score))
    return rf


def hyper_parameters_tuning_GBoostRegressor(X, y):
    # Define the parameter space
    # parameter_space_dist = {"max_depth": range(1,5), "n_estimators": range(1,50)}
    
    # Choose the classifier
    rf = GradientBoostingRegressor()
    
    # Select grid search with cross validation
    #random_search = RandomizedSearchCV(rf, parameter_space_dist, n_iter=5, n_jobs=-1)

    # Fit the model based on the train datasets to the random search for the hyper parameters tunning
    #random_search.fit(X, y)

    # Redefine the model with best parameters found in the hyper parameters tunning
    #rf = RandomForestRegressor(max_depth=random_search.best_params_['max_depth'], n_jobs=-1
                            # , n_estimators=random_search.best_params_['n_estimators'])

    
    # Cross validation
    cross_val_score = evaluate(rf, X, y, scoring='r2')
    print('Optimized RF score is {}.'.format(cross_val_score))
    
    # Fit the model
    rf.fit(X,y)
    
    return rf

def hyper_parameters_tuning_GBoostRegressor_optimized(X, y):
    # Define the parameter space
    parameter_space_dist = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [4, 6, 8],
              'min_samples_leaf': [20, 50,100,150],
              'max_features': [1.0, 0.3, 0.1] }
  
    # Choose the classifier
    rf = GradientBoostingRegressor()
    
    # Select grid search with cross validation
    random_search = RandomizedSearchCV(rf, parameter_space_dist, n_iter=100, n_jobs=-1)

    # Fit the model based on the train datasets to the random search for the hyper parameters tunning
    random_search.fit(X, y)

    # Redefine the model with best parameters found in the hyper parameters tunning
    rf = GradientBoostingRegressor(max_depth=random_search.best_params_['max_depth'],
                            learning_rate=random_search.best_params_['learning_rate'],
                            min_samples_leaf=random_search.best_params_['min_samples_leaf'],
                            max_features=random_search.best_params_['max_features'])

    # Cross validation
    cross_val_score = evaluate(rf, X, y, scoring='r2')
    print('Optimized RF score is {}.'.format(cross_val_score))
    
     # Fit the model
    rf.fit(X,y)
    return rf


def evaluate(estimator, X, y, scoring='r2'):
    return cross_val_score(estimator, 
                    X, 
                    y, 
                    scoring=scoring, 
                    cv=None, 
                    n_jobs=-1).mean()

def feature_importance():
    """
    See the feature importances
    """
    # don't forget to import %matplotlib inline
    rf = hyper_parameters_tuning_GBoostRegressor(X_train, y_train)
    my_importances = pd.Series(dict(zip(X_train.columns, rf.feature_importances_))).sort_values()
    # my_importances.plot(kind='barh',figsize=(16,10))
    return my_importances


def prepare_and_generate_submission(model, features):
    X_test_original = pd.read_csv('X_test.csv')
    X_test = X_test_original[features]
    predictions = model.predict(X_test)
    submission_df = X_test.drop([c for c in X_test.columns if c!= 'id'], axis=1)
    submission_df['violent_crimes_per_pop'] = predictions
    
    submission_df.to_csv('submission_serial_predictors_5.csv', index=False)
    return submission_df

In [45]:
TARGET = 'violent_crimes_per_pop'

In [46]:
# Merges ALL THE DATASETS

df0 = get_clean_dataset_0()
df2 = get_clean_dataset_2()
df3 = get_clean_dataset_3()
df4 = get_clean_dataset_4()
df5 = get_clean_dataset_5()
df6 = get_clean_dataset_6()
df7 = get_clean_dataset_7()
df8 = get_clean_dataset_8()
df9 = get_clean_dataset_9()

df0 = df0.set_index('id')
df2 = df2.set_index('id')
df3 = df3.set_index('id')
# df4 = df4.set_index('id') --> already has index!
# df5 = df5.set_index('id') --> already has index!
# df6 = df6.set_index('id') --> already has index!
# df7 = df7.set_index('id') --> already has index!
df8 = df8.set_index('id')
df9 = df9.set_index('id')

df2 = df2.drop(TARGET, axis=1)
df3 = df3.drop(TARGET, axis=1)
df4 = df4.drop(TARGET, axis=1)
df5 = df5.drop(TARGET, axis=1)
df6 = df6.drop(TARGET, axis=1)
df7 = df7.drop(TARGET, axis=1)
df8 = df8.drop(TARGET, axis=1)
df9 = df9.drop(TARGET, axis=1)


df = df0.join(df2).join(df3).join(df4).join(df5).join(df7).join(df8).join(df9)

df = df.reset_index()
df = df.dropna()


df6.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0_level_0,pct_immig_rec5,med_rent_pct_hous_inc,pct_immig_rec10,med_own_cost_pct_inc_no_mtg,pct_w_wage,age_pct12t29,pct_fam2_par,med_yr_hous_built,pct_foreign_born,num_under_pov
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0.38,0.37,0.41,0.61,0.34,0.42,0.03,0.29,0.01,0.02
1,0.24,0.6,0.28,0.56,0.74,0.36,0.94,0.52,0.16,0.0
2,0.5,0.53,0.57,0.46,0.51,0.53,0.18,0.46,0.04,1.0
3,0.53,0.71,0.69,0.19,0.55,0.53,0.59,0.31,1.0,0.05
4,0.59,0.41,0.61,0.36,0.5,0.56,0.54,0.25,0.24,0.23


In [47]:
# Generates X and y for training

features = [c for c in df.columns if c != TARGET]
top_features = feature_importance().reset_index()['index'][-30:].values
features = top_features
X_train = df[features]
y_train = df[TARGET]

Optimized RF score is 0.6093342450056064.


In [48]:
submission = prepare_and_generate_submission(
    model=hyper_parameters_tuning_GBoostRegressor_optimized(X_train, y_train),
    features=features
)

Optimized RF score is 0.6170540353147497.


In [None]:
submission.dtypes