In [1]:
import json

import pandas as pd
import numpy as np

# Model imports
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
# Common interface to get clean datasets

def get_clean_dataset_0(path='data/0/data.csv'):
    """
    Return dataset 0 as a DataFrame.

    The CSV is returned exactly as pandas parses it; no further cleaning
    step is applied to this dataset.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to 'data/0/data.csv', so calling
        the function without arguments keeps working as before.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.read_csv(path)


def get_clean_dataset_2():
    """
    Read dataset 2 from 'data/2/data.csv' and return its cleaned version.
    """
    raw_df = read_dataset_2('data/2/data.csv')
    cleaned_df = clean_dataset_2(raw_df)
    return cleaned_df


def get_clean_dataset_7():
    """
    Read both files of dataset 7 and return them cleaned and combined.
    """
    first_part, second_part = read_dataset_7()
    cleaned_df = clean_dataset_7(first_part, second_part)
    return cleaned_df


def get_clean_dataset_8():
    """
    Read dataset 8 and return it after its cleaning step.
    """
    return clean_dataset_8(read_dataset_8())


def get_clean_dataset_9():
    """
    Read dataset 9 from 'data/9/data.csv' and return its cleaned version.
    """
    raw_df = read_dataset_9('data/9/data.csv')
    cleaned_df = clean_dataset_9(raw_df)
    return cleaned_df



# Helper functions to clean datasets from all the team!

def read_dataset_2(path):
    """
    Read dataset 2 from a CSV file.

    Parameters
    ----------
    path : str
        Location of the CSV file. Dataset 2 consists of a single CSV, so the
        common getter passes 'data/2/data.csv'.

    Returns
    -------
    pandas.DataFrame
        The freshly parsed frame. ``read_csv`` already builds a new object on
        every call, so no defensive copy is needed here.
    """
    return pd.read_csv(path)

def clean_dataset_2(df):
    """
    Return dataset 2 with 'squirrel_id' replaced by an integer 'id' column.

    The 'id' value is the 'squirrel_id' string with every underscore removed,
    converted to int. The input frame is left untouched.
    """
    numeric_id = df['squirrel_id'].str.replace("_", "").astype(int)
    with_id = df.copy()
    with_id['id'] = numeric_id
    return with_id.drop('squirrel_id', axis=1)


def read_dataset_7():
    """
    Load the two CSV files that make up dataset 7.

    Returns a tuple ``(first, second)`` holding the frames parsed from
    'data/7/data-1.csv' and 'data/7/data-2.csv', in that order.
    """
    first_path, second_path = 'data/7/data-1.csv', 'data/7/data-2.csv'
    return pd.read_csv(first_path), pd.read_csv(second_path)

def clean_dataset_7(df1, df2):
    """
    Clean both parts of dataset 7 and stack them into a single frame.

    Each part is indexed by its 'id' column, every remaining column is
    converted to float (a cell holding exactly one space counts as missing),
    and rows in which every value is missing are dropped. The cleaned parts
    are then concatenated, df1 first.
    """
    def _as_float_frame(frame):
        """
        Convert every column of ``frame`` to float.

        Assumes all columns hold float-like values.
        """
        # A single whitespace character is how missing values are encoded
        blanks_as_nan = frame.replace(' ', np.nan)

        for column in list(blanks_as_nan.columns):
            blanks_as_nan[column] = blanks_as_nan[column].astype(float)

        return blanks_as_nan

    cleaned_parts = []
    for part in (df1, df2):
        indexed = part.set_index('id')
        cleaned_parts.append(_as_float_frame(indexed).dropna(how='all'))

    return pd.concat(cleaned_parts)


def read_dataset_8(last=1595):
    """
    Load the partial files of dataset 8 and stack them into one DataFrame.

    The files with indices 0 .. last - 1 are read through
    load_partial_dataset_8, and each partial frame receives an 'id' column
    holding its integer file index.

    Parameters
    ----------
    last : int, optional
        Exclusive upper bound of the file indices to read; must be at least 1.
        Defaults to 1595, the value previously hard-coded here.

    Returns
    -------
    pandas.DataFrame
        All partial frames concatenated in index order.
    """
    partial_frames = []
    for x in range(last):
        partial_df = load_partial_dataset_8(str(x))
        partial_df['id'] = x
        partial_frames.append(partial_df)

    # Concatenate once at the end: DataFrame.append copied the whole
    # accumulated frame on every iteration and no longer exists in pandas 2.x.
    return pd.concat(partial_frames)

def clean_dataset_8(df):
    """
    Cleaning step for dataset 8.

    Currently a pass-through: the frame is returned unchanged (the very same
    object, not a copy).
    """
    return df

def load_partial_dataset_8(index):
    """
    Load one partial file of dataset 8.

    ``index`` is the file index as a string. The file 'data/8/<index>.csv' is
    parsed as JSON despite its extension, and the parsed content becomes a
    DataFrame whose only index label is ``index``.
    """
    path = 'data/8/' + index +'.csv'
    with open(path, 'r') as handle:
        record = json.load(handle)
    return pd.DataFrame(record, index=[index])


def convert_to_float(x):
    """
    Convert ``x`` to float, returning None when that is not possible.

    Unparseable strings raise ValueError inside ``float``; objects that are
    neither strings nor numbers (None, for example) raise TypeError. Both are
    treated as "not a number" so a single odd cell cannot abort a whole
    ``Series.apply``.
    """
    try:
        return float(x)
    except (ValueError, TypeError):
        return None

def clean_dataset_9(df):
    """
    Select the modelling columns of dataset 9 and drop incomplete rows.

    Nine columns are kept as they are. 'pct_illeg' is added after being
    passed value by value through convert_to_float, which turns values it
    cannot convert into missing values. Every row with a missing value in any
    of the resulting columns is then dropped.

    The input frame is not modified.
    """
    kept_columns = ['pct_kids2_par', 'num_illeg', 'pct_w_inv_inc',
       'black_per_cap', 'race_pct_white', 'racepctblack', 'own_occ_low_quart',
       'hisp_per_cap', 'violent_crimes_per_pop']

    # .assign builds a new frame, so the column is never written onto a
    # slice of `df` (the previous form raised SettingWithCopyWarning).
    cleanDF = df[kept_columns].assign(
        pct_illeg=df['pct_illeg'].apply(convert_to_float)
    )
    return cleanDF.dropna()
    
def read_dataset_9(file):
    """
    Read dataset 9 from the CSV at ``file``, decoding it as cp1254.
    """
    raw_df = pd.read_csv(file, encoding='cp1254')
    return raw_df




In [3]:
# Load and clean dataset 9 through the common getter.
# NOTE: the recorded run of this cell failed with EmptyDataError, i.e. pandas
# found no columns to parse in the CSV it was given.
df9 = get_clean_dataset_9()

EmptyDataError: No columns to parse from file

In [None]:
# Preview the first rows rather than rendering the whole frame
df9.head()

In [None]:
# MODEL FITTING AND SUBMISSION UTILS

def hyper_parameters_tuning_RFRegressor(X, y, random_state=None):
    """
    Tune a RandomForestRegressor with a randomized search and return it fitted.

    Five parameter combinations are sampled from the search space and scored
    with cross validation. A new forest is then built from the best
    combination, fitted on all of (X, y), and its mean cross-validated R2 is
    printed.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training target.
    random_state : int or None, optional
        Seed passed to the search and to every forest. The default None keeps
        the previous unseeded behaviour; pass an int for reproducible results.

    Returns
    -------
    RandomForestRegressor
        The regressor refitted on (X, y) with the best parameters found.
    """
    # Candidate values for each tuned hyper parameter
    parameter_space_dist = {"max_depth": list(range(1, 10)),
                            "n_estimators": list(range(1, 5))}

    # Randomized search with cross validation over the parameter space
    random_search = RandomizedSearchCV(
        RandomForestRegressor(random_state=random_state),
        parameter_space_dist,
        n_iter=5,
        n_jobs=-1,
        random_state=random_state,
    )
    random_search.fit(X, y)

    # best_params_ only holds the keys of the search space (max_depth and
    # n_estimators), so it can be unpacked straight into the constructor.
    rf = RandomForestRegressor(n_jobs=-1, random_state=random_state,
                               **random_search.best_params_)
    rf.fit(X, y)

    # Named mean_r2 so the local does not shadow the imported cross_val_score
    mean_r2 = evaluate(rf, X, y, scoring='r2')
    print('Optimized RF score is {}.'.format(mean_r2))
    return rf


def evaluate(estimator, X, y, scoring='r2'):
    """
    Return the mean cross-validated score of ``estimator`` on (X, y).

    ``scoring`` is forwarded to scikit-learn's cross_val_score; the folds are
    evaluated on all available cores.
    """
    # cv=None leaves the splitting strategy to scikit-learn's default
    fold_scores = cross_val_score(estimator, X, y, scoring=scoring, cv=None, n_jobs=-1)
    return fold_scores.mean()


def prepare_and_generate_submission(model, features,
                                    test_path='X_test.csv',
                                    output_path='submission_serial_predictors_1.csv'):
    """
    Predict on the test set and write the submission CSV.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    features : iterable of str
        Column names the model was trained on, in training order.
    test_path : str, optional
        CSV holding the test set. Defaults to 'X_test.csv'.
    output_path : str, optional
        Destination of the submission file. Defaults to
        'submission_serial_predictors_1.csv'.

    Raises
    ------
    KeyError
        If a requested feature is missing from the test set.

    The written file contains the test set's 'id' column (when it has one)
    followed by the predictions in 'violent_crimes_per_pop'.
    """
    X_test_original = pd.read_csv(test_path)

    # Select by name so the columns arrive in the order used for training,
    # not in whatever order the test file happens to store them.
    X_test = X_test_original[list(features)]
    predictions = model.predict(X_test)

    # Take 'id' from the full test set so it survives even when it is not one
    # of the model features.
    id_columns = [c for c in X_test_original.columns if c == 'id']
    submission_df = X_test_original[id_columns].copy()
    submission_df['violent_crimes_per_pop'] = predictions

    submission_df.to_csv(output_path, index=False)

In [None]:
# Column the model has to predict
target = 'violent_crimes_per_pop'

df = get_clean_dataset_0()

# Every column other than the target is used as a feature
features = [column for column in df.columns if column != target]
X_train = df.loc[:, features]
y_train = df.loc[:, target]

In [None]:
# Tune and fit the random forest on the training data, then use it to
# predict on the test set and write the submission file.
prepare_and_generate_submission(
    hyper_parameters_tuning_RFRegressor(X_train, y_train),
    features
)