In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt #plotting library
%matplotlib inline
import seaborn as sns    #plotting library
# Apply seaborn's default theme; color_codes=True lets matplotlib shorthand
# color codes ("b", "g", "r", ...) resolve to the seaborn palette.
sns.set(color_codes=True)
# Switch to the "white" axes style (plain background) for all later figures.
sns.set_style("white")

In [None]:
import sklearn.ensemble
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, log_loss

In [None]:
# Load the train and test listings from JSON files under ../input
# (paths are relative to the notebook's working directory).
train_data = pd.read_json("../input/train.json")
test_data = pd.read_json("../input/test.json")
# NOTE(review): display_count is not referenced anywhere else in the visible notebook.
display_count = 2
# Name of the label column; used below for grouping and as the model target.
target = 'interest_level'

In [None]:
# Display the first training record to see which fields a listing carries.
train_data.iloc[0]

### Calculate number of occurrences of a particular category ###

In [None]:
def get_value_counts(col, df):
    """Count how often each distinct value of ``df[col]`` occurs.

    Parameters
    ----------
    col : str
        Name of the column to tabulate.
    df : pandas.DataFrame
        Frame that contains ``col``.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value (the value is the index, which carries no
        name) and a single column, named ``col``, holding the occurrence
        count. Rows are ordered from most to least frequent.
    """
    counts = df[col].value_counts()
    # pandas >= 2.0 names the counted Series "count" and moves the column name
    # onto the index; older releases did the opposite. Normalise to the older
    # layout so callers can keep using ``result[col]`` and
    # ``result.query("<col> == 1")`` to address the counts.
    counts.name = col
    counts.index.name = None
    result = counts.to_frame()
    return result.sort_values(by=[col], ascending=False)

In [None]:
# Rows of the training set bucketed by interest level, and the distinct
# levels themselves (in order of first appearance).
target_groups = train_data.groupby(target)
target_values = [level for level in train_data[target].unique()]

In [None]:
# Default chart settings.
# NOTE(review): the plotting helpers in this notebook read 'lbl_font' and
# 'title_font' from the chart_styles dict they are given, not the
# '*_fontsize' keys defined here.
global_chart_settings = dict(
    height=4,                              # figure height
    width=8,                               # figure width
    bar_width=0.9,                         # width of a single bar
    title='Number of occurrences of {0}',  # default title template
    ylabel='Occurrence',                   # y-axis label
    alpha=None,                            # transparency factor (None = default)
    lbl_fontsize=13,                       # axis-label font size
    title_fontsize=13,                     # title font size
)

### Function to plot a bar chart with customized settings ###

In [None]:
def plot_distributions(xcol, huecol, data, width, height):
    """Show a count plot of ``xcol`` split by ``huecol``.

    Parameters
    ----------
    xcol : str
        Column placed on the x axis (also used as the x-axis label).
    huecol : str
        Column used to colour / split the bars.
    data : pandas.DataFrame
        Frame holding both columns.
    width, height : number
        Figure size passed to matplotlib.
    """
    fig, axis = plt.subplots(figsize=(width, height))
    sns.countplot(x=xcol, hue=huecol, data=data, ax=axis)
    axis.set_ylabel('Number of Occurrences', fontsize=12)
    axis.set_xlabel(xcol, fontsize=12)
    plt.show()
    plt.close()

In [None]:
def plot_seaborn_bar(df, column, ax, i, color, title, chart_styles):
    """Draw ``df[column]`` as an annotated bar chart in subplot row ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies the tick labels and whose ``column``
        supplies the bar heights.
    column : str
        Column holding the bar heights.
    ax : 2-D indexable of matplotlib Axes
        Subplot grid; the chart is drawn on ``ax[i, 0]``.
    i : int
        Row of the subplot to draw on.
    color : str
        Bar colour.
    title : str
        Subplot title.
    chart_styles : dict
        Must provide 'ylabel', 'lbl_font' and 'title_font'.
    """
    axis = ax[i, 0]
    heights = df[column]
    positions = np.arange(len(df.index))

    axis.bar(positions, heights, color=color)
    axis.set_xticks(positions + 0.1 / 2)
    axis.set_xticklabels(df.index)
    axis.set_ylabel(chart_styles['ylabel'], fontsize=chart_styles['lbl_font'])
    axis.set_title(title, fontsize=chart_styles['title_font'])

    # Write each bar's value just above the bar.
    for position, bar_height in zip(positions, heights):
        axis.text(position + 0.05, bar_height + 0.01, '%.0f' % bar_height, ha='center', va='bottom')

In [None]:
def plot_histogram(df, column, ax, i, color, title, chart_styles):
    """Draw a 50-bin histogram of ``df[column]`` on subplot ``ax[i, 0]``.

    ``chart_styles`` must provide 'ylabel', 'lbl_font' and 'title_font'.

    NOTE(review): ``sns.distplot`` is deprecated since seaborn 0.11; kept
    here so the notebook behaves the same on the seaborn version it was
    written against.
    """
    axis = ax[i, 0]
    sns.distplot(df[column], bins=50, kde=False, color=color, ax=axis)
    axis.set_ylabel(chart_styles['ylabel'], fontsize=chart_styles['lbl_font'])
    axis.set_title(title, fontsize=chart_styles['title_font'])

In [None]:
def draw_boxplot(df, column, ax, i, color, title, chart_styles):
    """Draw a horizontal box plot of ``df[column]`` on subplot ``ax[i, 0]``.

    ``chart_styles`` must provide 'ylabel', 'lbl_font' and 'title_font'.
    """
    axis = ax[i, 0]
    sns.boxplot(x=df[column], color=color, ax=axis)
    axis.set_ylabel(chart_styles['ylabel'], fontsize=chart_styles['lbl_font'])
    axis.set_title(title, fontsize=chart_styles['title_font'])

In [None]:
def plot_scatter(df, column, ax, i, color, title, chart_styles):
    """Scatter ``df[column]`` against the frame's index on subplot ``ax[i, 0]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supplies x and whose ``column`` supplies y.
    column : str
        Column plotted on the y axis.
    ax : 2-D indexable of matplotlib Axes
        Subplot grid; the chart is drawn on ``ax[i, 0]``.
    i : int
        Row of the subplot to draw on.
    color : str
        Marker colour.
    title : str
        Subplot title.
    chart_styles : dict
        Must provide 'ylabel', 'lbl_font' and 'title_font'.
    """
    # Draw on the requested subplot (not on pyplot's implicit "current" axes)
    # and honour colour, title and label settings like the sibling helpers do.
    axis = ax[i, 0]
    axis.scatter(df.index, df[column], color=color)
    axis.set_ylabel(chart_styles['ylabel'], fontsize=chart_styles['lbl_font'])
    axis.set_title(title, fontsize=chart_styles['title_font'])

In [None]:
def plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types):
    """Render a vertical stack of ``subplot_count`` charts and show the figure.

    Parameters
    ----------
    subplot_count : int
        Number of stacked subplots.
    dataframes, columns, colors, titles, chart_types : sequence
        Per-subplot inputs; entry ``i`` of each describes subplot ``i``.
        ``chart_types[i]`` is one of 'bar', 'histogram', 'boxplot', 'scatter';
        any other value leaves that subplot empty.
    chart_styles : dict
        Must provide 'width' and 'height' (figure size) plus whatever keys the
        selected drawing helpers read ('ylabel', 'lbl_font', 'title_font').
    """
    # Built inside the function so the helper names are looked up when a chart
    # is drawn, not when this cell is defined.
    painters = {
        'bar': plot_seaborn_bar,
        'histogram': plot_histogram,
        'boxplot': draw_boxplot,
        'scatter': plot_scatter,
    }

    figure_size = (chart_styles['width'], chart_styles['height'])
    # squeeze=False keeps `ax` two-dimensional so helpers can always use ax[i, 0].
    fig, ax = plt.subplots(subplot_count, figsize=figure_size, facecolor='w', squeeze=False)

    for i in range(subplot_count):
        painter = painters.get(chart_types[i])
        if painter is None:
            continue  # unknown chart type: leave this subplot empty
        painter(dataframes[i], columns[i], ax, i, colors[i], titles[i], chart_styles)

    # Show and release the figure as part of every call; previously these two
    # statements sat at module level and only ran once, when the cell was defined.
    plt.show()
    plt.close(fig)

## Bathrooms ##

In [None]:
# Two stacked bar charts: all bathroom counts, then only listings with
# three or more bathrooms so the rare values become visible.
df = get_value_counts('bathrooms', train_data)
dataframes = [df, df[df.index >= 3]]

subplot_count = 2
colors = ['blue'] * 2
columns = ['bathrooms'] * 2
chart_types = ['bar'] * 2
titles = ['Occurrences of bathrooms', 'Occurrences of bathrooms from 3 and above']
chart_styles = dict(height=11, width=12, ylabel='Count', lbl_font=15, title_font=15)

plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types)

**The upper graph shows that there are quite a few outliers. Some bathroom values occur only once and can be considered outliers.**

### Bathrooms count in different target groups ###

In [None]:
# One bar chart of bathroom counts per interest level.
subplot_count = len(target_values)
# Size the per-subplot lists from subplot_count instead of hard-coding three
# entries, so the cell keeps working if the target has a different number of
# classes.
colors = ['blue'] * subplot_count
columns = ['bathrooms'] * subplot_count
chart_types = ['bar'] * subplot_count
dataframes = []
titles = []
title = 'Bathroom count for target({})'

for value in target_values:
    df = get_value_counts('bathrooms', target_groups.get_group(value))
    dataframes.append(df)
    titles.append(title.format(value))

chart_styles = {
    'height' : 15,
    'width' : 10,
    'title' : 'Occurrences of {0}',
    'ylabel' : 'Count',
    'lbl_font' : 15,
    'title_font' : 15
}
plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types)

## Bedrooms ##

In [None]:
# Single bar chart with the number of listings per bedroom count.
subplot_count = 1
colors = ['blue']
# Exactly one entry per subplot (a stray 'bathrooms' entry was removed).
columns = ['bedrooms']
chart_types = ['bar']
dataframes = []
df = get_value_counts('bedrooms', train_data)
dataframes.append(df)
chart_styles = {
    'height' : 4,
    'width' : 8,
    'title' : 'Occurrences of {0}',
    'ylabel' : 'Count',
    'lbl_font' : 15,
    'title_font' : 15
}
titles = ['Occurrence count for bedrooms']
plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types)

In [None]:
# One bar chart of bedroom counts per interest level.
subplot_count = len(target_values)
# Size the per-subplot lists from subplot_count instead of hard-coding three
# entries, so the cell keeps working if the target has a different number of
# classes.
colors = ['blue'] * subplot_count
columns = ['bedrooms'] * subplot_count
chart_types = ['bar'] * subplot_count
dataframes = []
titles = []
title = 'Bedroom count for target({})'

for value in target_values:
    df = get_value_counts('bedrooms', target_groups.get_group(value))
    dataframes.append(df)
    titles.append(title.format(value))

chart_styles = {
    'height' : 15,
    'width' : 10,
    'title' : 'Occurrences of {0}',
    'ylabel' : 'Count',
    'lbl_font' : 15,
    'title_font' : 15
}
plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types)

## Price ##

In [None]:
# Mean listing price per bedroom count, per bathroom count and per interest level.
subplot_count = 3
colors = ['blue', 'blue', 'blue']
columns = ['price', 'price', 'price']
chart_types = ['bar', 'bar', 'bar']

dataframes = []
# The string alias 'mean' selects pandas' built-in aggregation; passing
# np.mean to agg is deprecated in recent pandas releases.
df1 = train_data.groupby('bedrooms').agg({'price' : 'mean'})
df2 = train_data.groupby('bathrooms').agg({'price' : 'mean'})
df3 = train_data.groupby(target).agg({'price' : 'mean'})
dataframes.append(df1)
dataframes.append(df2)
dataframes.append(df3)
chart_styles = {
    'height' : 15,
    'width' : 10,
    'ylabel' : 'Mean price',   # the bars show averages, not counts
    'lbl_font' : 15,
    'title_font' : 15
}
# The plotted values are group means, so the titles say "Average", not "Total".
titles = ['Average price across bedrooms', 'Average price across bathrooms', 'Average price across interest level']
plot_chart(subplot_count, dataframes, columns, colors, chart_styles, titles, chart_types)

## Plot outliers in price ##

In [None]:
# 99th percentile of the listing price, printed for reference
# (it is not used by the plot below).
ulimit = np.percentile(train_data['price'], 99)
print(ulimit)

# Scatter every price against a random x position so that overlapping
# points spread out horizontally; x carries no meaning.
plt.figure(figsize=(8, 4))
plt.scatter(np.random.rand(len(train_data)), train_data['price'])
plt.show()
plt.close()

In [None]:
# Work on a copy so the capped prices used for this chart never reach train_data.
chart_df = train_data.copy(deep=True)
# `.ix` was removed in pandas 1.0; `.loc` performs the same boolean-mask /
# column-label assignment. Prices above 100000 are replaced by 13000 so the
# remaining points are readable.
# NOTE(review): 13000 is presumably the 99th-percentile price printed in the
# previous cell - confirm.
chart_df.loc[chart_df['price'] > 100000, 'price'] = 13000

# Random x positions only spread the points out; x carries no meaning.
plt.figure(figsize=(8, 4))
plt.scatter(np.random.rand(len(chart_df)), chart_df['price'])
plt.show()
plt.close()

In [None]:
# 99th percentile of the price among 'low' interest listings, printed for
# reference (it is not used by the plot below).
ulimit = np.percentile(target_groups.get_group('low')['price'], 99)
print(ulimit)

# Scatter the 'low' group's prices against random x positions (jitter only).
plt.figure(figsize=(8, 4))
plt.scatter(np.random.rand(len(target_groups.get_group('low'))), target_groups.get_group('low')['price'])
plt.show()
plt.close()

In [None]:
# Work on a copy so the capped prices used for this chart never reach train_data.
chart_df = train_data.copy(deep=True)
low_mask = chart_df[target] == 'low'
# `.ix` was removed in pandas 1.0; `.loc` performs the same boolean-mask /
# column-label assignment.
# NOTE(review): the threshold here is 1000000 while the all-listings chart
# uses 100000 - confirm this is intentional.
chart_df.loc[low_mask & (chart_df['price'] > 1000000), 'price'] = 14500

# Select the 'low' rows once instead of grouping the frame twice.
low_prices = chart_df.loc[low_mask, 'price']
plt.figure(figsize=(8, 4))
plt.scatter(np.random.rand(len(low_prices)), low_prices)
plt.show()
plt.close()

In [None]:
# Derived room features: total rooms, bedrooms minus bathrooms, and a flag
# for an even total room count.
train_data['rooms'] = train_data['bedrooms'].add(train_data['bathrooms'])
train_data['living_rooms'] = train_data['bedrooms'].sub(train_data['bathrooms'])
train_data['even_rooms'] = train_data['rooms'].mod(2).eq(0)

In [None]:
def price_per_room(row):
    """Return ``row['price'] / row['rooms']``, or -1 when the row has no rooms.

    ``row`` is any mapping-like object (for example a DataFrame row) with
    'rooms' and 'price' entries. The -1 sentinel avoids a division by zero.
    """
    room_count = row['rooms']
    return -1 if room_count == 0 else row['price'] / room_count

# Row-wise price per room (-1 where a listing has zero rooms).
train_data['price_per_room'] = train_data.apply(price_per_room, axis=1)

## Date ##

In [None]:
# Parse the creation timestamp, then split it into calendar parts with the
# vectorised .dt accessor (replaces one Python-level lambda call per row).
train_data['created'] = pd.to_datetime(train_data['created'])
train_data['year'] = train_data['created'].dt.year
train_data['month'] = train_data['created'].dt.month
train_data['day'] = train_data['created'].dt.day
train_data['hour'] = train_data['created'].dt.hour

## Building_id ##

In [None]:
# Group listings by (interest level, building id) to count listings per building and level.
build_group = train_data.groupby([target, 'building_id'])

In [None]:
# Pivot the group sizes so each building is a row and each interest level
# (index level 0) becomes a column; combinations that never occur are NaN.
unstacked_df = build_group.size().unstack(level=0).reset_index()

In [None]:
def get_sum(row):
    """Add up the 'high', 'medium' and 'low' entries of ``row``.

    Plain ``+`` is used, so a missing (NaN) entry makes the total NaN.
    """
    return sum(row[level] for level in ('high', 'medium', 'low'))

In [None]:
# Listings per building across all interest levels (NaN if any level is missing).
unstacked_df['total'] = unstacked_df.apply(get_sum, axis=1)

In [None]:
# Keep only buildings with a defined total, largest first.
# `.ix` was removed in pandas 1.0; `.loc` accepts the same boolean mask.
non_nulls = unstacked_df.loc[pd.notnull(unstacked_df['total'])]
non_nulls = non_nulls.sort_values(by=['total'], ascending=False)
# reset_index() keeps the previous row labels as an 'index' column.
non_nulls = non_nulls.reset_index()

In [None]:
# Plot the per-building totals in descending order (x is just the rank).
plt.figure(figsize=(8, 4))
plt.scatter(x=np.arange(non_nulls.shape[0]), y=non_nulls['total'])
plt.show()
plt.close()

In [None]:
# Building ids of all 'high' interest listings.
# NOTE(review): not referenced again in the visible part of the notebook.
high_build_ids = train_data.groupby(target).get_group('high')['building_id']

### Naive Modeling ###

In [None]:
# Accumulators for the metrics of each modelling run (appended to below).
scores = []
log_scores = []

In [None]:
def split_X_y(train_data, features):
    """Split a frame into the feature matrix and the label column.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame containing the feature columns and the notebook-level ``target`` column.
    features : list of str
        Columns to use as model inputs.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``train_data[features]`` and ``y`` is
        ``train_data[target]``.
    """
    return train_data[features], train_data[target]

In [None]:
def build_model(name):
    """Create an unfitted classifier for the given model name.

    Recognised names are 'Random Forest' and 'gbm' (matched exactly); both
    are built with default hyper-parameters. Any other name yields ``None``.
    """
    if name == 'gbm':
        return GradientBoostingClassifier()
    if name == 'Random Forest':
        return RandomForestClassifier()
    return None

In [None]:
def fit_model(clf, X_train, y_train):
    """Fit ``clf`` on the training data and return whatever ``clf.fit`` returns."""
    fitted = clf.fit(X_train, y_train)
    return fitted

In [None]:
def model_and_predict(model_name, data, features):
    """Train a model on a 67/33 split of ``data`` and report hold-out metrics.

    Parameters
    ----------
    model_name : str
        Name understood by ``build_model``.
    data : pandas.DataFrame
        Frame containing ``features`` and the target column.
    features : list of str
        Columns used as model inputs.

    Returns
    -------
    tuple
        ``(clf, accuracy, log_loss_score)``: the fitted classifier and its
        accuracy and log loss on the held-out 33%.
    """
    X, y = split_X_y(data, features)
    # Fixed random_state keeps the train/hold-out split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

    clf = build_model(model_name)
    clf = fit_model(clf, X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("The accuracy is {}".format(accuracy))

    y_proba = clf.predict_proba(X_test)
    # predict_proba columns follow clf.classes_. Passing that order explicitly
    # keeps log_loss correct (and prevents an error) when the hold-out split
    # happens to miss one of the classes.
    log_loss_score = log_loss(y_test, y_proba, labels=clf.classes_)
    print("The log_loss_score is {}".format(log_loss_score))
    return clf, accuracy, log_loss_score

### Feature engineering ###

In [None]:
def filter_records(df, column, query):
    """Drop rows whose ``column`` value has an occurrence count matching ``query``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Column whose values are counted.
    query : str
        ``DataFrame.query`` expression evaluated on a one-column frame of
        occurrence counts; the counts column is named ``column`` (for example
        ``"bathrooms == 1"`` selects values that occur exactly once).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``column`` value was NOT selected by ``query``.
    """
    counts = df[column].value_counts()
    # Normalise the layout across pandas versions: counts live in a column
    # named `column`, and the index (the distinct values) carries no name, so
    # the query expression always addresses the counts.
    counts.name = column
    counts.index.name = None
    selected = counts.to_frame().query(query)
    # `.ix` was removed in pandas 1.0; `.loc` accepts the same boolean mask.
    return df.loc[~df[column].isin(list(selected.index))]

In [None]:
def process_bathrooms(train_data):
    """Drop listings whose bathroom count occurs exactly once among 'high' listings.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame with a 'bathrooms' column and the notebook-level ``target`` column.

    Returns
    -------
    pandas.DataFrame
        ``train_data`` without the rows whose 'bathrooms' value appears only
        once within the 'high' interest group.
    """
    # ----------- filter high -------------------
    # Count on the frame that was passed in (not on a notebook-level groupby)
    # so the function acts on its own argument.
    high_bathrooms = train_data.loc[train_data[target] == 'high', 'bathrooms']
    counts = high_bathrooms.value_counts()
    singletons = counts[counts == 1].index
    # `.ix` was removed in pandas 1.0; `.loc` accepts the same boolean mask.
    return train_data.loc[~train_data['bathrooms'].isin(singletons)]

In [None]:
def remove_outliers(df, column, ulimit):
    """Return a copy of ``df`` with ``column`` capped at ``ulimit``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left untouched.
    column : str
        Column to cap.
    ulimit : number
        Upper bound; larger values are replaced by ``ulimit``.

    Returns
    -------
    pandas.DataFrame
        New frame with the capped column; all other columns are unchanged.
    """
    # Copy first: the input may be a slice of another frame, and writing to it
    # would silently modify the caller's data or trigger SettingWithCopyWarning.
    clipped = df.copy()
    clipped[column] = clipped[column].clip(upper=ulimit)
    return clipped

In [None]:
# Build the modelling frame from a deep copy so train_data stays untouched.
train_copy = train_data.copy(deep=True)
# Drop rows with bathroom counts that are rare among 'high' listings.
train_copy = process_bathrooms(train_copy)
# Cap price and price_per_room at their respective 99th percentiles.
train_copy = remove_outliers(train_copy, 'price', np.percentile(train_copy['price'], 99))
train_copy = remove_outliers(train_copy, 'price_per_room', np.percentile(train_copy['price_per_room'], 99))

In [None]:
# Features fed to the model; the same list is reused to build the test matrix.
train_features = ['bathrooms', 'rooms', 'living_rooms', 'price']
# Train a random forest and record its hold-out accuracy and log loss.
clf, accuracy, log_score = model_and_predict('Random Forest', train_copy, train_features)
scores.append(accuracy)
log_scores.append(log_score)

In [None]:
# Tabulate the metrics collected so far (one row per modelling run).
scores_df = pd.DataFrame({"accuracy" :scores, "log_loss" : log_scores})
scores_df

### Predict on test data ###

In [None]:
# Recreate on the test set the same derived features that were built for training.
test_data['rooms'] = test_data['bedrooms'] + test_data['bathrooms']
test_data['living_rooms'] = test_data['bedrooms'] - test_data['bathrooms']
# Vectorised modulo replaces a per-row lambda.
test_data['even_rooms'] = (test_data['rooms'] % 2) == 0
test_data['created'] = pd.to_datetime(test_data['created'])
# The .dt accessor extracts calendar parts without a Python call per row.
test_data['year'] = test_data['created'].dt.year
test_data['month'] = test_data['created'].dt.month
test_data['day'] = test_data['created'].dt.day
test_data['hour'] = test_data['created'].dt.hour
# Row-wise price per room (-1 where a listing has zero rooms).
test_data['price_per_room'] = test_data.apply(price_per_room, axis=1)

In [None]:
# Class probabilities for every test listing, using the training feature list.
# NOTE(review): test prices are not capped at the 99th percentile the way the
# training prices were - confirm this is intended.
X_test = test_data[train_features]
y_proba = clf.predict_proba(X_test)

In [None]:
# Sanity check: the displayed set is {False} when no test listing_id is missing.
set(pd.isnull(test_data['listing_id']))

In [None]:
# Class order of the fitted model; the columns of predict_proba follow this order.
clf.classes_

In [None]:
# One {class label: probability} record per test listing, with labels taken
# from clf.classes_ in column order.
results = [dict(zip(clf.classes_, proba)) for proba in y_proba]

In [None]:
# Turn the per-listing probability records into a frame (one column per class).
results_df = pd.DataFrame(results)

In [None]:
# Listing ids with a fresh 0..n-1 index so they line up positionally with results_df.
id_df = test_data[['listing_id']].reset_index(drop=True)

In [None]:
# Sanity check: the displayed set is {False} when no listing_id went missing after the index reset.
set(pd.isnull(id_df['listing_id']))

In [None]:
# Place probabilities and listing ids side by side; concat with axis=1 aligns rows on the index.
result = pd.concat([results_df, id_df], axis=1)

In [None]:
# Shape of the submission frame, then a null check per column
# ({False} means the column has no missing values).
print(result.shape)
for checked_column in ('high', 'low', 'medium', 'listing_id'):
    print(set(pd.isnull(result[checked_column])))

In [None]:
# Preview the first three submission rows.
result.head(3)

In [None]:
# Write the submission file to the working directory, without the index column.
result.to_csv("submission.csv", index=False)