In [None]:
# 2x2 overview grid: price_doc, full_sq, floor and num_room distributions.
# Every plot is bound to its axes explicitly (ax=...) instead of relying on
# matplotlib's implicit "current axes", so reordering lines cannot silently
# send a plot or an axis limit to the wrong panel.
fig = plt.figure()

# Top-left: histogram of price_doc.
ax1 = fig.add_subplot(221)
sns.distplot(train_df.price_doc.values, bins=50, kde=False, ax=ax1)
ax1.set_xlabel('$')
ax1.set_title('House Price')

# Top-right: histogram of full_sq, with the x-axis cut off at 500.
ax2 = fig.add_subplot(222)
sns.distplot(train_df.full_sq.values, bins=200, kde=False, ax=ax2)
ax2.set_xlim(0, 500)
# NOTE(review): the x-label says "square footage" while the title says
# "Square Meters" - confirm which unit is intended.
ax2.set_xlabel('Housing square footage.')
ax2.set_title('Square Meters')

# Bottom-left: number of rows per floor value.
# The vector is passed as x= because the first positional parameter of
# countplot is `data` in current seaborn releases.
ax3 = fig.add_subplot(223)
sns.countplot(x=train_df.floor.values, ax=ax3)

# Bottom-right: histogram of num_room (missing values dropped), x-axis 0..10.
ax4 = fig.add_subplot(224)
sns.distplot(train_df.num_room.dropna().values, bins=50, kde=False, ax=ax4)
ax4.set_xlim(0, 10)

plt.tight_layout()
plt.show()

# Median price_doc per sale month, drawn as a bar chart.
# yearmonth = 100 * yearsale + monthsale gives one sortable integer key per
# month (e.g. 201108 if yearsale holds a four-digit year - TODO confirm).
train_df['yearmonth'] = 100 * train_df['yearsale'] + train_df['monthsale']
grouped_priceyear = train_df.groupby('yearmonth')['price_doc'].median().reset_index()

fig = plt.figure()
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(
    x=grouped_priceyear.yearmonth.values,
    y=grouped_priceyear.price_doc,
    color=color[0],
)
plt.xticks(rotation='vertical')
plt.xlabel('Year-month')
plt.ylabel('Median house price')

# Load the training data (timestamp parsed as a date) and tabulate how many
# columns there are of each dtype.
train_df = pd.read_csv("train.csv", parse_dates=['timestamp'])

# reset_index() turns the dtypes Series into two columns: the original column
# name (renamed "count" here so the aggregate reads naturally) and its dtype.
dtype_df = train_df.dtypes.reset_index()
dtype_df.columns = ["count", "coltype"]
dtype_df.groupby("coltype").count().head(100)


# Count the missing values in every column and plot the columns that have at
# least one, ordered from most to fewest missing.
navalues = train_df.isnull().sum(axis=0).reset_index()
navalues.columns = ['colname', 'number_missing']
missingvalues = navalues.loc[navalues['number_missing'] > 0]
sortedmissingvaldf = missingvalues.sort_values('number_missing', ascending=False)
# Number of columns that contain at least one missing value.
index = missingvalues.shape[0]

fig = plt.figure()
# Counts on x and column names on y; passed by keyword because
# seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(
    x=sortedmissingvaldf.number_missing,
    y=sortedmissingvaldf.colname,
    color=color[0],
)

# Annotated correlation heatmap of the listed columns, including price_doc.
internal_chars = ['full_sq', 'life_sq', 'floor', 'max_floor', 'build_year', 'num_room', 'kitch_sq', 'state', 'price_doc']
corrmat = train_df[internal_chars].corr()

f, ax = plt.subplots(figsize=(10, 7))
# Draw on the axes created above rather than on the implicit current axes.
sns.heatmap(corrmat, square=True, linewidths=.5, annot=True, ax=ax)
# Rotate the x tick labels only after the heatmap has created them, and pass
# the angle as a number: matplotlib documents rotation as a float or
# 'vertical'/'horizontal', and newer releases reject the string '90'.
plt.xticks(rotation=90)

# Table of per-column missing-value counts, restricted to columns that have
# at least one missing value.
missing_df = train_df.isna().sum().reset_index()
missing_df.columns = ['column_name', 'missing_count']
missing_df = missing_df[missing_df['missing_count'] > 0]

# One position per remaining column, plus a bar width.
index = np.arange(len(missing_df))
width = 0.9

missing_df.head()

# Reload the training data and replace every object-dtype column with integer
# labels from a per-column LabelEncoder. The encoded columns are also
# collected in `catlabels` for inspection.
train_df = pd.read_csv("train.csv", parse_dates=['timestamp'])
catlabels = pd.DataFrame()
for f in train_df.columns:
    if train_df[f].dtype == 'object':
        label = preprocessing.LabelEncoder()
        # Fit and encode in one pass; the result is reused for both frames
        # instead of running transform() twice on the same values.
        encoded = label.fit_transform(list(train_df[f].values))
        catlabels[f] = encoded
        train_df[f] = encoded
catlabels.head()

# Target is log(1 + price_doc); features are every column except the row id,
# the timestamp and the target itself.
train_y = np.log1p(train_df["price_doc"].values)
train_X = train_df.drop(["id", "timestamp", "price_doc"], axis="columns")


# Booster settings for the model that is used to rank feature importance.
# NOTE(review): 'reg:linear' and 'silent' are older xgboost parameter names
# (newer releases use 'reg:squarederror' and 'verbosity') - confirm against
# the installed xgboost version before changing them.
xgb_params = dict(
    eta=0.05,
    max_depth=8,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:linear',
    eval_metric='rmse',
    silent=1,
)

dtrain = xgb.DMatrix(train_X, label=train_y, feature_names=train_X.columns.values)

# Same parameters as above, but with silent overridden to 0 for this run.
model = xgb.train({**xgb_params, 'silent': 0}, dtrain, num_boost_round=100)

# Draw xgboost's built-in feature-importance chart for the trained model on a
# tall 12x18 canvas.
fig = plt.figure(figsize=(12, 18))
ax = fig.add_subplot(111)
xgb.plot_importance(model, ax=ax, height=0.8)
plt.show()

# Bar chart of the 20 features with the highest F-score in the trained model.
importance = model.get_fscore()
VariableImpDf = pd.DataFrame(list(importance.items()), columns=['Variable', 'Fscore'])
SortedVIDf = VariableImpDf.sort_values('Fscore', ascending=False)

fig = plt.figure()
# head(20) takes the first 20 rows of the sorted frame. x/y are passed by
# keyword because seaborn >= 0.12 accepts only `data` positionally.
sns.barplot(
    x=SortedVIDf.Fscore.head(20),
    y=SortedVIDf.Variable.head(20),
    color=color[0],
)

# Force the outliers to be equal to the 0.5% and 99.5% percentiles
# respectively so the plot is easier to read.
#
# The capped column is assigned back to the frame. The previous chained form
# `train_df[col].loc[mask] = value` writes through an intermediate Series,
# which pandas warns about and which does not update the frame when
# copy-on-write is enabled.
ulimit = np.percentile(train_df.price_doc.values, 99.5)
llimit = np.percentile(train_df.price_doc.values, 0.5)
train_df['price_doc'] = train_df['price_doc'].clip(lower=llimit, upper=ulimit)

col = "full_sq"
ulimit = np.percentile(train_df[col].values, 99.5)
llimit = np.percentile(train_df[col].values, 0.5)
train_df[col] = train_df[col].clip(lower=llimit, upper=ulimit)

# jointplot creates its own figure, so no plt.figure() call precedes it (that
# only produced an extra empty figure). The figure size is set with `height`,
# the current name of the former `size` argument.
grid = sns.jointplot(
    x=np.log1p(train_df.full_sq.values),
    y=np.log1p(train_df.price_doc.values),
    height=10,
)
# Label the joint axes through the grid; plt.xlabel/plt.ylabel act on the
# current axes, which is not necessarily the joint axes.
grid.set_axis_labels('Log of Total area in square metre', 'Log of Price', fontsize=12)
plt.show()

# KDE joint plot of log1p(life_sq) against log1p(price_doc), with missing
# life_sq values treated as 0 and the column capped to its 5th-95th
# percentile range.
col = "life_sq"
# Assign the results back to the frame. The chained forms
# `train_df[col].fillna(..., inplace=True)` and `train_df[col].loc[mask] = v`
# operate on an intermediate Series and do not update the frame when pandas
# copy-on-write is enabled.
train_df[col] = train_df[col].fillna(0)
ulimit = np.percentile(train_df[col].values, 95)
llimit = np.percentile(train_df[col].values, 5)
train_df[col] = train_df[col].clip(lower=llimit, upper=ulimit)

# jointplot creates its own figure, so no plt.figure() call precedes it. The
# figure size is set with `height`, the current name of the former `size`
# argument.
grid = sns.jointplot(
    x=np.log1p(train_df[col]),
    y=np.log1p(train_df['price_doc']),
    height=10,
    kind="kde",
)
# NOTE(review): the x-label text is the same as on the full_sq plot although
# this plot uses life_sq - confirm the intended wording.
grid.set_axis_labels('Log of Total area in square metre', 'Log of Price', fontsize=12)
plt.show()

# Point plot of the median price_doc for each floor value.
plt.figure(figsize=(12, 8))
FloorPricedf = train_df.groupby('floor').agg({'price_doc': 'median'}).reset_index()
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.pointplot(x=FloorPricedf['floor'], y=FloorPricedf['price_doc'], color=color[1])
plt.xticks(rotation='vertical')
plt.ylabel('Median Price')

# Box plot of price_doc grouped by max_floor.
plt.figure(figsize=(12, 8))
sns.boxplot(data=train_df, x="max_floor", y="price_doc")

# Axis cosmetics: vertical tick labels on x, then both axis titles.
plt.xticks(rotation='vertical')
plt.xlabel('Max Floor number', fontsize=12)
plt.ylabel('Median Price', fontsize=12)
plt.show()