# Categorical and Missing Data

In this session we will work with *airbnb* data. The goal is to predict the review scores rating. 

There are many entries (i.e., rows) with missing attributes in our dataset.

We will work around this issue using two approaches:
1. *Remove rows with missing values*
2. *Single imputation with median*

In [1]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.impute import SimpleImputer

import matplotlib.pyplot as plt
import os

In [None]:
# Load the Madrid listings, using a local CSV cache when one exists so the
# notebook can be re-run without downloading the archive again.
if os.path.exists('data.csv'):
    df = pd.read_csv('data.csv')
else:
    df = pd.read_csv('http://data.insideairbnb.com/spain/comunidad-de-madrid/madrid/2018-01-17/data/listings.csv.gz',
                     compression='gzip')
    # index=False keeps the row index out of the cache; otherwise a reload
    # would carry an extra 'Unnamed: 0' column that the fresh download lacks.
    df.to_csv('data.csv', index=False)

# DataFrame.info() prints its summary itself and returns None, so it is
# called directly instead of being wrapped in print().
df.info()

In [None]:
# Restrict the frame to the two host-response columns and the review score.
df = df.loc[:, ['host_response_time', 'host_response_rate', 'review_scores_rating']]

# Preview the first rows, then list the distinct response-time values.
print(df.head())
print(df['host_response_time'].unique())

In [None]:
# Turn host_response_rate from percentage strings into numbers: remove the '%'
# characters at the ends of each value, then parse what is left.
df = df.assign(
    host_response_rate=pd.to_numeric(df['host_response_rate'].str.strip('%'))
)

# info() writes its summary to stdout and returns None; calling it directly
# avoids printing a stray "None" line before the preview.
df.info()
print()
print(df.head())

### Removing rows with missing values

In [None]:
# Keep only complete rows: a row is dropped as soon as any of its values is missing.
df = df.dropna(axis=0, how='any')

In [None]:
# Convert the host_response_time strings into integer codes.
# Two approaches are shown:

# 1 - A single LabelEncoder fitted on the column.
# NOTE(review): the codes follow LabelEncoder's own class ordering, which need
# not match how fast the host responds - confirm before treating the feature
# as ordinal in the regression.
le = preprocessing.LabelEncoder()

# NOTE(review): `cols` is not used anywhere in the visible notebook; it is kept
# only in case a cell outside this view reads it.
cols = ['within an hour', 'within a day', 'a few days or more']

arr = le.fit_transform(df.host_response_time)

# 2 - ...OR encode column by column with DataFrame.apply.
# Only non-numeric columns are encoded, and this runs on a copy taken before
# `df` is overwritten below; encoding every column would also replace the
# numeric host_response_rate and review_scores_rating values with codes.
df_label = df.copy()
text_cols = df_label.select_dtypes(exclude='number').columns
if len(text_cols):
    df_label[text_cols] = df_label[text_cols].apply(preprocessing.LabelEncoder().fit_transform)

# Store the codes from approach 1 in the working frame.
df = df.assign(host_response_time=arr)

In [None]:
# Show the integer codes produced by the label encoder for host_response_time.
print(arr)

In [None]:
# Fit a linear regression of the review score on the two host-response
# features and plot cross-validated predictions against the measured scores.
lr = linear_model.LinearRegression()

# define labels and data (i.e. y and X)
y = df.review_scores_rating
X = df.drop(columns='review_scores_rating')

# Predictions obtained with 10-fold cross-validation.
predict = cross_val_predict(lr, X, y, cv=10)

fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predict, edgecolors=(0, 0, 0))
# Dashed diagonal marks perfect prediction (predicted == measured); drawn here
# as in the imputation section so the two plots can be compared directly.
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()


In [None]:
# Pairwise Pearson correlations between all columns of the working frame.
print(df.corr(method='pearson'))

### Single imputation with median.

In [None]:
# Re-load the data from the local CSV written by the loading cell, reading
# only the review-score columns needed for this section.
score_cols = ['review_scores_accuracy', 'review_scores_cleanliness',
              'review_scores_checkin', 'review_scores_communication',
              'review_scores_location', 'review_scores_value',
              'review_scores_rating']

# usecols skips parsing the unused columns; read_csv returns the selected
# columns in file order, so index by score_cols to fix the column order.
df = pd.read_csv('data.csv', usecols=score_cols)[score_cols]

# Number of missing values per column.
print(df.isnull().sum())

In [None]:
# Discard rows in which every selected score is missing; rows with at least
# one value are kept.
df = df.dropna(how='all')

# Missing values remaining per column.
print(df.isnull().sum())

In [None]:
# Replace each remaining NaN with the median of its column and wrap the result
# in a new DataFrame carrying the original column labels and row index.
# NOTE(review): review_scores_rating, which the next cell uses as the
# regression target, is imputed here as well - confirm this is intended.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(df)
df_imp = pd.DataFrame(imputed_values, columns=df.columns, index=df.index)

# Check how many missing values are left per column.
print(df_imp.isnull().sum())


In [None]:
# Linear regression on the imputed data: the overall rating is the target and
# the remaining review scores are the predictors.
lr = linear_model.LinearRegression()

target = 'review_scores_rating'
y = df_imp[target]
X = df_imp.drop(target, axis=1)

# Predictions obtained with 10-fold cross-validation.
predict = cross_val_predict(lr, X, y, cv=10)

fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y, predict, edgecolors=(0, 0, 0))
# Dashed black diagonal from (min, min) to (max, max) of the measured values.
lo, hi = y.min(), y.max()
ax.plot([lo, hi], [lo, hi], 'k--', lw=4)
ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()


In [None]:
# Correlation of every score with the overall rating, computed on the imputed
# frame so it describes the same data the regression above was fitted on.
print(df_imp.corr()['review_scores_rating'])

In [None]:
# Closing message printed at the end of the notebook.
print('thank you')