# 3. Machine Learning for Classification

We'll use logistic regression to predict churn


## 3.1 Churn prediction project

* Dataset: https://www.kaggle.com/blastchar/telco-customer-churn
* https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv


In [71]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Data preparation

In [74]:
df = pd.read_csv('AB_NYC_2019.csv')

df = df[[
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]]

df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns.remove('price')

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.reviews_per_month = df.reviews_per_month.fillna(0)
print(numerical_columns)
# df.totalcharges = pd.to_numeric(df.totalcharges, errors='coerce').fillna(0)
# df.churn = (df.churn == 'yes').astype(int)

['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']


# Question 1
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [75]:
df.groupby(['neighbourhood_group']).size().sort_values(ascending=False)

neighbourhood_group
manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
dtype: int64

## Setting up the validation framework
Perform the train/validation/test split with Scikit-Learn

# Split the data

- Split your data in train/val/test sets, with 60%/20%/20% distribution.
- Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
- Make sure that the target value ('price') is not in your dataframe.


In [99]:
from sklearn.model_selection import train_test_split

df.price = (df.price >= 152).astype(int)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']


# Question 2
- Create the correlation matrix for the numerical features of your train dataset.
- In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [100]:
numerical_columns

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [101]:
df_full_train = df_full_train.reset_index(drop=True)

df_full_train.price.value_counts(normalize=True)

df_full_train.price.mean()

numerical = numerical_columns
categorical = categorical_columns 
df_full_train[numerical].nunique()

latitude                          17462
longitude                         13527
minimum_nights                      100
number_of_reviews                   376
reviews_per_month                   903
calculated_host_listings_count       47
availability_365                    366
dtype: int64

In [102]:
from IPython.display import display

corr_matrix = df_train[numerical_columns].corr().abs()
display(corr_matrix)

corr_matrix.unstack().sort_values(ascending=False)

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,0.006246,0.007159,0.019375,0.005891
longitude,0.080301,1.0,0.06066,0.055084,0.134642,0.117041,0.083666
minimum_nights,0.027441,0.06066,1.0,0.07602,0.120703,0.118647,0.138901
number_of_reviews,0.006246,0.055084,0.07602,1.0,0.590374,0.073167,0.174477
reviews_per_month,0.007159,0.134642,0.120703,0.590374,1.0,0.048767,0.165376
calculated_host_listings_count,0.019375,0.117041,0.118647,0.073167,0.048767,1.0,0.225913
availability_365,0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
minimum_nights                  minimum_nights                    1.000000
number_of_reviews               number_of_reviews                 1.000000
availability_365                availability_365                  1.000000
number_of_reviews               reviews_per_month                 0.590374
reviews_per_month               number_of_reviews                 0.590374
availability_365                calculated_host_listings_count    0.225913
calculated_host_listings_count  availability_365                  0.225913
availability_365                number_of_reviews                 0.174477
number_of_reviews               availability_365                  0.174477
availability_365         

# Make price binary
We need to turn the price variable from numeric into binary.
Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

# Question 3
Calculate the mutual information score with the (binarized) price for the two categorical variables that we have (number_of_reviews               reviews_per_month). Use the training set only.
Which of these two variables has bigger score?
Round it to 2 decimal digits using round(score, 2)

In [104]:
from sklearn.metrics import mutual_info_score

def mutual_info_price_score(series):
    above_average = df_full_train.price 
    return mutual_info_score(series, above_average)

mi = df_full_train[categorical_columns].apply(mutual_info_price_score)
mi.sort_values(ascending=False).round(2)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

# Question 4
Now let's train a logistic regression
- Remember that we have two categorical variables in the data. Include them using one-hot encoding.
- Fit the model on the training dataset.
- To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
-- model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
- Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.


In [105]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns+numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)
print (X_train)
print (dv.get_feature_names())


val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)
print (X_val)

[[ 50.       13.       40.7276  ...   1.        0.        0.     ]
 [  7.        1.       40.70847 ...   0.        1.        0.     ]
 [  0.        1.       40.83149 ...   1.        0.        0.     ]
 ...
 [ 88.        1.       40.79994 ...   0.        1.        0.     ]
 [  0.        1.       40.69585 ...   0.        1.        0.     ]
 [281.        2.       40.64438 ...   1.        0.        0.     ]]
['availability_365', 'calculated_host_listings_count', 'latitude', 'longitude', 'minimum_nights', 'neighbourhood_group=bronx', 'neighbourhood_group=brooklyn', 'neighbourhood_group=manhattan', 'neighbourhood_group=queens', 'neighbourhood_group=staten_island', 'number_of_reviews', 'reviews_per_month', 'room_type=entire_home/apt', 'room_type=private_room', 'room_type=shared_room']
[[ 52.        1.       40.70239 ...   0.        1.        0.     ]
 [343.        2.       40.68498 ...   1.        0.        0.     ]
 [260.        1.       40.66911 ...   1.        0.        0.     ]
 ...
 [  0

In [108]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]

price_decision = (y_pred >= 0.5)

(y_val == price_decision).mean().round(3)

0.791

# Question 5
- We have 9 features: 7 numerical features and 2 categorical.
- Let's find the least useful one using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
-- neighbourhood_group
-- room_type
-- number_of_reviews
-- reviews_per_month

note: the difference doesn't have to be positive

In [120]:
from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression

def model_training(cols):
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[cols].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    val_dict = df_val[cols].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    price_decision = (y_pred >= 0.5)
    res = (y_val == price_decision).mean().round(3)
    return res


all_features = categorical_columns+numerical_columns

orig_acc = model_training(all_features)
res = {}

for i in range(len(all_features)):
    cols = np.delete(all_features, i)
    res[all_features[i]] = orig_acc - model_training(cols)
    
sorted(res.items(), key=lambda x: x[1], reverse=True)[:1]

[('room_type', 0.062000000000000055)]

# Question 6
- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
- If there are multiple options, select the smallest alpha.

In [129]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

df = pd.read_csv('AB_NYC_2019.csv')

df = df[[
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]]

df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns.remove('price')

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

df.reviews_per_month = df.reviews_per_month.fillna(0)

from sklearn.model_selection import train_test_split

df.price = np.log1p(df.price)


df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

del df_train['price']
del df_val['price']
del df_test['price']


from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

train_dict = df_train[categorical_columns+numerical_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

from sklearn.linear_model import Ridge

alpha_list = [0, 0.01, 0.1, 1, 10]
result = []
for i in alpha_list:
    model = Ridge(i)
    model.fit(X_train, y_train)
    score = model.score(X_val, y_val)
    result.append((i, score.round(3)))

result

[(0, 0.488), (0.01, 0.488), (0.1, 0.488), (1, 0.488), (10, 0.486)]