In [None]:
import pandas as pd
import numpy as np
import json

In [None]:
from numpy import arange

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode()

In [None]:
# Load the raw survey responses from a CSV file in the working directory.
house_search_df = pd.read_csv('House_search_responses.csv')

In [None]:
# Display the column labels of the loaded responses.
house_search_df.columns

In [None]:
# Raw survey-question header -> short column name used in the rest of the
# notebook. The insertion order of this mapping is also the column order of
# the resulting frame.
survey_column_names = {
    '1.Name': 'Name',
    '2.What is your preferred living arrangement?': 'Apartment_type',
    '3.What neighbourhoods are you looking into?': 'Neighbourhood',
    '4.What price range are you looking for?': 'Price_preference',
    '5.Are you looking for decorated apartments?': 'Furnishing_preference',
    '6.With how many IESEans do you wish to share?': 'Sharing_quantity',
    '7.What is your gender?': 'Gender_preference',
    '8.How do you feel about roommates throwing parties at home?': 'Party_preference',
    '9.How do you feel about smokers?': 'Smoking_preference',
    '10.What about guests?': 'Guest_preference',
    '11.What are your interests?': 'Interests',
}

# Keep only the mapped question columns and rename them in one chained step.
# rename() returns a new frame here instead of mutating a freshly sliced one
# with inplace=True, which can make later column assignments on the slice
# raise SettingWithCopyWarning in pandas.
house_search_df = house_search_df[list(survey_column_names)].rename(columns=survey_column_names)

In [None]:
# Split the full name on single spaces once and reuse the result:
# token 0 becomes First_Name, token 1 (the second word) becomes Last_Name.
name_tokens = house_search_df['Name'].str.split(' ', expand=True)
house_search_df['First_Name'] = name_tokens[0]
house_search_df['Last_Name'] = name_tokens[1]

In [None]:
# Preview the first five rows of the renamed frame.
house_search_df.head(5)

In [None]:
# Report how many rows (survey responses) the frame holds.
print("Total Number of rows in the response sheet", len(house_search_df))

In [None]:
# Data type of each column
house_search_df.dtypes

# Exploratory Analysis

In [None]:
# Work on a copy so that the integer encoding applied to responses_df further
# down leaves the original labels in house_search_df untouched.
responses_df = house_search_df.copy()

In [None]:
# Column labels of the working copy; a slice of these is encoded below.
column_list = responses_df.columns

In [None]:
# Inspect the column labels. The object-typed answer columns among them are
# converted to categories in the following step, which is needed to map the
# categories to numerical values.
column_list

In [None]:
# Encode every preference column (positions 1..10 of column_list) as integer
# category codes and remember, per column, which code stands for which answer,
# for example:
#   0: "a. Living alone", 1: "b. Living with partner", 2: "c. Sharing with classmates"
mappedDictionary = {}
for column_names in column_list[1:11]:
    # object -> category
    as_category = responses_df[column_names].astype('category')

    # code -> label lookup, stored under the key "dict_<column name>"
    mappedDictionary["dict_{}".format(column_names)] = dict(enumerate(as_category.cat.categories))

    # replace the answers with their integer codes
    responses_df[column_names] = as_category.cat.codes

In [None]:
# Preview the encoded frame.
responses_df.head()

## Printing the Mapped results

- This gives the encoded values

In [None]:
# Serialise the code -> label lookups as indented JSON with sorted keys.
json_map_results = (json.dumps(mappedDictionary, indent=4, sort_keys=True))

In [None]:
# Print the JSON text built from mappedDictionary.
print(json_map_results)

## How are the responses stacked for each category

In [None]:
# Count each living-arrangement answer once (raw labels from house_search_df)
# and reuse the tally for both bar positions and bar heights.
apartment_type_counts = house_search_df['Apartment_type'].value_counts()

plt.figure(figsize=(20, 9))
sns.set(font_scale=2)
plt.title('What type of sharing is preferred the most?')
sns.barplot(x=apartment_type_counts.index, y=apartment_type_counts)
plt.xlabel("Sharing type")
plt.ylabel("Number of Responses")

## Mapping of the neighbourhoods
 

In [None]:
# Tally the encoded neighbourhood answers once so that bar positions (x) and
# bar heights (y) come from the same series. Taking x from the encoded frame
# and y from the raw-label frame, as before, pairs two differently indexed
# series that need not have the same length or order.
neighbourhood_counts = responses_df['Neighbourhood'].value_counts()

plt.figure(figsize=(15, 10))
sns.set(font_scale=2)
plt.title('Which is the most favoured neighbourhood to live?')
sns.barplot(x=neighbourhood_counts.index, y=neighbourhood_counts)
plt.xlabel("Neighbourhoods")
plt.ylabel("Number of Responses")
# Last expression of the cell: show the code -> neighbourhood lookup.
(mappedDictionary.get('dict_Neighbourhood'))

## What type of Pricing is preferred the most

In [None]:
# Tally the encoded price-range answers once and plot code vs. count.
price_counts = responses_df['Price_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('What type of Pricing for the accomodation is preferred the most?')
sns.barplot(x=price_counts.index, y=price_counts)
plt.xlabel("Price Preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Last expression of the cell: show the code -> price-range lookup.
mappedDictionary.get('dict_Price_preference')

# What type of Furnishing is preferred the most

In [None]:
# Tally the encoded furnishing answers once and plot code vs. count.
furnishing_counts = responses_df['Furnishing_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('Is furnished apartment the best choice?')
sns.barplot(x=furnishing_counts.index, y=furnishing_counts)
# The x axis shows furnishing categories; the label previously read
# "Price Preference Category", carried over from the pricing chart.
plt.xlabel("Furnishing Preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Last expression of the cell: show the code -> furnishing lookup.
(mappedDictionary.get('dict_Furnishing_preference'))

# Sharing Preference

In [None]:
# Tally the encoded sharing-quantity answers once and plot code vs. count.
sharing_counts = responses_df['Sharing_quantity'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('How many people do you want to share your apartment with?')
sns.barplot(x=sharing_counts.index, y=sharing_counts)
plt.xlabel("Sharing_quantity Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
sharing_lookup = mappedDictionary.get('dict_Sharing_quantity')
sharing_lookup[-1] = 'No response'
sharing_lookup

In [None]:
# Tally the encoded gender-preference answers once and plot code vs. count.
gender_counts = responses_df['Gender_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('Do you like sharing with other genders?')
sns.barplot(x=gender_counts.index, y=gender_counts)
plt.xlabel("Gender_preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
gender_lookup = mappedDictionary.get('dict_Gender_preference')
gender_lookup[-1] = 'No response'
gender_lookup

In [None]:
# Tally the encoded party-preference answers once and plot code vs. count.
party_counts = responses_df['Party_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('What type of Party_preference is preferred the most?')
sns.barplot(x=party_counts.index, y=party_counts)
plt.xlabel("Party_preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
party_lookup = mappedDictionary.get('dict_Party_preference')
party_lookup[-1] = 'No response'
party_lookup

In [None]:
# Tally the encoded smoking-preference answers once and plot code vs. count.
smoking_counts = responses_df['Smoking_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('Do you want to live with smokers?')
sns.barplot(x=smoking_counts.index, y=smoking_counts)
plt.xlabel("Smoking_preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
smoking_lookup = mappedDictionary.get('dict_Smoking_preference')
smoking_lookup[-1] = 'No response'
smoking_lookup

In [None]:
# Tally the encoded guest-preference answers once and plot code vs. count.
guest_counts = responses_df['Guest_preference'].value_counts()

plt.figure(figsize=(15, 9))
sns.set(font_scale=2)
plt.title('Are guests fine to visit the apartment?')
sns.barplot(x=guest_counts.index, y=guest_counts)
plt.xlabel("Guest_preference Category")
plt.ylabel("Number of Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
guest_lookup = mappedDictionary.get('dict_Guest_preference')
guest_lookup[-1] = 'No response'
guest_lookup

In [None]:
# Tally the encoded interest answers once and plot code vs. count
# (extra-wide figure, as there are many distinct interest values).
interest_counts = responses_df['Interests'].value_counts()

plt.figure(figsize=(40, 9))
sns.set(font_scale=2)
plt.title('What type of Interests is preferred the most?')
sns.barplot(x=interest_counts.index, y=interest_counts)
plt.xlabel("Interests Category")
plt.ylabel("Responses")
print('The Mappings are value is\n')
# Label code -1 in the lookup, then show the lookup as the cell output.
interest_lookup = mappedDictionary.get('dict_Interests')
interest_lookup[-1] = 'No response'
interest_lookup

# Using Clustering Approach to segment user groups

- K-Means clustering is used to segment the respondents into groups in an unsupervised way

In [None]:
# Answer columns to expand into one indicator column per distinct answer.
# The remaining columns of house_search_df are carried over unchanged.
preference_columns = [
    'Apartment_type',
    'Neighbourhood',
    'Price_preference',
    'Furnishing_preference',
    'Sharing_quantity',
    'Gender_preference',
    'Party_preference',
    'Smoking_preference',
    'Guest_preference',
    'Interests',
]
transpose_flatmatePreferences = pd.get_dummies(data=house_search_df, columns=preference_columns)

## Show columns with Null / NaN values

- Rows with missing name fields are removed using `dropna`; had the names been present, the missing values would have been replaced by 0 instead

In [None]:
# Count the missing values in each column.
transpose_flatmatePreferences.isnull().sum()

In [None]:
# Drop the rows that contain missing values (dropna with its default arguments).
transpose_flatmatePreferences = transpose_flatmatePreferences.dropna()

In [None]:
# To generate random hex colors
import random


def r():
    """Return a random integer between 0 and 255 inclusive (one colour channel)."""
    return random.randint(0, 255)

In [None]:
# Used to derive the number of clusters: take the ceiling of the input and,
# if that is even, step down to the odd number just below it.
def round_up_to_odd(f):
    """Return the ceiling of ``f`` if it is odd, otherwise the ceiling minus one.

    The result is never smaller than 1, so it is always usable as a cluster
    count (an input of 0 would otherwise yield -1).

    Parameters
    ----------
    f : int or float
        Value to convert, e.g. ``number_of_rows / 5``.

    Returns
    -------
    int
        An odd integer >= 1.
    """
    ceiled = int(np.ceil(f))
    nearest_odd = ceiled if ceiled % 2 else ceiled - 1
    # Guard the lower bound: zero or negative inputs must not produce a
    # non-positive cluster count.
    return max(1, nearest_odd)

In [None]:
# One cluster per roughly five respondents, adjusted to an odd count by
# round_up_to_odd. A fixed random_state makes the centroid initialisation,
# and therefore the cluster assignment, reproducible across runs.
cluster = KMeans(n_clusters=round_up_to_odd(len(transpose_flatmatePreferences)/5), random_state=42)

In [None]:
# Cluster on the indicator columns only: skip the first three columns
# (expected to be Name, First_Name and Last_Name) and any columns this notebook
# derives later ('cluster', 'x', 'y'), so re-running the cell does not feed
# earlier results back into KMeans.
feature_columns = transpose_flatmatePreferences.columns[3:].drop(['cluster', 'x', 'y'], errors='ignore')
# Adding 1 to initiate the clusters from 1 onwards instead of 0
transpose_flatmatePreferences['cluster'] = cluster.fit_predict(transpose_flatmatePreferences[feature_columns]) + 1

In [None]:
# Preview the first five rows, now including the 'cluster' column.
transpose_flatmatePreferences.head(5)

## Introducing PCA to reduce dimensionality

In [None]:
# PCA inputs: the indicator columns only. By this point the frame also carries
# the 'cluster' label added after clustering, which a plain columns[3:] slice
# would sweep into the projection as if it were a feature. Drop it (and any
# 'x'/'y' left over from a previous run) so the 2-D view reflects the survey
# answers alone.
cols = transpose_flatmatePreferences.columns[3:].drop(['cluster', 'x', 'y'], errors='ignore')

In [None]:
pca = PCA(n_components=2)

# Fit the projection once and reuse both components. Fitting separately for
# each component doubled the work and offered no guarantee that 'x' and 'y'
# came from the same fitted model.
pca_coordinates = pca.fit_transform(transpose_flatmatePreferences[cols])
transpose_flatmatePreferences['x'] = pca_coordinates[:, 0]
transpose_flatmatePreferences['y'] = pca_coordinates[:, 1]


In [None]:
# Keep only the name columns, the cluster label and the two PCA coordinates.
segmented_flatmates = transpose_flatmatePreferences[['Name','First_Name','Last_Name','cluster','x','y']]

In [None]:
# Print the set of distinct cluster labels that were assigned.
print("Total segments created for the flatmates ",set(segmented_flatmates.cluster))

In [None]:
def scatterPlot(ClusterNumber, studentsCluster_dataframe):
    """Return the scatter trace for one cluster by delegating to ``plotGraph``.

    Both arguments are passed through to ``plotGraph`` unchanged.
    """
    cluster_trace = plotGraph(ClusterNumber, studentsCluster_dataframe)
    return cluster_trace

In [None]:
def plotGraph(ClusterNumber, studentsCluster_dataframe):
    """Build a Plotly scatter trace for the rows belonging to one cluster.

    Parameters
    ----------
    ClusterNumber
        Value matched against the frame's 'cluster' column; also used in the
        trace name ("Cluster <number>").
    studentsCluster_dataframe
        Frame providing the 'cluster', 'x' and 'y' columns.

    Returns
    -------
    A ``go.Scatter`` in marker mode with a random fill colour and a random
    outline colour, each drawn through ``r()``.
    """
    members = studentsCluster_dataframe[studentsCluster_dataframe['cluster'] == ClusterNumber]

    # Fill colour is drawn before the outline colour, three channels each.
    fill_colour = '#%02X%02X%02X' % (r(), r(), r())
    outline_colour = '#%02X%02X%02X' % (r(), r(), r())

    return go.Scatter(
        x=members['x'],
        y=members['y'],
        name="Cluster " + str(ClusterNumber),
        mode="markers",
        marker=dict(
            size=10,
            color=fill_colour,
            line=dict(width=1, color=outline_colour),
        ),
    )

In [None]:
# One scatter trace per distinct cluster label.
chart = [
    scatterPlot(cluster_label, segmented_flatmates)
    for cluster_label in set(segmented_flatmates.cluster)
]

In [None]:
# Render all cluster traces together in one Plotly figure.
iplot(chart)

# Measuring the performance of the clusters

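- Using the within-cluster sum of squared distances (SSE, the KMeans inertia) to assess the clusters.
- A lower SSE means tighter clusters, but SSE keeps falling as clusters are added, so look for the "elbow" where the improvement levels off.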

In [None]:
# Elbow check: refit KMeans for every candidate k and record the inertia
# (within-cluster sum of squared distances) of each fit.

# Fit on the indicator columns only. By now the frame also holds the derived
# 'cluster', 'x' and 'y' columns, which must not be used as features when
# judging the clustering.
elbow_features = transpose_flatmatePreferences[
    transpose_flatmatePreferences.columns[3:].drop(['cluster', 'x', 'y'], errors='ignore')
]

sse = []
# Candidate k values are the assigned cluster labels, sorted so the curve is
# drawn from left to right.
list_k = sorted(set(segmented_flatmates.cluster))

for k in list_k:
    # Fixed seed so the curve is reproducible across runs.
    km = KMeans(n_clusters=int(k), random_state=42)
    km.fit(elbow_features)
    sse.append(km.inertia_)

# Plot sse against k
plt.figure(figsize=(15, 6))
plt.plot(list_k, sse, '-o')
# The plotted quantity is the sum of squared distances (SSE), not an RMSE.
plt.title('How SSE changes with number of clusters')
plt.xlabel(r'Number of clusters *k*')
plt.ylabel('Sum of squared distance')
print('SSE keeps falling as k grows; look for the elbow where the improvement levels off')

## Displaying first 5 names in the segment

In [None]:
# Order the rows by cluster label and show the first five.
segmented_flatmates.sort_values(by=['cluster']).head()

# Storing the results as csv

In [None]:
# Write the full indicator matrix (including the added 'cluster', 'x' and 'y'
# columns) to CSV.
transpose_flatmatePreferences.to_csv("students_preference_matrix.csv")

# Order the per-person summary by cluster label before exporting it.
segmented_flatmates = segmented_flatmates.sort_values(by=['cluster'])

segmented_flatmates.to_csv('flatmate_clusters.csv')