## Lab 5 - Clustering

#### 1. Read the dataset

In [0]:
# Build a SQL context on top of the notebook-provided SparkContext `sc`,
# then load every row and column of the `wa_hdma` table as a Spark DataFrame.
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)
spark_df = sqlContext.sql("Select * from wa_hdma")

In [0]:
# Function to drop the uninformative columns of a DF
def dropColumns(df):
    """Return ``df`` without the columns that hold a single distinct value.

    For each column, rows for which ``column != 'null'`` is not true are
    discarded (this removes SQL nulls and the literal string ``'null'``),
    and the remaining distinct values are counted.  A column whose count is
    exactly 1 is dropped and a message is printed.

    The check runs against the ``df`` argument itself, so the function works
    for any DataFrame and not only for the notebook-level ``spark_df``.

    NOTE(review): the printed message says "all null values", but the
    ``== 1`` test fires for columns with one remaining distinct value; a
    column with nothing left after filtering has a count of 0 and is kept.
    Confirm which behavior is intended before changing the condition.
    NOTE(review): comparing a non-string column with the literal 'null'
    presumably evaluates to null for every row, so such columns would never
    reach a count of 1 -- confirm.

    Args:
        df: Spark DataFrame to prune.

    Returns:
        A DataFrame without the dropped columns; ``df`` itself is not mutated.
    """
    # ``df.columns`` is evaluated once, so dropping inside the loop is safe.
    for name in df.columns:
        # Number of distinct values left after removing null / 'null' rows.
        distinct_count = (
            df.filter(df[name] != 'null').select(name).distinct().count()
        )
        if distinct_count == 1:
            print("Dropping " + name + " because of all null values.")
            df = df.drop(name)
    return df

  
# Run dropColumns on the loaded DataFrame and keep the pruned result under the same name.
spark_df = dropColumns(spark_df)

#### 2. Identify categorical and numerical variables

In [0]:
# Categorical variables: every column whose Spark dtype is 'string'.
dtypes = spark_df.dtypes  # (column name, dtype name) pairs, one per column
cat_input = [name for name, dtype_name in dtypes if dtype_name == 'string']
cat_input

In [0]:
# Numerical variables: every column that is not categorical.
# The DataFrame's own column order is kept on purpose: a set difference
# returns the names in an arbitrary order that can change between runs,
# which would reorder the assembled feature vector.
_categorical_names = set(cat_input)  # built once for O(1) membership tests
num_input = [name for name in spark_df.columns if name not in _categorical_names]
num_input

#### 3. Create an imputer for numerical data

In [0]:
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col
from pyspark.sql.functions import when 

class NumericImputer(Transformer, HasInputCol, HasOutputCol):
    """Custom transformer that fills nulls in a numeric column with its median.

    The value written to ``outputCol`` is ``inputCol`` with nulls replaced
    by the column median, cast to double.  ``inputCol`` and ``outputCol``
    may be the same name, in which case the column is overwritten.
    """

    # Constructor / setParams boilerplate required by custom transformers.
    def __init__(self, inputCol=None, outputCol=None):
        super(NumericImputer, self).__init__()
        self.setParams(inputCol=inputCol, outputCol=outputCol)

    def setParams(self, inputCol=None, outputCol=None):
        """Store ``inputCol`` / ``outputCol`` on the transformer and return it."""
        return self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the median-imputed column added/replaced.

        Args:
            dataset: Spark DataFrame containing ``inputCol``.

        Returns:
            A DataFrame whose ``outputCol`` holds the imputed values as
            doubles.  If the column has no non-null value there is no median
            to impute with, so the column is only cast to double.
        """
        from pyspark.sql.functions import col, when
        from pyspark.sql.types import DoubleType

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        # relativeError=0 asks for the exact quantile (can be expensive).
        quantiles = dataset.approxQuantile(in_col, [0.5], 0)
        if not quantiles:
            # approxQuantile is documented to return an empty list for a
            # column that only contains nulls; indexing [0] would raise.
            return dataset.withColumn(out_col, col(in_col).cast(DoubleType()))

        median_v = quantiles[0]
        imputed = when(col(in_col).isNull(), median_v).otherwise(col(in_col))
        return dataset.withColumn(out_col, imputed.cast(DoubleType()))

In [0]:
# One NumericImputer per numerical column; each one writes back to its own column.
numericimputers = [
    NumericImputer(inputCol=name, outputCol=name) for name in num_input
]
numericimputers

#### 4. Create an imputer for categorical data

In [0]:
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import col

class CategoricalImputer(Transformer, HasInputCol, HasOutputCol):
    """Custom transformer that fills nulls in a categorical column with its mode.

    The mode is the most frequent level of ``inputCol`` after discarding
    rows for which ``inputCol != 'null'`` is not true (SQL nulls and the
    literal string ``'null'``).  ``inputCol`` and ``outputCol`` may be the
    same name, in which case the column is overwritten.
    """

    # Constructor / setParams boilerplate required by custom transformers.
    def __init__(self, inputCol=None, outputCol=None):
        super(CategoricalImputer, self).__init__()
        self.setParams(inputCol=inputCol, outputCol=outputCol)

    def setParams(self, inputCol=None, outputCol=None):
        """Store ``inputCol`` / ``outputCol`` on the transformer and return it."""
        return self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the mode-imputed column added/replaced.

        Args:
            dataset: Spark DataFrame containing ``inputCol``.

        Returns:
            A DataFrame whose ``outputCol`` holds ``inputCol`` with nulls
            replaced by the mode.  If no usable level exists (e.g. the
            column is entirely null) the column is copied unchanged.
        """
        from pyspark.sql.functions import col, when

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        # Count observations per level, ignoring null / 'null' so that a
        # dominating "missing" level can never be chosen as the mode.
        level_counts = dataset.groupby(in_col).count()
        level_counts = level_counts.filter(level_counts[in_col] != 'null')

        # Only the top row is needed, so fetch just that row instead of
        # collecting every level to the driver.
        top_level = level_counts.sort(level_counts['count'].desc()).first()
        if top_level is None:
            # No non-null level to impute with.
            return dataset.withColumn(out_col, col(in_col))

        mode_v = top_level[in_col]
        return dataset.withColumn(
            out_col,
            when(col(in_col).isNull(), mode_v).otherwise(col(in_col)),
        )

In [0]:
# One CategoricalImputer per categorical column; each one writes back to its own column.
cat_imputers = [
    CategoricalImputer(inputCol=name, outputCol=name) for name in cat_input
]
cat_imputers

#### 5. Standardize variables

In [0]:
# normalize a dataset
from pyspark.sql.types import IntegerType
from pyspark import keyword_only  ## < 2.0 -> pyspark.ml.util.keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType


class Standardizer(Transformer, HasInputCol, HasOutputCol):
    """Custom transformer that standardizes a column as ``(x - mean) / std``.

    ``mean`` and ``std`` are the ``mean`` and ``stddev`` aggregates of
    ``inputCol`` over the dataset passed to ``_transform``.  The result is
    written to ``outputCol`` as a double.
    """

    # Constructor / setParams boilerplate required by custom transformers.
    def __init__(self, inputCol=None, outputCol=None):
        super(Standardizer, self).__init__()
        self.setParams(inputCol=inputCol, outputCol=outputCol)

    def setParams(self, inputCol=None, outputCol=None):
        """Store ``inputCol`` / ``outputCol`` on the transformer and return it."""
        return self._set(inputCol=inputCol, outputCol=outputCol)

    def _transform(self, dataset):
        """Return ``dataset`` with the standardized column added/replaced.

        The arithmetic is a native column expression rather than a Python
        UDF, so null inputs yield null outputs instead of raising.

        Args:
            dataset: Spark DataFrame containing ``inputCol``.

        Returns:
            A DataFrame whose ``outputCol`` holds ``(x - mean) / std`` as a
            double.  When the standard deviation is missing, NaN or zero, a
            scale of 1.0 is used so the column is only centered.  When the
            mean is missing (no non-null values) the column is only cast.
        """
        import math

        from pyspark.sql.functions import col, mean, stddev
        from pyspark.sql.types import DoubleType

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        # Both statistics in a single aggregation pass.
        stats = dataset.select(mean(in_col), stddev(in_col)).first()
        mean_, std_ = stats[0], stats[1]

        if mean_ is None:
            # Nothing to center on; keep the (all-null or empty) column.
            return dataset.withColumn(out_col, col(in_col).cast(DoubleType()))

        if std_ is None or math.isnan(std_) or std_ == 0:
            # Degenerate spread: avoid dividing by zero / NaN.
            std_ = 1.0

        standardized = (col(in_col) - mean_) / std_
        return dataset.withColumn(out_col, standardized.cast(DoubleType()))
    

In [0]:
# One Standardizer per numerical column; output goes to "<column>_standardized".
standardizers = [
    Standardizer(inputCol=name, outputCol=name + "_standardized")
    for name in num_input
]
standardizers

#### 6. Encode categorical variables

In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import StandardScaler
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

# For every categorical column build two stages:
#   1. a StringIndexer writing "<column>_index",
#   2. a OneHotEncoder turning that index into "<column>_dummy".
# Both use handleInvalid='keep'.
indexers = []
encoders = []
for column in cat_input:
    index_name = column + "_index"
    indexers.append(
        StringIndexer(inputCol=column, outputCol=index_name, handleInvalid='keep')
    )
    encoders.append(
        OneHotEncoder(inputCol=index_name, outputCol=column + "_dummy", handleInvalid='keep')
    )
encoders

#### 7. Determine the variables

In [0]:
# Model inputs: the dummy-encoded categorical columns first, followed by the
# standardized numerical columns.
dummy_names = [name + "_dummy" for name in cat_input]
standardized_names = [name + "_standardized" for name in num_input]
input_cols = dummy_names + standardized_names
input_cols

#### 9. Add more stages to the Pipeline

In [0]:
# your code goes in here
from pyspark.ml.feature import StringIndexer
import functools 
import operator
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

# The assembler concatenates all model inputs into one vector column "features".
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")

# Stage order: impute numeric -> impute categorical -> index -> one-hot
# encode -> standardize -> assemble.
stages = [
    *numericimputers,
    *cat_imputers,
    *indexers,
    *encoders,
    *standardizers,
    assembler,
]
stages

#### 10. Combine the stages as a list

In [0]:
from pyspark.ml import Pipeline
# Wrap the stage list in a Pipeline, fit it on the data, then apply the
# fitted pipeline to the same data to obtain the assembled features.
pipeline = Pipeline(stages=stages)
pipeline_model = pipeline.fit(spark_df)
spark_df_ = pipeline_model.transform(spark_df)

#### 11. Fit data using K-Means

In [0]:
from pyspark.ml.clustering import KMeans # import K-means
# K-means with 2 clusters and a fixed seed, trained on the "features" column only.
kmeans = KMeans(k=2, seed=1)
features_only = spark_df_.select('features')
model = kmeans.fit(features_only)

In [0]:
# Apply the fitted K-means model to the full transformed DataFrame and
# preview the feature vector next to the 'prediction' column.
transformed = model.transform(spark_df_) # assign each row to a cluster (similar to making predictions)
transformed.select('features', 'prediction').show() # display features and predictions

#### 12. Analyze the results

In [0]:
# Group rows by their 'prediction' value, average the columns per group, and
# collect the result to pandas for display.
transformed.groupBy('prediction').mean().toPandas() # analyze the clusters

Unnamed: 0,prediction,avg(tract_to_msamd_income),avg(rate_spread),avg(population),avg(minority_population),avg(number_of_owner_occupied_units),avg(number_of_1_to_4_family_units),avg(loan_amount_000s),avg(hud_median_family_income),avg(applicant_income_000s),avg(sequence_number),avg(census_tract_number),avg(application_date_indicator),avg(respondent_id_index),avg(purchaser_type_name_index),avg(property_type_name_index),avg(preapproval_name_index),avg(owner_occupancy_name_index),avg(msamd_name_index),avg(loan_type_name_index),avg(loan_purpose_name_index),avg(lien_status_name_index),avg(hoepa_status_name_index),avg(denial_reason_name_3_index),avg(denial_reason_name_2_index),avg(denial_reason_name_1_index),avg(county_name_index),avg(co_applicant_sex_name_index),avg(co_applicant_race_name_4_index),avg(co_applicant_race_name_3_index),avg(co_applicant_race_name_2_index),avg(co_applicant_race_name_1_index),avg(co_applicant_ethnicity_name_index),avg(applicant_sex_name_index),avg(applicant_race_name_5_index),avg(applicant_race_name_4_index),avg(applicant_race_name_3_index),avg(applicant_race_name_2_index),avg(applicant_race_name_1_index),avg(applicant_ethnicity_name_index),avg(agency_name_index),avg(agency_abbr_index),avg(census_tract_number_standardized),avg(applicant_income_000s_standardized),avg(sequence_number_standardized),avg(minority_population_standardized),avg(number_of_owner_occupied_units_standardized),avg(rate_spread_standardized),avg(hud_median_family_income_standardized),avg(number_of_1_to_4_family_units_standardized),avg(population_standardized),avg(application_date_indicator_standardized),avg(tract_to_msamd_income_standardized),avg(loan_amount_000s_standardized),avg(prediction)
0,1,117.395206,1.727513,6579.951761,20.813937,1850.164987,2409.645826,300.727366,73408.85724,116.25092,112235.268309,1735.942285,0.21098,48.477912,1.386643,0.041537,0.17427,0.08219,2.666639,0.47276,0.525638,0.179648,2.1e-05,0.005545,0.035202,0.142938,5.789271,0.976511,1e-05,8.4e-05,0.001927,0.9949,0.892824,0.582124,1e-05,7.9e-05,0.000298,0.003979,0.615645,0.406734,0.829919,0.829919,0.115907,0.055273,-0.001973,-0.259533,0.9106,0.000667,-0.261268,0.862134,0.776248,0.006062,0.358569,0.003242,1.0
1,0,99.631662,1.727332,4392.758826,27.495108,1069.433698,1414.782754,296.551635,79137.63467,105.32221,112910.310831,1157.18804,0.204724,47.05606,1.381304,0.035862,0.170937,0.100225,1.487022,0.419789,0.507898,0.177091,3.3e-05,0.006085,0.037376,0.149724,3.624746,0.851209,1.8e-05,8.3e-05,0.001847,0.904801,0.762135,0.647418,1.8e-05,6.5e-05,0.000319,0.005051,0.751801,0.426535,0.747976,0.747976,-0.080327,-0.038306,0.001367,0.179863,-0.63107,-0.000462,0.181066,-0.597482,-0.537961,-0.004201,-0.248498,-0.002247,0.0
