# k-means clustering
The following exercise is taken from <b>Machine Learning with R</b> (Third Edition) by <b>Brett Lantz</b>.

The dataset was originally published by Brett Lantz in research he conducted at the University of Notre Dame.

## Step 1: Collecting the data

In [2]:
# Load the teen SNS dataset straight from the book's GitHub repository.
# stringsAsFactors = TRUE is pinned so that `gender` is read as a factor on
# every R version (R >= 4.0 would otherwise read it as character), matching
# the str() output shown below.
teens <- read.csv(
  "https://raw.githubusercontent.com/PacktPublishing/Machine-Learning-with-R-Third-Edition/master/Chapter09/snsdata.csv",
  stringsAsFactors = TRUE
)

## Step 2: Exploring and Preparing the data

In [3]:
# Inspect the structure (column names, types, first values) of the raw data.
utils::str(object = teens)

'data.frame':	30000 obs. of  40 variables:
 $ gradyear    : int  2006 2006 2006 2006 2006 2006 2006 2006 2006 2006 ...
 $ gender      : Factor w/ 2 levels "F","M": 2 1 2 1 NA 1 1 2 1 1 ...
 $ age         : num  19 18.8 18.3 18.9 19 ...
 $ friends     : int  7 0 69 0 10 142 72 17 52 39 ...
 $ basketball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ football    : int  0 1 1 0 0 0 0 0 0 0 ...
 $ soccer      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ softball    : int  0 0 0 0 0 0 0 1 0 0 ...
 $ volleyball  : int  0 0 0 0 0 0 0 0 0 0 ...
 $ swimming    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cheerleading: int  0 0 0 0 0 0 0 0 0 0 ...
 $ baseball    : int  0 0 0 0 0 0 0 0 0 0 ...
 $ tennis      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ sports      : int  0 0 0 0 0 0 0 0 0 0 ...
 $ cute        : int  0 1 0 1 0 0 0 0 0 1 ...
 $ sex         : int  0 0 0 0 1 1 0 2 0 0 ...
 $ sexy        : int  0 0 0 0 0 0 0 1 0 0 ...
 $ hot         : int  0 0 0 0 0 0 0 0 0 1 ...
 $ kissed      : int  0 0 0 0 5 0 0 0 0 0 ...
 $ dance       : int

In [4]:
# Count the recorded gender values; missing values are not shown by default.
table(teens[["gender"]])


    F     M 
22054  5222 

In [5]:
# Same count, but add an <NA> column whenever missing values are present.
table(teens[["gender"]], useNA = "ifany")


    F     M  <NA> 
22054  5222  2724 

In [6]:
# Five-number summary of age, including the count of missing values.
summary(teens[["age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  3.086  16.312  17.287  17.994  18.259 106.927    5086 

In [8]:
# Keep only ages in the interval [13, 20); anything outside it (and any age
# that was already missing) becomes NA.
teens$age <- with(teens, ifelse(age >= 13 & age < 20, age, NA))
summary(teens[["age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  13.03   16.30   17.27   17.25   18.22   20.00    5523 

### Data preparation - dummy coding missing values

In [10]:
# female: 1 when gender is recorded as "F", 0 otherwise (male or missing).
teens$female <- with(teens, ifelse(!is.na(gender) & gender == "F", 1, 0))
# no_gender: 1 when gender is missing, 0 otherwise.
teens$no_gender <- with(teens, ifelse(is.na(gender), 1, 0))

In [11]:
# Cross-check the dummy variables against the original gender counts.
table(teens[["gender"]], useNA = "ifany")
table(teens[["female"]], useNA = "ifany")
table(teens[["no_gender"]], useNA = "ifany")


    F     M  <NA> 
22054  5222  2724 


    0     1 
 7946 22054 


    0     1 
27276  2724 

### Data preparation - imputing the missing values

In [12]:
# Mean age per graduation year, ignoring missing ages. `TRUE` is spelled out
# because `T` is an ordinary variable that can be reassigned.
aggregate(age ~ gradyear, data = teens, FUN = mean, na.rm = TRUE)

gradyear,age
<int>,<dbl>
2006,18.65586
2007,17.70617
2008,16.7677
2009,15.81957


In [15]:
# For every row, compute the mean age of that row's graduation-year cohort
# (missing ages are excluded from each cohort mean).
ave_age <- ave(
  teens$age,
  teens$gradyear,
  FUN = function(cohort_ages) mean(cohort_ages, na.rm = TRUE)
)
# Fill in only the missing ages with the cohort mean; known ages are kept.
teens$age[is.na(teens$age)] <- ave_age[is.na(teens$age)]
summary(teens[["age"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.03   16.28   17.24   17.24   18.21   20.00 

## Step 3: Training a model on the data

In [18]:
library(stats)
# Columns 5 through 40 are taken as the clustering features (the per-term
# interest counts, starting at `basketball` in the str() output above).
interests <- teens[, 5:40]
# Standardise each feature column with scale() so the columns are comparable.
interests_z <- as.data.frame(
  lapply(interests, function(term_count) scale(term_count))
)
# Compare one feature before and after standardisation.
summary(interests[["basketball"]])
summary(interests_z[["basketball"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.0000  0.2673  0.0000 24.0000 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.3322 -0.3322 -0.3322  0.0000 -0.3322 29.4923 

In [20]:
# Use the R 3.5.2 random number generator so the seed below reproduces the
# clusters shown; this is what triggers the 'Rounding' sampler warning.
RNGversion("3.5.2")
set.seed(seed = 2345)
# Partition the standardised interest data into five clusters.
teen_clusters <- kmeans(x = interests_z, centers = 5)

"non-uniform 'Rounding' sampler used"


## Step 4: Evaluating model performance

In [21]:
# Number of rows assigned to each cluster.
teen_clusters[["size"]]
# Cluster centroids, expressed in the standardised (z-score) feature space.
teen_clusters[["centers"]]

Unnamed: 0,basketball,football,soccer,softball,volleyball,swimming,cheerleading,baseball,tennis,sports,...,blonde,mall,shopping,clothes,hollister,abercrombie,die,death,drunk,drugs
1,0.16001227,0.2364174,0.10385512,0.07232021,0.18897158,0.23970234,0.3931445,0.02993479,0.13532387,0.10257837,...,0.0613734,0.60368108,0.79806891,0.5651537331,4.1521844,3.9649381,0.043475966,0.09857501,0.035614771,0.03443294
2,-0.09195886,0.0652625,-0.09932124,-0.01739428,-0.06219308,0.03339844,-0.1101103,-0.1148751,0.04062204,-0.09899231,...,-0.01146396,-0.08724304,-0.03865318,-0.0003526292,-0.16783,-0.14129577,0.009447317,0.05135888,-0.08677322,-0.06878491
3,0.52755083,0.487348,0.29778605,0.37178877,0.37986175,0.29628671,0.3303485,0.35231971,0.14057808,0.3296713,...,0.03471458,0.48318495,0.66327838,0.375972512,-0.0553846,-0.07417839,0.037989066,0.1197219,-0.009688746,-0.05973769
4,0.34081039,0.3593965,0.1272225,0.16384661,0.110322,0.26943332,0.1856664,0.27527088,0.10980958,0.7971192,...,0.36134138,0.62256686,0.27101815,1.2306917174,0.1610784,0.26324494,1.71218187,0.93631312,1.8973882,2.73326605
5,-0.16695523,-0.1641499,-0.0903352,-0.11367669,-0.11682181,-0.10595448,-0.1136077,-0.10918483,-0.05097057,-0.13135334,...,-0.02918252,-0.18625656,-0.22865236,-0.1865419798,-0.1557662,-0.14861104,-0.09487518,-0.08370729,-0.087520105,-0.11423381


## Step 5: Improving model performance

In [22]:
# Attach each row's cluster assignment to the full data frame.
teens[["cluster"]] <- teen_clusters[["cluster"]]
# Peek at the first five rows alongside a few personal attributes.
head(teens[, c("cluster", "gender", "age", "friends")], n = 5)

Unnamed: 0_level_0,cluster,gender,age,friends
Unnamed: 0_level_1,<int>,<fct>,<dbl>,<int>
1,5,M,18.982,7
2,3,F,18.801,0
3,5,M,18.335,69
4,5,F,18.875,0
5,4,,18.995,10


In [23]:
# Mean age within each cluster.
aggregate(age ~ cluster, data = teens, FUN = mean)

cluster,age
<int>,<dbl>
1,16.86497
2,17.39037
3,17.07656
4,17.11957
5,17.29849


In [24]:
# Proportion of rows flagged female within each cluster (mean of a 0/1 flag).
aggregate(female ~ cluster, data = teens, FUN = mean)

cluster,female
<int>,<dbl>
1,0.8381171
2,0.725
3,0.8378198
4,0.8027079
5,0.6994515


In [25]:
# Mean number of friends within each cluster.
aggregate(friends ~ cluster, data = teens, FUN = mean)

cluster,friends
<int>,<dbl>
1,41.43054
2,32.57333
3,37.16185
4,30.5029
5,27.70052
