Classifying Pulsar Stars

In [1]:
set.seed(2001)
library(tidyverse)
library(tidymodels)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.3     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.5.0

“package ‘ggplot2’ was built under R version 4.0.1”
“package ‘tibble’ was built under R version 4.0.2”
“package ‘tidyr’ was built under R version 4.0.2”
“package ‘dplyr’ was built under R version 4.0.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

“package ‘tidymodels’ was built under R version 4.0.2”
── [1mAttaching packages[22m ────────────────────────────────────── tidymodels 0.1.1 ──

[32m✔

Preliminary exploratory data analysis

In [9]:
#reading the pulsar dataset directly from web
url <- "https://archive.ics.uci.edu/ml/machine-learning-databases/00372/HTRU2.zip"

temp <- tempfile()
download.file(url,temp)
pulsar_data <- read_csv(unz(temp, "HTRU_2.csv"))
unlink(temp)

pulsar_data

Parsed with column specification:
cols(
  `140.5625` = [32mcol_double()[39m,
  `55.68378214` = [32mcol_double()[39m,
  `-0.234571412` = [32mcol_double()[39m,
  `-0.699648398` = [32mcol_double()[39m,
  `3.199832776` = [32mcol_double()[39m,
  `19.11042633` = [32mcol_double()[39m,
  `7.975531794` = [32mcol_double()[39m,
  `74.24222492` = [32mcol_double()[39m,
  `0` = [32mcol_double()[39m
)



140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
102.50781,58.88243,0.465318154,-0.51508791,1.6772575,14.860146,10.576487,127.39358,0
103.01562,39.34165,0.323328365,1.05116443,3.1212375,21.744669,7.735822,63.17191,0
136.75000,57.17845,-0.068414638,-0.63623837,3.6429766,20.959280,6.896499,53.59366,0
88.72656,40.67223,0.600866079,1.12349169,1.1789298,11.468720,14.269573,252.56731,0
93.57031,46.69811,0.531904850,0.41672112,1.6362876,14.545074,10.621748,131.39400,0
119.48438,48.76506,0.031460220,-0.11216757,0.9991639,9.279612,19.206230,479.75657,0
130.38281,39.84406,-0.158322759,0.38954045,1.2207358,14.378941,13.539456,198.23646,0
107.25000,52.62708,0.452688025,0.17034738,2.3319398,14.486853,9.001004,107.97251,0
107.25781,39.49649,0.465881961,1.16287712,4.0794314,24.980418,7.397080,57.78474,0
142.07812,45.28807,-0.320328426,0.28395251,5.3762542,29.009897,6.076266,37.83139,0


In [10]:
#added column names
colnames(pulsar_data) <- c("mean_pf", "std_pf", "kurt_pf", "skew_pf", "mean_dm", "std_dm", "kurt_dm", "skew_dm", "class")

pulsar_mutate <- pulsar_data %>%
                mutate(class = as_factor(class))
pulsar_mutate

mean_pf,std_pf,kurt_pf,skew_pf,mean_dm,std_dm,kurt_dm,skew_dm,class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
102.50781,58.88243,0.465318154,-0.51508791,1.6772575,14.860146,10.576487,127.39358,0
103.01562,39.34165,0.323328365,1.05116443,3.1212375,21.744669,7.735822,63.17191,0
136.75000,57.17845,-0.068414638,-0.63623837,3.6429766,20.959280,6.896499,53.59366,0
88.72656,40.67223,0.600866079,1.12349169,1.1789298,11.468720,14.269573,252.56731,0
93.57031,46.69811,0.531904850,0.41672112,1.6362876,14.545074,10.621748,131.39400,0
119.48438,48.76506,0.031460220,-0.11216757,0.9991639,9.279612,19.206230,479.75657,0
130.38281,39.84406,-0.158322759,0.38954045,1.2207358,14.378941,13.539456,198.23646,0
107.25000,52.62708,0.452688025,0.17034738,2.3319398,14.486853,9.001004,107.97251,0
107.25781,39.49649,0.465881961,1.16287712,4.0794314,24.980418,7.397080,57.78474,0
142.07812,45.28807,-0.320328426,0.28395251,5.3762542,29.009897,6.076266,37.83139,0


In [11]:
pulsar_split <- initial_split(pulsar_mutate, prop = 0.75, strata = class)  
pulsar_train <- training(pulsar_split)   
pulsar_test <- testing(pulsar_split)
pulsar_train
pulsar_test

mean_pf,std_pf,kurt_pf,skew_pf,mean_dm,std_dm,kurt_dm,skew_dm,class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
103.01562,39.34165,0.323328365,1.05116443,3.1212375,21.744669,7.735822,63.17191,0
136.75000,57.17845,-0.068414638,-0.63623837,3.6429766,20.959280,6.896499,53.59366,0
88.72656,40.67223,0.600866079,1.12349169,1.1789298,11.468720,14.269573,252.56731,0
93.57031,46.69811,0.531904850,0.41672112,1.6362876,14.545074,10.621748,131.39400,0
119.48438,48.76506,0.031460220,-0.11216757,0.9991639,9.279612,19.206230,479.75657,0
130.38281,39.84406,-0.158322759,0.38954045,1.2207358,14.378941,13.539456,198.23646,0
107.25000,52.62708,0.452688025,0.17034738,2.3319398,14.486853,9.001004,107.97251,0
107.25781,39.49649,0.465881961,1.16287712,4.0794314,24.980418,7.397080,57.78474,0
142.07812,45.28807,-0.320328426,0.28395251,5.3762542,29.009897,6.076266,37.83139,0
133.25781,44.05824,-0.081059862,0.11536151,1.6321070,12.007806,11.972067,195.54345,0


mean_pf,std_pf,kurt_pf,skew_pf,mean_dm,std_dm,kurt_dm,skew_dm,class
<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
102.50781,58.88243,0.46531815,-0.51508791,1.677258,14.86015,10.5764867,127.393580,0
99.36719,41.57220,1.54719697,4.15410604,27.555184,61.71902,2.2088080,3.662680,1
105.44531,41.13997,0.14265380,0.32041968,3.551839,20.75502,7.7395523,68.519771,0
95.86719,42.05992,0.32638692,0.80350179,1.832776,12.24897,11.2493310,177.230771,0
106.64844,56.36718,0.37835507,-0.26637161,2.436455,18.40537,9.3786597,96.860225,0
112.71875,50.30127,0.27939095,-0.12901071,8.281773,37.81001,4.6918269,21.276210,0
110.10938,41.31817,0.09486040,0.68311261,1.010033,13.02628,14.6665108,231.204136,0
142.05469,53.87316,-0.47077269,-0.12594642,4.423077,27.08351,6.6816583,45.944030,0
107.87500,37.33066,0.49600476,1.48181586,1.173913,12.01691,14.5342897,252.694738,0
106.28125,43.02179,0.40886801,1.03201467,1.610368,17.25116,12.1101933,152.014956,0


Using only training data, summarize the data in at least one table (this is exploratory data analysis). An example of a useful table could be one that reports the number of observations in each class, the means of the predictor variables you plan to use in your analysis and how many rows have missing data. 

In [14]:
#number of observations in each class
training_count <- pulsar_train %>%
                group_by(class) %>%
                summarize (n = n())
training_count

`summarise()` ungrouping output (override with `.groups` argument)



class,n
<fct>,<int>
0,12178
1,1245
