In [74]:
# Install the packages used by this notebook, one call per package, in order.
invisible(lapply(
  c("tidyverse", "skimr", "lubridate"),
  function(pkg) install.packages(pkg)
))

In [96]:
library(tidyverse)
library(lubridate)
library(skimr)


**2-Import Data**

In [76]:
# Daily-level exports (one row per subject per day).
daily_activity <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailyActivity_merged.csv"
)
daily_calories <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailyCalories_merged.csv"
)
daily_intensities <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailyIntensities_merged.csv"
)
daily_steps <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/dailySteps_merged.csv"
)

# Hourly-level exports.
hourly_calories <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlyCalories_merged.csv"
)
hourly_intensities <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlyIntensities_merged.csv"
)
hourly_steps <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/hourlySteps_merged.csv"
)

# Sleep and weight logs.
sleep_day <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/sleepDay_merged.csv"
)
weight_loginfo <- read_csv(
  file = "../input/fitbit/Fitabase Data 4.12.16-5.12.16/weightLogInfo_merged.csv"
)


**2-Data cleaning**

In [77]:
# Print a skimr summary for every imported table, one call per table so each
# summary is rendered under its own data-frame name.
# Daily tables
skim(daily_activity)
skim(daily_calories)
skim(daily_intensities)
skim(daily_steps)
# Hourly tables
skim(hourly_calories)
skim(hourly_intensities)
skim(hourly_steps)
# Sleep and weight logs
skim(sleep_day)
skim(weight_loginfo)

In [78]:
# Preview the first six rows of every imported table.
daily_activity %>% head(n = 6L)
daily_calories %>% head(n = 6L)
daily_intensities %>% head(n = 6L)
daily_steps %>% head(n = 6L)
hourly_calories %>% head(n = 6L)
hourly_intensities %>% head(n = 6L)
hourly_steps %>% head(n = 6L)
sleep_day %>% head(n = 6L)
weight_loginfo %>% head(n = 6L)


**Check duplicates**

In [79]:
# Count fully duplicated rows in each table (0 means no duplicates).
daily_activity %>% duplicated() %>% sum()
daily_calories %>% duplicated() %>% sum()
daily_intensities %>% duplicated() %>% sum()
daily_steps %>% duplicated() %>% sum()
hourly_calories %>% duplicated() %>% sum()
hourly_intensities %>% duplicated() %>% sum()
hourly_steps %>% duplicated() %>% sum()
sleep_day %>% duplicated() %>% sum()
weight_loginfo %>% duplicated() %>% sum()


In [80]:
library(tidymodels)

In [81]:
require(devtools)

In [82]:
# Keep only the unique rows of the sleep log.
sleep_day <- distinct(sleep_day)

In [83]:
# Count the duplicated rows still present in the sleep log.
sleep_day %>% duplicated() %>% sum()


In [84]:
# Count missing values (NA cells) in every imported table.

sum(is.na(daily_activity))
sum(is.na(daily_calories))
sum(is.na(daily_intensities))
sum(is.na(daily_steps))
sum(is.na(hourly_calories))
sum(is.na(hourly_intensities))
sum(is.na(hourly_steps))
sum(is.na(sleep_day))
# The weight log is checked too, so that all nine tables are covered like in
# the duplicate check.
sum(is.na(weight_loginfo))



**3- Process**

In [85]:
# Number of distinct subjects (Id values) present in each table.
n_distinct(daily_activity$Id)
n_distinct(daily_calories$Id)
n_distinct(daily_intensities$Id)
n_distinct(daily_steps$Id)
n_distinct(hourly_calories$Id)
# Count hourly_intensities here: this line previously repeated hourly_calories,
# so the intensities table was never checked.
n_distinct(hourly_intensities$Id)
n_distinct(hourly_steps$Id)
n_distinct(sleep_day$Id)
n_distinct(weight_loginfo$Id)


**3-Formatting date and time columns**

In [86]:
# Parse the text date / date-time columns into date-time columns.
# Each table keeps its original text column and gains a parsed one.

# Daily tables: date only.
daily_activity <- daily_activity %>%
  mutate(Activity_Date = parse_date_time(ActivityDate, "%m/%d/%y"))
daily_calories <- daily_calories %>%
  mutate(Activity_Date = parse_date_time(ActivityDay, "%m/%d/%y"))
daily_intensities <- daily_intensities %>%
  mutate(Activity_Date = parse_date_time(ActivityDay, "%m/%d/%y"))
daily_steps <- daily_steps %>%
  mutate(Activity_Date = parse_date_time(ActivityDay, "%m/%d/%y"))

# Hourly tables: date, time and AM/PM marker.
hourly_calories <- hourly_calories %>%
  mutate(Activity_Hour = parse_date_time(ActivityHour, "%m/%d/%y %H:%M:%S, %p"))
hourly_intensities <- hourly_intensities %>%
  mutate(Activity_Hour = parse_date_time(ActivityHour, "%m/%d/%y %H:%M:%S, %p"))
hourly_steps <- hourly_steps %>%
  mutate(Activity_Hour = parse_date_time(ActivityHour, "%m/%d/%y %H:%M:%S, %p"))

# Sleep and weight logs: date, time and AM/PM marker.
sleep_day <- sleep_day %>%
  mutate(sleepDate = parse_date_time(SleepDay, "%m/%d/%y %H:%M:%S, %p"))
weight_loginfo <- weight_loginfo %>%
  mutate(WeightDate = parse_date_time(Date, "%m/%d/%y %H:%M:%S, %p"))


In [87]:
# Inspect the column types of every table after the date parsing.

daily_activity %>% str()
daily_calories %>% str()
daily_intensities %>% str()
daily_steps %>% str()
hourly_calories %>% str()
hourly_intensities %>% str()
hourly_steps %>% str()
sleep_day %>% str()
weight_loginfo %>% str()


**3-MERGE DATASET**

In [88]:
# Merge all hourly data frames into one table.
#
# 1. merge() joins calories and intensities on every column they share.
# 2. left_join() adds the steps, matched on Id and the raw ActivityHour text;
#    other columns present on both sides get .x / .y suffixes.
# 3. separate() splits the raw ActivityHour text into `date` and `time`.
#    The text has three space-separated pieces (date, clock time, AM/PM), so
#    extra = "merge" keeps "clock time AM/PM" together in `time`; without it
#    the AM/PM marker is discarded with a warning.

hourly_data <- merge(hourly_calories, hourly_intensities) %>%
  left_join(hourly_steps, by = c("Id", "ActivityHour")) %>%
  separate(
    ActivityHour,
    sep = " ",
    into = c("date", "time"),
    extra = "merge"
  )



In [89]:

# Preview the merged hourly table and its column types.
hourly_data %>% head(n = 6L)
hourly_data %>% str()


In [90]:
# Convert the text `date` column of the hourly table to a real Date column
# and add a character subject identifier.

# The result is assigned back; previously the mutate() output was only printed
# and the new column was lost. mdy() is used instead of the "%m/%d/%y" format
# because %y reads only two year digits and would misread a four-digit year.
hourly_data <- mutate(hourly_data, date_format = mdy(date))
hourly_data$Subject_Id <- as.character(hourly_data$Id)
glimpse(hourly_data)

In [91]:
# Merge all daily tables step by step; each merge() joins on every column the
# two inputs have in common.

daily_data1 <- daily_activity %>% merge(daily_calories)
daily_data2 <- daily_data1 %>% merge(daily_intensities)
daily_data <- daily_data2 %>% merge(daily_steps)


In [92]:
# Preview the merged daily table and its column types.
daily_data %>% head(n = 6L)
daily_data %>% str()

In [93]:
# Store the Id as a character column named Subject_Id.

daily_data <- daily_data %>%
  mutate(Subject_Id = as.character(Id))
daily_data %>% head(n = 3)







In [100]:
# Drop columns that duplicate information kept in other columns.

daily_data <- daily_data %>%
  select(
    -TotalDistance,
    -Id,
    -StepTotal,
    -ActivityDay,
    -ActivityDate
  )
daily_data %>% head(n = 3)

In [101]:
daily_data %>% head(n = 6L)  # first six rows of the cleaned daily table

In [102]:
sleep_day %>% head(n = 6L)  # first six rows of the sleep log

In [103]:
# Give the sleep log the same key columns as daily_data: a character
# Subject_Id and an Activity_Date copied from the parsed sleepDate.

sleep_day <- sleep_day %>%
  mutate(
    Subject_Id = as.character(Id),
    Activity_Date = sleepDate
  )

In [105]:
# Join the sleep log to the daily data on subject and date; merge() keeps only
# the subject-days present in both tables.

daily_activity_sleep <- merge(
  x = sleep_day,
  y = daily_data,
  by = c("Subject_Id", "Activity_Date")
)
skim(daily_activity_sleep)
daily_activity_sleep %>% head(n = 3)



**Analyze Phase**

**Check average activity pattern of the users**

In [106]:
# Summary statistics for the main activity and sleep measures.
summary(
  select(
    daily_activity_sleep,
    TotalSteps,
    TrackerDistance,
    SedentaryMinutes,
    VeryActiveMinutes,
    FairlyActiveMinutes,
    LightlyActiveMinutes,
    TotalTimeInBed,
    TotalMinutesAsleep
  )
)

In [107]:
sleep_day %>% head(n = 2)  # first two rows of the sleep log

**4- Sleepday across week days**

In [109]:
# Add sleep duration in hours and the weekday name of each sleep record.
sleep_day <- sleep_day %>%
  mutate(
    TotalHourSleep = TotalMinutesAsleep / 60,
    SleepWeekday = weekdays(as.Date(sleepDate))
  )

# Average hours asleep for each weekday.
weeklySleepHour <- summarise(
  group_by(sleep_day, SleepWeekday),
  Average_Sleep_Hour = mean(TotalHourSleep)
)


In [112]:
# Order the bars Sunday to Saturday instead of alphabetically.
# `levels` is spelled out in full; `level =` only worked through partial
# argument matching.
Week_day <- factor(
  weeklySleepHour$SleepWeekday,
  levels = c(
    "Sunday", "Monday", "Tuesday", "Wednesday",
    "Thursday", "Friday", "Saturday"
  )
)

# Average hours of sleep per weekday.

ggplot(weeklySleepHour, aes(x = Week_day, y = Average_Sleep_Hour)) +
  geom_col()



In [113]:
daily_activity_sleep %>% head(n = 2)  # first two rows of the merged table

**4- Group subjects by activity levels**

In [118]:
# Label each subject-day with an activity level and keep the columns needed
# for the later plots.
#
# A row gets a level only when exactly one intensity category is above its
# overall mean and the other three are below; every other row gets NA and is
# removed by drop_na().
#
# transmute() replaces the earlier summarise(): summarise() returning one row
# per input row is deprecated in current dplyr, and transmute() gives the same
# columns in the same order. Each derived column is listed only once.

data_by_usertype <- daily_activity_sleep %>%
  transmute(
    TotalHourInBed = TotalTimeInBed / 60,
    TotalHourSleep = TotalMinutesAsleep / 60,
    activity_level = factor(
      case_when(
        SedentaryMinutes > mean(SedentaryMinutes) &
          LightlyActiveMinutes < mean(LightlyActiveMinutes) &
          FairlyActiveMinutes < mean(FairlyActiveMinutes) &
          VeryActiveMinutes < mean(VeryActiveMinutes) ~ "Sedentary",
        SedentaryMinutes < mean(SedentaryMinutes) &
          LightlyActiveMinutes > mean(LightlyActiveMinutes) &
          FairlyActiveMinutes < mean(FairlyActiveMinutes) &
          VeryActiveMinutes < mean(VeryActiveMinutes) ~ "Lightly Active",
        SedentaryMinutes < mean(SedentaryMinutes) &
          LightlyActiveMinutes < mean(LightlyActiveMinutes) &
          FairlyActiveMinutes > mean(FairlyActiveMinutes) &
          VeryActiveMinutes < mean(VeryActiveMinutes) ~ "Fairly Active",
        SedentaryMinutes < mean(SedentaryMinutes) &
          LightlyActiveMinutes < mean(LightlyActiveMinutes) &
          FairlyActiveMinutes < mean(FairlyActiveMinutes) &
          VeryActiveMinutes > mean(VeryActiveMinutes) ~ "Very Active"
      ),
      levels = c("Sedentary", "Lightly Active", "Fairly Active", "Very Active")
    ),
    Calories,
    TotalSteps,
    TrackerDistance,
    Subject_Id
  ) %>%
  drop_na()
    


In [121]:
# Share of records in each activity level.
# total_percent is a fraction between 0 and 1; the result has the columns
# activity_level and total_percent.

user_percentage <- data_by_usertype %>%
  group_by(activity_level) %>%
  summarise(total = n()) %>%
  mutate(total_percent = total / sum(total)) %>%
  select(activity_level, total_percent)

# Distribution of records across the activity levels.
# `size = 50` was removed from labs(): there it only labels a size aesthetic
# that is not mapped, so it had no visible effect.

ggplot(data = user_percentage) +
  geom_col(
    mapping = aes(x = activity_level, y = total_percent),
    fill = "lightblue"
  ) +
  labs(title = "Percentage of Different Activity level")


**4- Total steps and tracker distance differ across activity groups**

In [123]:
# Total steps and tracker distance per activity level.
# geom_col() stacks one segment per row, so each bar height is the sum over
# all records in that level.
# `size = 50` was removed from labs(): there it only labels a size aesthetic
# that is not mapped, so it had no visible effect.

ggplot(data = data_by_usertype) +
  geom_col(mapping = aes(x = activity_level, y = TotalSteps), fill = "blue") +
  labs(title = "Total steps across different activity levels")

ggplot(data = data_by_usertype) +
  geom_col(
    mapping = aes(x = activity_level, y = TrackerDistance),
    fill = "pink"
  ) +
  labs(title = "Total distance Across different activity levels")


**User Group by Usage Days**

Based on the number of days with recorded activity, users are categorized into different usage groups.

In [126]:
# Count the recorded days per subject and bucket subjects by usage.
#
# The buckets no longer overlap: 1-7 days is "Low Use", 8-21 is
# "Moderate Use", 22-31 is "High Use". Seven days was already "Low Use"
# because case_when() takes the first match. Counts outside 1-31 stay NA.
# n() is used directly (sum(n()) was redundant), `levels` is spelled out in
# full, and the result is returned ungrouped with the columns Subject_Id,
# Use_days and Usage.
user_frequency <- daily_activity_sleep %>%
  group_by(Subject_Id) %>%
  summarise(Use_days = n()) %>%
  mutate(
    Usage = case_when(
      Use_days >= 1 & Use_days <= 7 ~ "Low Use",
      Use_days >= 8 & Use_days <= 21 ~ "Moderate Use",
      Use_days >= 22 & Use_days <= 31 ~ "High Use"
    ),
    Usage = factor(Usage, levels = c("Low Use", "Moderate Use", "High Use"))
  )

head(user_frequency)

In [127]:
# Share of subjects in each usage group.
# total_percent is a fraction between 0 and 1. Subjects without a usage group
# (NA) still count towards the total and are dropped only afterwards.

freq_percentage <- user_frequency %>%
  group_by(Usage) %>%
  summarise(total = n()) %>%
  mutate(total_percent = total / sum(total)) %>%
  select(Usage, total_percent) %>%
  drop_na()

# Distribution of subjects across the usage groups.
# `size = 50` was removed from labs(): there it only labels a size aesthetic
# that is not mapped, so it had no visible effect.
ggplot(data = freq_percentage) +
  geom_col(mapping = aes(x = Usage, y = total_percent), fill = "grey") +
  labs(title = "Activity Level vs Total Percentage")





In [130]:
hourly_data %>% head(n = 2)  # first two rows of the merged hourly table

**Relation between days of the week and average intensity**

In [131]:
# Create the daily intensity dataset: total intensity per subject per day.
# Rows are grouped by calendar day (Activity_Day). Grouping by the hourly
# timestamp Activity_Hour.x put every hour in its own group, so the "daily
# sum" was just the hourly value.
# drop_na() removes rows with a missing value in any column, as before.
Intensities_Day <- hourly_data %>%
  drop_na() %>%
  mutate(Activity_Day = as.Date(Activity_Hour.x)) %>%
  group_by(Subject_Id, Activity_Day) %>%
  summarise(sum_TotalIntensity = sum(TotalIntensity)) %>%
  ungroup()

# Add the weekday name of each day.
Intensities_Day$Intensity_Weekday <- weekdays(Intensities_Day$Activity_Day)


# Average daily total intensity for each day of the week.
Intensities_WeekDay_group <- Intensities_Day %>%
  group_by(Intensity_Weekday) %>%
  drop_na() %>%
  summarise(mean_sum_TotalIntensity = mean(sum_TotalIntensity))



In [132]:
# Average intensity for each day of the week.
# The weekdays are ordered Sunday to Saturday, like the sleep plot, instead of
# alphabetically. This assumes English weekday names, as the sleep plot does.
ggplot(data = Intensities_WeekDay_group) +
  geom_col(
    mapping = aes(
      x = factor(
        Intensity_Weekday,
        levels = c(
          "Sunday", "Monday", "Tuesday", "Wednesday",
          "Thursday", "Friday", "Saturday"
        )
      ),
      y = mean_sum_TotalIntensity,
      fill = mean_sum_TotalIntensity
    )
  ) +
  labs(
    title = "Intensity During a Week",
    x = "Day of the week",
    y = "Average intensity",
    fill = "Intensity"
  )
 

Saturday has the highest Intensity, followed by Tuesday

**Workout Pattern during Day**

In [134]:
# Extract the hour of day ("00" to "23") from the parsed timestamp.
# Activity_Hour.x is already a date-time, so format() is applied directly.
# The earlier as.POSIXct() call received the format string as its `tz`
# argument.
hourly_data$hour <- format(hourly_data$Activity_Hour.x, "%H")

# One row per record with the mean intensity of its timestamp.
# mutate() + select() replaces a summarise() that returned several rows per
# group, which is deprecated in current dplyr. The columns are unchanged:
# Activity_Hour.x, hour, mean_TotalIntensity, StepTotal.
Intensity_hourly <- hourly_data %>%
  drop_na() %>%
  group_by(Activity_Hour.x, hour) %>%
  mutate(mean_TotalIntensity = mean(TotalIntensity)) %>%
  ungroup() %>%
  select(Activity_Hour.x, hour, mean_TotalIntensity, StepTotal) %>%
  arrange(Activity_Hour.x, hour)

# Add the weekday name of each timestamp.
Intensity_hourly$Intensity_Weekday <- weekdays(as.Date(Intensity_hourly$Activity_Hour.x))

# Average intensity per hour of day.
# The data is averaged first. Plotting Intensity_hourly directly stacked one
# bar segment per row, so the bars showed sums although the axis says
# "Average intensity".
hourly_mean_intensity <- Intensity_hourly %>%
  group_by(hour) %>%
  summarise(mean_TotalIntensity = mean(mean_TotalIntensity))

ggplot(data = hourly_mean_intensity) +
  geom_col(mapping = aes(x = hour, y = mean_TotalIntensity)) +
  labs(
    title = "Hourly Intensity in a Day",
    x = "Time(hour)",
    y = "Average intensity"
  )
  

Most people prefer to work out in the evening, between 5 and 7 pm.