-
Notifications
You must be signed in to change notification settings - Fork 1
/
Team Random - Code.R
1655 lines (1336 loc) · 74.1 KB
/
Team Random - Code.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Start from an empty workspace so objects from earlier sessions cannot leak in
rm(list = ls())
# Relative file names below are resolved against this project directory
setwd("~/Desktop/IAS")
library(data.table)
# Load the training set (the test-set lines stay disabled)
data <- fread("Training.csv", header = TRUE)
# data1 <- fread("Test.csv", header = TRUE)
str(data)
# Convert the categorical columns to factors in a single pass
factor_cols <- c("Education_Level", "Age_Range", "Employment_Status", "Gender", "Year")
data[, (factor_cols) := lapply(.SD, as.factor), .SDcols = factor_cols]
# test_factor_cols <- c("Education_Level", "Age_Range", "Gender", "Year")
# data1[, (test_factor_cols) := lapply(.SD, as.factor), .SDcols = test_factor_cols]
# Confirm the new column types
str(data)
#/
#/
#/
# Cleaning data process - step 1: profile the data by employment status
summary(data)
summary(data$Employment_Status)
# Employed: recorded run showed a minimum of 0 for both weekly hours worked
# and weekly earnings, which is inconsistent with being employed.
subset1 <- data[Employment_Status == "Employed", ]
summary(subset1)
# Unemployed: recorded run showed weekly hours and earnings all 0, as expected.
subset2 <- data[Employment_Status == "Unemployed", ]
summary(subset2)
# Not in labor force: recorded run also showed all 0. Open question: is there
# any factor that differentiates subset2 from subset3?
subset3 <- data[Employment_Status == "Not in labor force", ]
summary(subset3)
#/
#/
#/
# Cleaning data process - step 2: count and drop inconsistent "Employed" rows
# Observations per year (recorded: 7999 for 2005, 8001 for each of 2006-2012)
table(data[, "Year"])
# Building blocks for the three inconsistent patterns
employed_flag <- data$Employment_Status == "Employed"
zero_earnings <- data$Weekly_Earnings == 0
some_earnings <- data$Weekly_Earnings > 0
zero_hours <- data$Weekly_Hours_Worked == 0
some_hours <- data$Weekly_Hours_Worked > 0
# Pattern 1: employed with neither earnings nor hours
idle_rows <- employed_flag & zero_earnings & zero_hours
nrow(data[idle_rows, ])
# Recorded: 599 rows to re-label or drop
table(data[idle_rows, "Year"])
# Recorded by year, 2005-2012: 51 62 53 69 81 106 87 90
# Pattern 2: employed with earnings but no hours
paid_no_hours_rows <- employed_flag & some_earnings & zero_hours
nrow(data[paid_no_hours_rows, ])
# Recorded: 1533 rows to re-label or drop
table(data[paid_no_hours_rows, "Year"])
# Recorded by year, 2005-2012: 125 163 160 197 229 234 218 207
# Pattern 3: employed with hours but no earnings
hours_no_pay_rows <- employed_flag & zero_earnings & some_hours
nrow(data[hours_no_pay_rows, ])
# Recorded: 4181 rows to re-label or drop
table(data[hours_no_pay_rows, "Year"])
# Recorded by year, 2005-2012: 612 539 549 539 496 502 444 500
# The dirty rows are below 10% (<800) of a year's observations, so they are
# dropped rather than re-labelled. All three patterns are removed in one filter.
newdata <- data[!(idle_rows | hours_no_pay_rows | paid_no_hours_rows), ]
# Yearly medians of weekly earnings and weekly hours worked among the Employed.
# median() is called directly: taking element 3 of summary() depended on the
# position of the median in the summary and leaked a "Median :" label into the
# printed message (see the figures recorded from the earlier run below).
employed_rows <- newdata[newdata$Employment_Status == "Employed", ]
for (year in 2005:2012) {
  yearly_earnings <- employed_rows$Weekly_Earnings[which(employed_rows$Year == year)]
  print(paste("The median for weekly earnings", year, "is", median(yearly_earnings, na.rm = TRUE)))
}
# Recorded medians, 2005-2012: 648, 650, 692, 720, 701, 750, 750, 760
for (year in 2005:2012) {
  yearly_hours <- employed_rows$Weekly_Hours_Worked[which(employed_rows$Year == year)]
  print(paste("The median for weekly hours worked for", year, "is", median(yearly_hours, na.rm = TRUE)))
}
# Recorded medians, 2005-2012: 40 hours in every year
# Descriptive analysis of the activity columns (columns 11 to 24 of newdata)
activity_names <- names(newdata)[11:24]
# Mean minutes per activity and year, rounded up. The message now names the
# activity before the year; it previously read "for <year> is <activity> is".
for (year in 2005:2012) {
  yearly_rows <- newdata[newdata$Year == year, 11:24]
  yearly_means <- vapply(yearly_rows, mean, numeric(1), na.rm = TRUE)
  print(paste("The mean for", activity_names, "in", year, "is", ceiling(yearly_means), "minutes"))
}
# Median minutes per activity and year, rounded up
for (year in 2005:2012) {
  yearly_rows <- newdata[newdata$Year == year, 11:24]
  yearly_medians <- vapply(yearly_rows, median, numeric(1), na.rm = TRUE)
  print(paste("The median for", activity_names, "in", year, "is", ceiling(yearly_medians), "minutes"))
}
names(newdata)
# Drop the identifier and the columns not wanted in the grouped description
newdata1 <- newdata[, -c("Id", "Age_Range", "Total")]
library(psych)
# describeBy() is the supported replacement for the deprecated describe.by().
# NOTE(review): the result keeps its original name `summary`, which shadows
# the name of base::summary(); calls such as summary(x) still resolve to the
# function, but renaming is advisable once later uses have been checked.
summary <- describeBy(newdata1, list(newdata$Employment_Status, newdata1$Year), mat = TRUE, digits = 2)
# Frequencies and cross-tabulations for the categorical variables
str(newdata)
# Three-way table: Employment Status x Education Level x Year
mytable2 <- xtabs(~newdata$Employment_Status + newdata$Education_Level + newdata$Year)
ftable(mytable2)
summary(mytable2) # Chi-square test: some groups have too few observations, so education levels are regrouped below
# Reclassification: work on plain character labels before regrouping
newdata$Education_Level <- as.character(newdata$Education_Level)
# data1$Education_Level <- as.character(data1$Education_Level)
# Placeholder; overwritten by the regrouped labels further down
New_Education <- NULL
# Collapse detailed education labels into four broad groups.
#
# Education_Level: character (or factor) vector of education labels.
# Returns a character vector of the same length holding "High School",
# "Undergraduate Studies", "Postgraduate Studies", or, for any other label,
# "Some College Education". Missing labels stay NA instead of raising an
# error, and zero-length input returns character(0).
classEducation <- function(Education_Level) {
  level <- as.character(Education_Level)
  high_school <- c("9th grade", "10th grade", "11th grade", "12th grade", "High School")
  undergraduate <- c("Associate Degree", "Bachelor")
  postgraduate <- c("Master", "Doctoral Degree", "Prof. Degree")
  # Start from the catch-all group, then overwrite the recognised labels
  group <- rep("Some College Education", length(level))
  group[level %in% high_school] <- "High School"
  group[level %in% undergraduate] <- "Undergraduate Studies"
  group[level %in% postgraduate] <- "Postgraduate Studies"
  group[is.na(level)] <- NA_character_
  group
}
# Regroup every row's education label; vapply() guarantees one character value per row
New_Education <- vapply(newdata$Education_Level, classEducation, character(1), USE.NAMES = TRUE)
# New_Education <- vapply(data1$Education_Level, classEducation, character(1), USE.NAMES = TRUE)
newdata$Education_Level <- as.factor(New_Education)
# data1$Education_Level <- as.factor(New_Education)
# Row counts per regrouped level. table(unique(...)) was used here before, but
# it reports 1 for every level regardless of how many rows fall in it.
table(newdata$Education_Level)
str(newdata)
# Three-way table after regrouping: Employment Status x Education Level x Year
mytable3 <- xtabs(~newdata$Employment_Status + newdata$Education_Level + newdata$Year)
ftable(mytable3)
summary(mytable3)
# Two-way table: Employment Status x Education Level
mytable4 <- xtabs(~newdata$Employment_Status + newdata$Education_Level)
ftable(mytable4)
summary(mytable4)
library(gmodels)
# Same two-way table via gmodels::CrossTable; the returned object is stored
# and then printed explicitly
crosstable <- CrossTable(newdata$Employment_Status, newdata$Education_Level)
print(crosstable)
# Correlation checks
# Correlation matrix of the time-use variables in columns 11 to 24
# (described in the original notes as running from Sleeping to Volunteering)
library(corrplot)
# Lower triangle only, no diagonal, coefficients printed in black
corrplot(cor(newdata[,11:24]),type = "lower",diag = FALSE, addCoef.col = "black")
# Recorded findings - notable correlations:
# 1) Caring for children - Parenting children - correlation = 0.69
#    NOTE(review): "Parenting children" presumably refers to Playing_with_Children - confirm
# 2) Socializing and Relaxing - Television - correlation = 0.71
# Significance tests on the 14 time-use columns
library(nortest)
activity_cols <- c(
  "Sleeping", "Grooming", "Housework", "Food _Drink_Prep",
  "Caring_for_Children", "Playing_with_Children", "Job_Searching",
  "Shopping", "Eating_and_Drinking", "Socializing_Relaxing", "Television",
  "Golfing", "Running", "Volunteering"
)
activities <- newdata[, activity_cols, with = FALSE]
# Normality: Cramer-von Mises test for each activity
lapply(activities, function(x) cvm.test(x))
# Gender, parametric: two-sample t-test with equal variances (assumes normality)
lapply(activities, function(x) t.test(x ~ newdata$Gender, var.equal = TRUE))
# Recorded finding: every activity differs in mean between genders, so Gender
# is a significant variable.
# Gender, non-parametric: Wilcoxon rank-sum test (no normality assumption)
lapply(activities, function(x) wilcox.test(x ~ newdata$Gender))
# Recorded finding: males and females do not share the same continuous distribution.
# Employment status: one-way ANOVA across the three groups
lapply(activities, function(x) anova(lm(x ~ newdata$Employment_Status)))
# Recorded finding: Golfing and Shopping show no significant difference in
# mean across the employment-status groups.
# Shopping against the other categorical variables
lapply(newdata[, "Shopping"], function(x) anova(lm(x ~ newdata$Gender)))
lapply(newdata[, "Shopping"], function(x) anova(lm(x ~ newdata$Age_Range)))
lapply(newdata[, "Shopping"], function(x) anova(lm(x ~ newdata$Education_Level)))
# Childcare against the same categorical variables
lapply(newdata[, "Caring_for_Children"], function(x) anova(lm(x ~ newdata$Gender)))
lapply(newdata[, "Caring_for_Children"], function(x) anova(lm(x ~ newdata$Age_Range)))
lapply(newdata[, "Caring_for_Children"], function(x) anova(lm(x ~ newdata$Education_Level)))
# Time-spent patterns: Sleeping
# Both tails are trimmed at the 98th and 2nd percentile of all rows.
hist(newdata$Sleeping, breaks = 12, col = "red")
summary(newdata$Sleeping)
quantile(newdata$Sleeping, 0.98)
# Upper tail: the recorded run had 1257 rows at or above 840 mins (14 hours)
sum(newdata$Sleeping >= 840)
newdata <- newdata[newdata$Sleeping <= quantile(newdata$Sleeping, 0.98), ]
# Lower tail: averaging under about 4 hours of sleep is treated as equally implausible
quantile(newdata$Sleeping, 0.02)
sum(newdata$Sleeping <= quantile(newdata$Sleeping, 0.02)) # recorded: 1147 rows
newdata <- newdata[newdata$Sleeping >= quantile(newdata$Sleeping, 0.02), ]
hist(newdata$Sleeping, breaks = 12, col = "red")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Sleeping, breaks = 12, col = "red")
  print(summary(data1$Sleeping))
  print(quantile(data1$Sleeping, 0.98))
  print(sum(data1$Sleeping >= 840))
  data1 <- data1[data1$Sleeping <= quantile(data1$Sleeping, 0.98), ]
  print(quantile(data1$Sleeping, 0.02))
  print(sum(data1$Sleeping <= quantile(data1$Sleeping, 0.02)))
  data1 <- data1[data1$Sleeping >= quantile(data1$Sleeping, 0.02), ]
  hist(data1$Sleeping, breaks = 12, col = "red")
}
# Grooming
# Grooming is treated as leisure rather than a necessity, so cut-offs come
# from the non-zero observations and zero rows are always kept.
hist(newdata$Grooming, breaks = 12, col = "yellow")
grooming <- newdata[which(newdata$Grooming != 0), ]
summary(grooming$Grooming) # recorded max: 1043 mins, deemed unrealistic
quantile(grooming$Grooming, 0.98) # recorded: 140 mins
sum(grooming$Grooming >= quantile(grooming$Grooming, 0.98)) # recorded: 908 rows
newdata <- newdata[newdata$Grooming <= quantile(grooming$Grooming, 0.98), ]
# Lower cut-off uses the 1st percentile here (recorded: 5 mins), hard-coded below
quantile(grooming$Grooming, 0.01)
newdata <- newdata[newdata$Grooming >= 5 | newdata$Grooming == 0, ]
hist(newdata$Grooming, breaks = 12, col = "yellow")
grooming <- grooming[grooming$Grooming <= 140 & grooming$Grooming >= 5, ]
hist(grooming$Grooming, breaks = 12, col = "yellow")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Grooming, breaks = 12, col = "yellow")
  grooming <- data1[which(data1$Grooming != 0), ]
  print(summary(grooming$Grooming))
  print(quantile(grooming$Grooming, 0.98))
  print(sum(grooming$Grooming >= quantile(grooming$Grooming, 0.98)))
  data1 <- data1[data1$Grooming <= quantile(grooming$Grooming, 0.98), ]
  print(quantile(grooming$Grooming, 0.01))
  data1 <- data1[data1$Grooming >= 5 | data1$Grooming == 0, ]
  hist(data1$Grooming, breaks = 12, col = "yellow")
  grooming <- grooming[grooming$Grooming <= 140 & grooming$Grooming >= 5, ]
  hist(grooming$Grooming, breaks = 12, col = "yellow")
}
# Housework
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Housework, breaks = 12, col = "green")
housework <- newdata[which(newdata$Housework != 0), ]
summary(housework$Housework) # recorded max: 1030 mins, deemed impossible
quantile(housework$Housework, 0.98) # recorded as about 400 mins; 408.04 is hard-coded below
sum(housework$Housework >= quantile(housework$Housework, 0.98)) # recorded: 447 rows
newdata <- newdata[newdata$Housework <= quantile(housework$Housework, 0.98), ]
quantile(housework$Housework, 0.02) # recorded: 5 mins, hard-coded below
newdata <- newdata[newdata$Housework >= 5 | newdata$Housework == 0, ]
housework <- housework[housework$Housework <= 408.04 & housework$Housework >= 5, ]
hist(housework$Housework, breaks = 12, col = "green")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Housework, breaks = 12, col = "green")
  housework <- data1[which(data1$Housework != 0), ]
  print(summary(housework$Housework))
  print(quantile(housework$Housework, 0.98))
  print(sum(housework$Housework >= quantile(housework$Housework, 0.98)))
  data1 <- data1[data1$Housework <= quantile(housework$Housework, 0.98), ]
  print(quantile(housework$Housework, 0.02))
  data1 <- data1[data1$Housework >= 5 | data1$Housework == 0, ]
  housework <- housework[housework$Housework <= 408.04 & housework$Housework >= 5, ]
  hist(housework$Housework, breaks = 12, col = "green")
}
# Food _Drink_Prep (the column name contains a space, hence the backticks)
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$`Food _Drink_Prep`, breaks = 12, col = "cyan")
Food_Drink_Prep <- newdata[which(newdata$`Food _Drink_Prep` != 0), ]
summary(Food_Drink_Prep$`Food _Drink_Prep`) # recorded max: 755 mins, deemed impossible
quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98) # recorded: 230 mins
sum(Food_Drink_Prep$`Food _Drink_Prep` >= quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98)) # recorded: 630 rows
newdata <- newdata[newdata$`Food _Drink_Prep` <= quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98), ]
quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.02) # recorded: 4 mins, hard-coded below
newdata <- newdata[newdata$`Food _Drink_Prep` >= 4 | newdata$`Food _Drink_Prep` == 0, ]
Food_Drink_Prep <- Food_Drink_Prep[
  Food_Drink_Prep$`Food _Drink_Prep` <= 230 & Food_Drink_Prep$`Food _Drink_Prep` >= 4,
]
hist(Food_Drink_Prep$`Food _Drink_Prep`, breaks = 12, col = "cyan")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$`Food _Drink_Prep`, breaks = 12, col = "cyan")
  Food_Drink_Prep <- data1[which(data1$`Food _Drink_Prep` != 0), ]
  print(summary(Food_Drink_Prep$`Food _Drink_Prep`))
  print(quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98))
  print(sum(Food_Drink_Prep$`Food _Drink_Prep` >= quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98)))
  data1 <- data1[data1$`Food _Drink_Prep` <= quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.98), ]
  print(quantile(Food_Drink_Prep$`Food _Drink_Prep`, 0.02))
  data1 <- data1[data1$`Food _Drink_Prep` >= 4 | data1$`Food _Drink_Prep` == 0, ]
  Food_Drink_Prep <- Food_Drink_Prep[
    Food_Drink_Prep$`Food _Drink_Prep` <= 230 & Food_Drink_Prep$`Food _Drink_Prep` >= 4,
  ]
  hist(Food_Drink_Prep$`Food _Drink_Prep`, breaks = 12, col = "cyan")
}
# Caring_for_Children
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Caring_for_Children, breaks = 12, col = "blue")
Caring <- newdata[which(newdata$Caring_for_Children != 0), ]
summary(Caring$Caring_for_Children) # recorded max: 973 mins, deemed impossible
quantile(Caring$Caring_for_Children, 0.98) # recorded: 435 mins
sum(Caring$Caring_for_Children >= quantile(Caring$Caring_for_Children, 0.98)) # recorded: 289 rows
newdata <- newdata[newdata$Caring_for_Children <= quantile(Caring$Caring_for_Children, 0.98), ]
quantile(Caring$Caring_for_Children, 0.02) # recorded: 2 mins, hard-coded below
newdata <- newdata[newdata$Caring_for_Children >= 2 | newdata$Caring_for_Children == 0, ]
Caring <- Caring[Caring$Caring_for_Children <= 435 & Caring$Caring_for_Children >= 2, ]
hist(Caring$Caring_for_Children, breaks = 12, col = "blue")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Caring_for_Children, breaks = 12, col = "blue")
  Caring <- data1[which(data1$Caring_for_Children != 0), ]
  print(summary(Caring$Caring_for_Children))
  print(quantile(Caring$Caring_for_Children, 0.98))
  print(sum(Caring$Caring_for_Children >= quantile(Caring$Caring_for_Children, 0.98)))
  data1 <- data1[data1$Caring_for_Children <= quantile(Caring$Caring_for_Children, 0.98), ]
  print(quantile(Caring$Caring_for_Children, 0.02))
  data1 <- data1[data1$Caring_for_Children >= 2 | data1$Caring_for_Children == 0, ]
  Caring <- Caring[Caring$Caring_for_Children <= 435 & Caring$Caring_for_Children >= 2, ]
  hist(Caring$Caring_for_Children, breaks = 12, col = "blue")
}
# Playing with Children
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Playing_with_Children, breaks = 12, col = "orange")
Playing <- newdata[which(newdata$Playing_with_Children != 0), ]
summary(Playing$Playing_with_Children) # recorded max: 420 mins, possible but highly unlikely
quantile(Playing$Playing_with_Children, 0.98) # recorded: 305 mins
sum(Playing$Playing_with_Children >= quantile(Playing$Playing_with_Children, 0.98)) # recorded: 82 rows
newdata <- newdata[newdata$Playing_with_Children <= quantile(Playing$Playing_with_Children, 0.98), ]
quantile(Playing$Playing_with_Children, 0.02) # recorded: 15 mins, hard-coded below
newdata <- newdata[newdata$Playing_with_Children >= 15 | newdata$Playing_with_Children == 0, ]
Playing <- Playing[Playing$Playing_with_Children <= 305 & Playing$Playing_with_Children >= 15, ]
hist(Playing$Playing_with_Children, breaks = 12, col = "orange")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Playing_with_Children, breaks = 12, col = "orange")
  Playing <- data1[which(data1$Playing_with_Children != 0), ]
  print(summary(Playing$Playing_with_Children))
  print(quantile(Playing$Playing_with_Children, 0.98))
  print(sum(Playing$Playing_with_Children >= quantile(Playing$Playing_with_Children, 0.98)))
  data1 <- data1[data1$Playing_with_Children <= quantile(Playing$Playing_with_Children, 0.98), ]
  print(quantile(Playing$Playing_with_Children, 0.02))
  data1 <- data1[data1$Playing_with_Children >= 15 | data1$Playing_with_Children == 0, ]
  Playing <- Playing[Playing$Playing_with_Children <= 305 & Playing$Playing_with_Children >= 15, ]
  hist(Playing$Playing_with_Children, breaks = 12, col = "orange")
}
# Job Searching
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Job_Searching, breaks = 12, col = "pink")
Job_Search <- newdata[which(newdata$Job_Searching != 0), ]
summary(Job_Search$Job_Searching) # recorded max: 983 mins, possible but highly unlikely
quantile(Job_Search$Job_Searching, 0.98) # recorded: 545.52 mins
sum(Job_Search$Job_Searching >= quantile(Job_Search$Job_Searching, 0.98)) # recorded: 15 rows
newdata <- newdata[newdata$Job_Searching <= quantile(Job_Search$Job_Searching, 0.98), ]
quantile(Job_Search$Job_Searching, 0.02) # recorded: 7 mins, hard-coded below
newdata <- newdata[newdata$Job_Searching >= 7 | newdata$Job_Searching == 0, ]
Job_Search <- Job_Search[Job_Search$Job_Searching <= 545.52 & Job_Search$Job_Searching >= 7, ]
hist(Job_Search$Job_Searching, breaks = 12, col = "pink")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  print(names(data1))
  hist(data1$Job_Searching, breaks = 12, col = "pink")
  Job_Search <- data1[which(data1$Job_Searching != 0), ]
  print(summary(Job_Search$Job_Searching))
  print(quantile(Job_Search$Job_Searching, 0.98))
  print(sum(Job_Search$Job_Searching >= quantile(Job_Search$Job_Searching, 0.98)))
  data1 <- data1[data1$Job_Searching <= quantile(Job_Search$Job_Searching, 0.98), ]
  print(quantile(Job_Search$Job_Searching, 0.02))
  data1 <- data1[data1$Job_Searching >= 7 | data1$Job_Searching == 0, ]
  Job_Search <- Job_Search[Job_Search$Job_Searching <= 545.52 & Job_Search$Job_Searching >= 7, ]
  hist(Job_Search$Job_Searching, breaks = 12, col = "pink")
}
# Shopping
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Shopping, breaks = 12, col = "magenta")
Shop <- newdata[which(newdata$Shopping != 0), ]
summary(Shop$Shopping) # recorded max: 640 mins, possible but highly unlikely
quantile(Shop$Shopping, 0.98) # recorded: 240 mins
sum(Shop$Shopping >= quantile(Shop$Shopping, 0.98)) # recorded: 506 rows
newdata <- newdata[newdata$Shopping <= quantile(Shop$Shopping, 0.98), ]
quantile(Shop$Shopping, 0.02) # recorded: 3 mins, hard-coded below
newdata <- newdata[newdata$Shopping >= 3 | newdata$Shopping == 0, ]
Shop <- Shop[
  Shop$Shopping <= quantile(Shop$Shopping, 0.98) &
    Shop$Shopping >= quantile(Shop$Shopping, 0.02),
]
hist(Shop$Shopping, breaks = 12, col = "magenta")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  print(names(data1))
  hist(data1$Shopping, breaks = 12, col = "magenta")
  Shop <- data1[which(data1$Shopping != 0), ]
  print(summary(Shop$Shopping))
  print(quantile(Shop$Shopping, 0.98))
  print(sum(Shop$Shopping >= quantile(Shop$Shopping, 0.98)))
  data1 <- data1[data1$Shopping <= quantile(Shop$Shopping, 0.98), ]
  print(quantile(Shop$Shopping, 0.02))
  data1 <- data1[data1$Shopping >= 3 | data1$Shopping == 0, ]
  Shop <- Shop[
    Shop$Shopping <= quantile(Shop$Shopping, 0.98) &
      Shop$Shopping >= quantile(Shop$Shopping, 0.02),
  ]
  hist(Shop$Shopping, breaks = 12, col = "magenta")
}
# Eating_and_Drinking
# Like sleeping, this is a necessity: nobody can spend 0 mins eating and
# drinking, so rows below the lower cut-off are dropped with no exception
# for zero.
names(newdata)
hist(newdata$Eating_and_Drinking, breaks = 12, col = "salmon")
EatDrink <- newdata[which(newdata$Eating_and_Drinking != 0), ]
summary(EatDrink$Eating_and_Drinking) # recorded max: 895 mins, deemed impossible
quantile(EatDrink$Eating_and_Drinking, 0.98) # recorded: 210 mins
sum(EatDrink$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.98)) # recorded: 1080 rows
newdata <- newdata[newdata$Eating_and_Drinking <= quantile(EatDrink$Eating_and_Drinking, 0.98), ]
quantile(EatDrink$Eating_and_Drinking, 0.02) # recorded: 10 mins
newdata <- newdata[newdata$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.02), ]
EatDrink <- EatDrink[
  EatDrink$Eating_and_Drinking <= quantile(EatDrink$Eating_and_Drinking, 0.98) &
    EatDrink$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.02),
]
hist(EatDrink$Eating_and_Drinking, breaks = 12, col = "salmon")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  hist(data1$Eating_and_Drinking, breaks = 12, col = "salmon")
  EatDrink <- data1[which(data1$Eating_and_Drinking != 0), ]
  print(summary(EatDrink$Eating_and_Drinking))
  print(quantile(EatDrink$Eating_and_Drinking, 0.98))
  print(sum(EatDrink$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.98)))
  data1 <- data1[data1$Eating_and_Drinking <= quantile(EatDrink$Eating_and_Drinking, 0.98), ]
  print(quantile(EatDrink$Eating_and_Drinking, 0.02))
  data1 <- data1[data1$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.02), ]
  EatDrink <- EatDrink[
    EatDrink$Eating_and_Drinking <= quantile(EatDrink$Eating_and_Drinking, 0.98) &
      EatDrink$Eating_and_Drinking >= quantile(EatDrink$Eating_and_Drinking, 0.02),
  ]
  hist(EatDrink$Eating_and_Drinking, breaks = 12, col = "salmon")
}
# Socializing_Relaxing
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Socializing_Relaxing, breaks = 12, col = "lightseagreen")
social <- newdata[which(newdata$Socializing_Relaxing != 0), ]
summary(social$Socializing_Relaxing) # recorded max: 1180 mins, deemed impossible
quantile(social$Socializing_Relaxing, 0.98) # recorded: 797 mins
sum(social$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.98)) # recorded: 919 rows
newdata <- newdata[newdata$Socializing_Relaxing <= quantile(social$Socializing_Relaxing, 0.98), ]
quantile(social$Socializing_Relaxing, 0.02) # recorded: 30 mins
newdata <- newdata[
  newdata$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.02) |
    newdata$Socializing_Relaxing == 0,
]
social <- social[
  social$Socializing_Relaxing <= quantile(social$Socializing_Relaxing, 0.98) &
    social$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.02),
]
hist(social$Socializing_Relaxing, breaks = 12, col = "lightseagreen")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  print(names(data1))
  hist(data1$Socializing_Relaxing, breaks = 12, col = "lightseagreen")
  social <- data1[which(data1$Socializing_Relaxing != 0), ]
  print(summary(social$Socializing_Relaxing))
  print(quantile(social$Socializing_Relaxing, 0.98))
  print(sum(social$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.98)))
  data1 <- data1[data1$Socializing_Relaxing <= quantile(social$Socializing_Relaxing, 0.98), ]
  print(quantile(social$Socializing_Relaxing, 0.02))
  data1 <- data1[
    data1$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.02) |
      data1$Socializing_Relaxing == 0,
  ]
  social <- social[
    social$Socializing_Relaxing <= quantile(social$Socializing_Relaxing, 0.98) &
      social$Socializing_Relaxing >= quantile(social$Socializing_Relaxing, 0.02),
  ]
  hist(social$Socializing_Relaxing, breaks = 12, col = "lightseagreen")
}
# Television
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Television, breaks = 12, col = "indianred")
TV <- newdata[which(newdata$Television != 0), ]
summary(TV$Television) # recorded max: 795 mins, highly improbable
quantile(TV$Television, 0.98) # recorded: 620 mins
sum(TV$Television >= quantile(TV$Television, 0.98)) # recorded: 783 rows
newdata <- newdata[newdata$Television <= quantile(TV$Television, 0.98), ]
quantile(TV$Television, 0.02) # recorded: 28 mins
newdata <- newdata[newdata$Television >= quantile(TV$Television, 0.02) | newdata$Television == 0, ]
TV <- TV[
  TV$Television <= quantile(TV$Television, 0.98) &
    TV$Television >= quantile(TV$Television, 0.02),
]
hist(TV$Television, breaks = 12, col = "indianred")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  print(names(data1))
  hist(data1$Television, breaks = 12, col = "indianred")
  TV <- data1[which(data1$Television != 0), ]
  print(summary(TV$Television))
  print(quantile(TV$Television, 0.98))
  print(sum(TV$Television >= quantile(TV$Television, 0.98)))
  data1 <- data1[data1$Television <= quantile(TV$Television, 0.98), ]
  print(quantile(TV$Television, 0.02))
  data1 <- data1[data1$Television >= quantile(TV$Television, 0.02) | data1$Television == 0, ]
  TV <- TV[
    TV$Television <= quantile(TV$Television, 0.98) &
      TV$Television >= quantile(TV$Television, 0.02),
  ]
  hist(TV$Television, breaks = 12, col = "indianred")
}
# Golfing
# Cut-offs come from the non-zero observations; zero rows are always kept.
names(newdata)
hist(newdata$Golfing, breaks = 12, col = "khaki")
golf <- newdata[which(newdata$Golfing != 0), ]
summary(golf$Golfing) # recorded max: 600 mins, highly improbable
quantile(golf$Golfing, 0.98) # recorded: 417.52 mins
sum(golf$Golfing >= quantile(golf$Golfing, 0.98)) # recorded: 6 rows
newdata <- newdata[newdata$Golfing <= quantile(golf$Golfing, 0.98), ]
quantile(golf$Golfing, 0.02) # recorded: 30 mins
newdata <- newdata[newdata$Golfing >= quantile(golf$Golfing, 0.02) | newdata$Golfing == 0, ]
golf <- golf[
  golf$Golfing <= quantile(golf$Golfing, 0.98) &
    golf$Golfing >= quantile(golf$Golfing, 0.02),
]
hist(golf$Golfing, breaks = 12, col = "khaki")
# Mirror the trimming on the test set only when it has been loaded. The active
# code above never creates data1 (its fread() line is commented out), so
# unguarded use stopped the script with "object 'data1' not found".
# Values are printed explicitly because nothing auto-prints inside if ().
if (exists("data1")) {
  print(names(data1))
  hist(data1$Golfing, breaks = 12, col = "khaki")
  golf <- data1[which(data1$Golfing != 0), ]
  print(summary(golf$Golfing))
  print(quantile(golf$Golfing, 0.98))
  print(sum(golf$Golfing >= quantile(golf$Golfing, 0.98)))
  data1 <- data1[data1$Golfing <= quantile(golf$Golfing, 0.98), ]
  print(quantile(golf$Golfing, 0.02))
  data1 <- data1[data1$Golfing >= quantile(golf$Golfing, 0.02) | data1$Golfing == 0, ]
  golf <- golf[
    golf$Golfing <= quantile(golf$Golfing, 0.98) &
      golf$Golfing >= quantile(golf$Golfing, 0.02),
  ]
  hist(golf$Golfing, breaks = 12, col = "khaki")
}
# Running: trim implausible durations in both data sets.
# Cut-offs come from the non-zero rows of each set; zero rows are always kept.
# The figures quoted in the comments are the original author's notes; they are
# identical for both sets in the original script, so re-check them for data1.

# --- newdata ---
names(newdata)
hist(newdata$Running, breaks = 12, col = "mediumorchid")
Ran <- newdata[which(newdata$Running != 0), ]
summary(Ran$Running)
# Noted maximum: 505 minutes, judged highly improbable.
(run_upper_cut <- quantile(Ran$Running, 0.98)) # noted: 145.4 mins
sum(Ran$Running >= run_upper_cut) # noted: 12 rows at or above the cut
newdata <- newdata[newdata$Running <= run_upper_cut, ]
(run_lower_cut <- quantile(Ran$Running, 0.02)) # noted: 10 mins minimum
newdata <- newdata[newdata$Running == 0 | newdata$Running >= run_lower_cut, ]
Ran <- Ran[Ran$Running >= run_lower_cut & Ran$Running <= run_upper_cut, ]
hist(Ran$Running, breaks = 12, col = "mediumorchid")

# --- data1 ---
names(data1)
hist(data1$Running, breaks = 12, col = "mediumorchid")
Ran <- data1[which(data1$Running != 0), ]
summary(Ran$Running)
# Noted maximum: 505 minutes, judged highly improbable.
(run_upper_cut <- quantile(Ran$Running, 0.98)) # noted: 145.4 mins
sum(Ran$Running >= run_upper_cut) # noted: 12 rows at or above the cut
data1 <- data1[data1$Running <= run_upper_cut, ]
(run_lower_cut <- quantile(Ran$Running, 0.02)) # noted: 10 mins minimum
data1 <- data1[data1$Running == 0 | data1$Running >= run_lower_cut, ]
Ran <- Ran[Ran$Running >= run_lower_cut & Ran$Running <= run_upper_cut, ]
hist(Ran$Running, breaks = 12, col = "mediumorchid")
# Volunteering: trim implausible durations in both data sets.
# Cut-offs come from the non-zero rows of each set; zero rows are always kept.
# The figures quoted in the comments are the original author's notes; they are
# identical for both sets in the original script, so re-check them for data1.

# --- newdata ---
names(newdata)
hist(newdata$Volunteering, breaks = 12, col = "midnightblue")
Volunteers <- newdata[which(newdata$Volunteering != 0), ]
summary(Volunteers$Volunteering)
# Noted maximum: 877 minutes, judged highly improbable.
(vol_upper_cut <- quantile(Volunteers$Volunteering, 0.98)) # noted: 506.14 mins
sum(Volunteers$Volunteering >= vol_upper_cut) # noted: 67 rows at or above the cut
newdata <- newdata[newdata$Volunteering <= vol_upper_cut, ]
(vol_lower_cut <- quantile(Volunteers$Volunteering, 0.02)) # noted: 5 mins minimum
newdata <- newdata[newdata$Volunteering == 0 | newdata$Volunteering >= vol_lower_cut, ]
Volunteers <- Volunteers[
  Volunteers$Volunteering >= vol_lower_cut & Volunteers$Volunteering <= vol_upper_cut,
]
hist(Volunteers$Volunteering, breaks = 12, col = "midnightblue")

# --- data1 ---
names(data1)
hist(data1$Volunteering, breaks = 12, col = "midnightblue")
Volunteers <- data1[which(data1$Volunteering != 0), ]
summary(Volunteers$Volunteering)
# Noted maximum: 877 minutes, judged highly improbable.
(vol_upper_cut <- quantile(Volunteers$Volunteering, 0.98)) # noted: 506.14 mins
sum(Volunteers$Volunteering >= vol_upper_cut) # noted: 67 rows at or above the cut
data1 <- data1[data1$Volunteering <= vol_upper_cut, ]
(vol_lower_cut <- quantile(Volunteers$Volunteering, 0.02)) # noted: 5 mins minimum
data1 <- data1[data1$Volunteering == 0 | data1$Volunteering >= vol_lower_cut, ]
Volunteers <- Volunteers[
  Volunteers$Volunteering >= vol_lower_cut & Volunteers$Volunteering <= vol_upper_cut,
]
hist(Volunteers$Volunteering, breaks = 12, col = "midnightblue")
# Weekly hours worked: trim implausible values in both data sets.
# Cut-offs come from the non-zero rows of each set; zero rows are always kept.
# The figures quoted in the comments are the original author's notes; they are
# identical for both sets in the original script, so re-check them for data1.

# --- newdata ---
names(newdata)
hist(newdata$Weekly_Hours_Worked, breaks = 12, col = "blue")
WeeklyHours <- newdata[which(newdata$Weekly_Hours_Worked != 0), ]
summary(WeeklyHours$Weekly_Hours_Worked)
# Noted maximum: 160 hours, judged highly improbable.
(hours_upper_cut <- quantile(WeeklyHours$Weekly_Hours_Worked, 0.98)) # noted: 68 hours
sum(WeeklyHours$Weekly_Hours_Worked >= hours_upper_cut) # noted: 549 rows at or above the cut
newdata <- newdata[newdata$Weekly_Hours_Worked <= hours_upper_cut, ]
(hours_lower_cut <- quantile(WeeklyHours$Weekly_Hours_Worked, 0.02)) # noted: 10 hours minimum
newdata <- newdata[
  newdata$Weekly_Hours_Worked == 0 | newdata$Weekly_Hours_Worked >= hours_lower_cut,
]
WeeklyHours <- WeeklyHours[
  WeeklyHours$Weekly_Hours_Worked >= hours_lower_cut &
    WeeklyHours$Weekly_Hours_Worked <= hours_upper_cut,
]
hist(WeeklyHours$Weekly_Hours_Worked, breaks = 12, col = "midnightblue")

# --- data1 ---
names(data1)
hist(data1$Weekly_Hours_Worked, breaks = 12, col = "blue")
WeeklyHours <- data1[which(data1$Weekly_Hours_Worked != 0), ]
summary(WeeklyHours$Weekly_Hours_Worked)
# Noted maximum: 160 hours, judged highly improbable.
(hours_upper_cut <- quantile(WeeklyHours$Weekly_Hours_Worked, 0.98)) # noted: 68 hours
sum(WeeklyHours$Weekly_Hours_Worked >= hours_upper_cut) # noted: 549 rows at or above the cut
data1 <- data1[data1$Weekly_Hours_Worked <= hours_upper_cut, ]
(hours_lower_cut <- quantile(WeeklyHours$Weekly_Hours_Worked, 0.02)) # noted: 10 hours minimum
data1 <- data1[
  data1$Weekly_Hours_Worked == 0 | data1$Weekly_Hours_Worked >= hours_lower_cut,
]
WeeklyHours <- WeeklyHours[
  WeeklyHours$Weekly_Hours_Worked >= hours_lower_cut &
    WeeklyHours$Weekly_Hours_Worked <= hours_upper_cut,
]
hist(WeeklyHours$Weekly_Hours_Worked, breaks = 12, col = "midnightblue")
# Weekly Earnings
# The author's note: a weekly maximum of $2885 is plausible, but minimum-wage
# policy implies a floor on pay per hour. So an hourly rate is derived
# (Weekly_Earnings / Weekly_Hours_Worked) and only its low tail is trimmed.
# Rows with an hourly rate of exactly 0 are always kept.

# --- newdata ---
names(newdata)
hist(newdata$Weekly_Earnings, breaks = 12, col = "blue")
summary(newdata$Weekly_Earnings)
WeeklyEarnings <- newdata[which(newdata$Weekly_Earnings != 0), ]
summary(WeeklyEarnings$Weekly_Earnings)

newdata$Hourly_Earnings <- newdata$Weekly_Earnings / newdata$Weekly_Hours_Worked
# 0 / 0 yields NaN; treat "no earnings, no hours" as an hourly rate of 0.
# is.nan() replaces the original comparison against the string 'NaN', which only
# worked through implicit numeric-to-character coercion.
# NOTE(review): earnings > 0 with 0 hours yields Inf, which is left as-is here
# (as in the original) and therefore passes the lower-tail filter below.
newdata$Hourly_Earnings <- ifelse(
  is.nan(newdata$Hourly_Earnings),
  0,
  round(newdata$Hourly_Earnings, digits = 2)
)
summary(newdata$Hourly_Earnings)
hist(newdata$Hourly_Earnings, breaks = 12, col = "blue")

HourlyEarnings <- newdata[which(newdata$Hourly_Earnings != 0), ]
summary(HourlyEarnings$Hourly_Earnings)
# Lower cut: 2nd percentile of non-zero hourly rates (author's note: $4.572).
(hourly_lower_cut <- quantile(HourlyEarnings$Hourly_Earnings, 0.02))
sum(HourlyEarnings$Hourly_Earnings < hourly_lower_cut) # author's note: 538 rows below the cut
newdata <- newdata[
  newdata$Hourly_Earnings == 0 | newdata$Hourly_Earnings >= hourly_lower_cut,
]
HourlyEarnings <- HourlyEarnings[HourlyEarnings$Hourly_Earnings >= hourly_lower_cut, ]
hist(HourlyEarnings$Hourly_Earnings, breaks = 12, col = "midnightblue")

# --- data1 ---
# The quoted figures repeat the newdata notes in the original script; re-check
# them for data1.
names(data1)
hist(data1$Weekly_Earnings, breaks = 12, col = "blue")
summary(data1$Weekly_Earnings)
WeeklyEarnings <- data1[which(data1$Weekly_Earnings != 0), ]
summary(WeeklyEarnings$Weekly_Earnings)

data1$Hourly_Earnings <- data1$Weekly_Earnings / data1$Weekly_Hours_Worked
# Same NaN handling as for newdata (0 / 0 -> 0).
data1$Hourly_Earnings <- ifelse(
  is.nan(data1$Hourly_Earnings),
  0,
  round(data1$Hourly_Earnings, digits = 2)
)
summary(data1$Hourly_Earnings)
hist(data1$Hourly_Earnings, breaks = 12, col = "blue")

HourlyEarnings <- data1[which(data1$Hourly_Earnings != 0), ]
summary(HourlyEarnings$Hourly_Earnings)
(hourly_lower_cut <- quantile(HourlyEarnings$Hourly_Earnings, 0.02)) # author's note: $4.572
sum(HourlyEarnings$Hourly_Earnings < hourly_lower_cut) # author's note: 538 rows below the cut
data1 <- data1[
  data1$Hourly_Earnings == 0 | data1$Hourly_Earnings >= hourly_lower_cut,
]
HourlyEarnings <- HourlyEarnings[HourlyEarnings$Hourly_Earnings >= hourly_lower_cut, ]
hist(HourlyEarnings$Hourly_Earnings, breaks = 12, col = "midnightblue")
# Snapshot of both trimmed data sets, used for questions 1-6.
write.csv(x = newdata, file = "initial_newdata.csv")
write.csv(x = data1, file = "initial_testdata.csv")
# Linear Modeling for prediction
library(lmtest)
library(sandwich)
colnames(newdata)

# Playing_with_Children vs Caring_for_Children
# Caring for children is the broader activity: it includes playing but also other
# tasks. Since the two are correlated, time spent playing is modelled from time
# spent caring. Only respondents who reported any playing are used for the fit.
newdata2 <- newdata[newdata$Playing_with_Children != 0, ]
acf(newdata2$Playing_with_Children)
model1 <- lm(newdata2$Playing_with_Children ~ newdata2$Caring_for_Children)
summary(model1)
confint(model1, level = 0.95)
plot(newdata2$Caring_for_Children, newdata2$Playing_with_Children, col = "grey")
abline(model1, col = "red")
coeftest(model1, vcov. = NeweyWest(model1, lag = NULL))
# confint() on an lm object silently ignores a `vcov` argument, so the original
# call just repeated the classical interval. lmtest::coefci() is the interval
# counterpart of coeftest() and honours the Newey-West covariance.
coefci(model1, vcov. = NeweyWest(model1, lag = NULL), level = 0.95)

# Upper / mean / lower prediction lines as slope * Caring + intercept.
# NOTE(review): the coefficients are hard-coded, presumably transcribed from the
# interval output above; re-derive them if the data changes.
newdata$UL_Playing <- (newdata$Caring_for_Children * 0.4532799) + 22.3713459
newdata$Mean_Playing <- (newdata$Caring_for_Children * 0.435771) + 18.880603
newdata$LL_Playing <- (newdata$Caring_for_Children * 0.4182614) + 15.3898607
lines(newdata$Caring_for_Children, newdata$UL_Playing, col = "green")
lines(newdata$Caring_for_Children, newdata$LL_Playing, col = "green")

# Corrected playing time (newdata), built in three passes with a row count
# printed after each one:
#   1. non-zero values inside [LL, UL] become 0;
#   2. non-zero values above UL become (value - mean prediction);
#   3. non-zero values below LL keep the raw value (already true after pass 1).
nrow(newdata[newdata$Playing_with_Children != 0, ])
newdata$Corrected_Playing_with_Children <- ifelse(
  newdata$Playing_with_Children != 0 &
    newdata$Playing_with_Children <= newdata$UL_Playing &
    newdata$Playing_with_Children >= newdata$LL_Playing,
  0,
  newdata$Playing_with_Children
)
nrow(newdata[newdata$Corrected_Playing_with_Children != 0, ])
newdata$Corrected_Playing_with_Children <- ifelse(
  newdata$Playing_with_Children != 0 &
    newdata$Playing_with_Children > newdata$UL_Playing,
  newdata$Playing_with_Children - newdata$Mean_Playing,
  newdata$Corrected_Playing_with_Children
)
nrow(newdata[newdata$Corrected_Playing_with_Children != 0, ])
newdata$Corrected_Playing_with_Children <- ifelse(
  newdata$Playing_with_Children != 0 &
    newdata$Playing_with_Children < newdata$LL_Playing,
  newdata$Playing_with_Children,
  newdata$Corrected_Playing_with_Children
)
nrow(newdata[newdata$Corrected_Playing_with_Children != 0, ])
# Round up to whole minutes.
newdata$Corrected_Playing_with_Children <- ceiling(newdata$Corrected_Playing_with_Children)

# Same bands and correction for data1, using the coefficients fitted on newdata.
data1$UL_Playing <- (data1$Caring_for_Children * 0.4532799) + 22.3713459
data1$Mean_Playing <- (data1$Caring_for_Children * 0.435771) + 18.880603
data1$LL_Playing <- (data1$Caring_for_Children * 0.4182614) + 15.3898607
nrow(data1[data1$Playing_with_Children != 0, ])
data1$Corrected_Playing_with_Children <- ifelse(
  data1$Playing_with_Children != 0 &
    data1$Playing_with_Children <= data1$UL_Playing &
    data1$Playing_with_Children >= data1$LL_Playing,
  0,
  data1$Playing_with_Children
)
nrow(data1[data1$Corrected_Playing_with_Children != 0, ])
data1$Corrected_Playing_with_Children <- ifelse(
  data1$Playing_with_Children != 0 &
    data1$Playing_with_Children > data1$UL_Playing,
  data1$Playing_with_Children - data1$Mean_Playing,
  data1$Corrected_Playing_with_Children
)
nrow(data1[data1$Corrected_Playing_with_Children != 0, ])
data1$Corrected_Playing_with_Children <- ifelse(
  data1$Playing_with_Children != 0 &
    data1$Playing_with_Children < data1$LL_Playing,
  data1$Playing_with_Children,
  data1$Corrected_Playing_with_Children
)
nrow(data1[data1$Corrected_Playing_with_Children != 0, ])
data1$Corrected_Playing_with_Children <- ceiling(data1$Corrected_Playing_with_Children)
# Food & drink prep vs Housework
# Food & drink prep can be counted as housework, but not all housework is food
# & drink prep. Only respondents who reported any food & drink prep are used
# for the fit.
newdata3 <- newdata[newdata$`Food _Drink_Prep` != 0, ]
acf(newdata3$`Food _Drink_Prep`)
model2 <- lm(newdata3$`Food _Drink_Prep` ~ newdata3$Housework)
summary(model2)
confint(model2, level = 0.95)
plot(newdata3$Housework, newdata3$`Food _Drink_Prep`)
# Fixed: the original drew model1's line (Playing ~ Caring) on this plot.
abline(model2, col = "red")
coeftest(model2, vcov. = NeweyWest(model2, lag = NULL))
# confint() on an lm object silently ignores `vcov`; coefci() from lmtest
# produces the interval that actually uses the Newey-West covariance.
coefci(model2, vcov. = NeweyWest(model2, lag = NULL), level = 0.95)

# Upper / mean / lower prediction lines as slope * Housework + intercept.
# Fixed: the original multiplied the slope by `Food _Drink_Prep` (the response)
# instead of Housework (the predictor of model2), which collapsed the band to a
# fixed interval unrelated to housework time.
# NOTE(review): the coefficients are hard-coded, presumably transcribed from the
# interval output above; re-derive them if the data changes.
newdata$UL_FoodDrinkPrep <- (newdata$Housework * 0.1158568) + 53.1969622
newdata$Mean_FoodDrinkPrep <- (newdata$Housework * 0.1088764) + 52.5404661
newdata$LL_FoodDrinkPrep <- (newdata$Housework * 0.1018959) + 51.8839701

# Corrected food & drink prep time (newdata), three passes with row counts:
#   1. non-zero values inside [LL, UL] become 0;
#   2. non-zero values above UL become (value - mean prediction);
#   3. non-zero values below LL keep the raw value (already true after pass 1).
nrow(newdata[newdata$`Food _Drink_Prep` != 0, ])
newdata$Corrected_FoodDrinkPrep <- ifelse(
  newdata$`Food _Drink_Prep` != 0 &
    newdata$`Food _Drink_Prep` <= newdata$UL_FoodDrinkPrep &
    newdata$`Food _Drink_Prep` >= newdata$LL_FoodDrinkPrep,
  0,
  newdata$`Food _Drink_Prep`
)
nrow(newdata[newdata$Corrected_FoodDrinkPrep != 0, ])
newdata$Corrected_FoodDrinkPrep <- ifelse(
  newdata$`Food _Drink_Prep` != 0 &
    newdata$`Food _Drink_Prep` > newdata$UL_FoodDrinkPrep,
  newdata$`Food _Drink_Prep` - newdata$Mean_FoodDrinkPrep,
  newdata$Corrected_FoodDrinkPrep
)
nrow(newdata[newdata$Corrected_FoodDrinkPrep != 0, ])
newdata$Corrected_FoodDrinkPrep <- ifelse(
  newdata$`Food _Drink_Prep` != 0 &
    newdata$`Food _Drink_Prep` < newdata$LL_FoodDrinkPrep,
  newdata$`Food _Drink_Prep`,
  newdata$Corrected_FoodDrinkPrep
)
nrow(newdata[newdata$Corrected_FoodDrinkPrep != 0, ])
# Round up to whole minutes.
newdata$Corrected_FoodDrinkPrep <- ceiling(newdata$Corrected_FoodDrinkPrep)

# Same bands and correction for data1, using the coefficients fitted on newdata.
data1$UL_FoodDrinkPrep <- (data1$Housework * 0.1158568) + 53.1969622
data1$Mean_FoodDrinkPrep <- (data1$Housework * 0.1088764) + 52.5404661
data1$LL_FoodDrinkPrep <- (data1$Housework * 0.1018959) + 51.8839701
nrow(data1[data1$`Food _Drink_Prep` != 0, ])
data1$Corrected_FoodDrinkPrep <- ifelse(
  data1$`Food _Drink_Prep` != 0 &
    data1$`Food _Drink_Prep` <= data1$UL_FoodDrinkPrep &
    data1$`Food _Drink_Prep` >= data1$LL_FoodDrinkPrep,
  0,
  data1$`Food _Drink_Prep`
)
nrow(data1[data1$Corrected_FoodDrinkPrep != 0, ])
data1$Corrected_FoodDrinkPrep <- ifelse(
  data1$`Food _Drink_Prep` != 0 &
    data1$`Food _Drink_Prep` > data1$UL_FoodDrinkPrep,
  data1$`Food _Drink_Prep` - data1$Mean_FoodDrinkPrep,
  data1$Corrected_FoodDrinkPrep
)
nrow(data1[data1$Corrected_FoodDrinkPrep != 0, ])
data1$Corrected_FoodDrinkPrep <- ifelse(
  data1$`Food _Drink_Prep` != 0 &
    data1$`Food _Drink_Prep` < data1$LL_FoodDrinkPrep,
  data1$`Food _Drink_Prep`,
  data1$Corrected_FoodDrinkPrep
)
nrow(data1[data1$Corrected_FoodDrinkPrep != 0, ])
data1$Corrected_FoodDrinkPrep <- ceiling(data1$Corrected_FoodDrinkPrep)
# Television vs Socializing & Relaxing
# Television can be counted as Socializing & Relaxing, but not all Socializing &
# Relaxing is television. Only respondents who reported any television are used
# for the fit.
newdata4 <- newdata[newdata$Television != 0, ]
acf(newdata4$Television)
model3 <- lm(newdata4$Television ~ newdata4$Socializing_Relaxing)
summary(model3)
confint(model3, level = 0.95)
plot(newdata4$Socializing_Relaxing, newdata4$Television)
# Fixed: the original drew model1's line (Playing ~ Caring) on this plot.
abline(model3, col = "red")
coeftest(model3, vcov. = NeweyWest(model3, lag = NULL))
# Fixed twice: the original passed model2's covariance to model3, and confint()
# on an lm object ignores `vcov` anyway. coefci() from lmtest uses it.
coefci(model3, vcov. = NeweyWest(model3, lag = NULL), level = 0.95)

# Upper / mean / lower prediction lines as slope * Socializing_Relaxing + intercept.
# Fixed: the original multiplied the slope by Television (the response) instead
# of Socializing_Relaxing (the predictor of model3).
# NOTE(review): the coefficients are hard-coded, presumably transcribed from the
# interval output above; re-derive them if the data changes.
newdata$UL_TV <- (newdata$Socializing_Relaxing * 0.5266691) + 37.6805498
newdata$Mean_TV <- (newdata$Socializing_Relaxing * 0.5208844) + 35.6209855
newdata$LL_TV <- (newdata$Socializing_Relaxing * 0.5150998) + 33.5614213

# Corrected television time (newdata), three passes with row counts:
#   1. non-zero values inside [LL, UL] become 0;
#   2. non-zero values above UL become (value - mean prediction);
#   3. non-zero values below LL keep the raw value (already true after pass 1).
nrow(newdata[newdata$Television != 0, ])
newdata$Corrected_TV <- ifelse(
  newdata$Television != 0 &
    newdata$Television <= newdata$UL_TV &
    newdata$Television >= newdata$LL_TV,
  0,
  newdata$Television
)
nrow(newdata[newdata$Corrected_TV != 0, ])
newdata$Corrected_TV <- ifelse(
  newdata$Television != 0 & newdata$Television > newdata$UL_TV,
  newdata$Television - newdata$Mean_TV,
  newdata$Corrected_TV
)
nrow(newdata[newdata$Corrected_TV != 0, ])
newdata$Corrected_TV <- ifelse(
  newdata$Television != 0 & newdata$Television < newdata$LL_TV,
  newdata$Television,
  newdata$Corrected_TV
)
nrow(newdata[newdata$Corrected_TV != 0, ])
# Round up to whole minutes.
newdata$Corrected_TV <- ceiling(newdata$Corrected_TV)

# Daily total in hours, using the three corrected columns in place of the raw ones.
newdata$Corrected_Total <- (
  newdata$Sleeping + newdata$Grooming + newdata$Housework +
    newdata$Corrected_FoodDrinkPrep + newdata$Caring_for_Children +
    newdata$Corrected_Playing_with_Children + newdata$Job_Searching +
    newdata$Shopping + newdata$Eating_and_Drinking +
    newdata$Socializing_Relaxing + newdata$Corrected_TV + newdata$Golfing +
    newdata$Running + newdata$Volunteering
) / 60
newdata$Corrected_Total <- round(newdata$Corrected_Total, digits = 2)

# Same bands, correction and total for data1, using the coefficients fitted on newdata.
data1$UL_TV <- (data1$Socializing_Relaxing * 0.5266691) + 37.6805498
data1$Mean_TV <- (data1$Socializing_Relaxing * 0.5208844) + 35.6209855
data1$LL_TV <- (data1$Socializing_Relaxing * 0.5150998) + 33.5614213
nrow(data1[data1$Television != 0, ])
data1$Corrected_TV <- ifelse(
  data1$Television != 0 &
    data1$Television <= data1$UL_TV &
    data1$Television >= data1$LL_TV,
  0,
  data1$Television
)
nrow(data1[data1$Corrected_TV != 0, ])
data1$Corrected_TV <- ifelse(
  data1$Television != 0 & data1$Television > data1$UL_TV,
  data1$Television - data1$Mean_TV,
  data1$Corrected_TV
)
nrow(data1[data1$Corrected_TV != 0, ])
data1$Corrected_TV <- ifelse(
  data1$Television != 0 & data1$Television < data1$LL_TV,
  data1$Television,
  data1$Corrected_TV
)
nrow(data1[data1$Corrected_TV != 0, ])
data1$Corrected_TV <- ceiling(data1$Corrected_TV)
data1$Corrected_Total <- (
  data1$Sleeping + data1$Grooming + data1$Housework +
    data1$Corrected_FoodDrinkPrep + data1$Caring_for_Children +
    data1$Corrected_Playing_with_Children + data1$Job_Searching +
    data1$Shopping + data1$Eating_and_Drinking +
    data1$Socializing_Relaxing + data1$Corrected_TV + data1$Golfing +
    data1$Running + data1$Volunteering
) / 60
data1$Corrected_Total <- round(data1$Corrected_Total, digits = 2)
# Interesting patterns: row counts and study subsets drawn from newdata.

# Largest corrected daily total (hours) and how many days exceed 24 hours.
max(newdata$Corrected_Total)
nrow(newdata[with(newdata, Corrected_Total > 24), ])

# Over-24-hour days for respondents with children: long sleepers (>= 480 min)
# and heavy socializers (>= 360 min).
nrow(newdata[with(newdata, Corrected_Total > 24 & Children != 0 & Sleeping >= 480), ])
nrow(newdata[with(newdata, Corrected_Total > 24 & Children != 0 & Socializing_Relaxing >= 360), ])

# Days where sleeping plus socializing/relaxing add up to 20 hours or more.
nrow(newdata[with(newdata, ((Sleeping + Socializing_Relaxing) / 60) >= 20), ])
study_subset1 <- newdata[with(newdata, ((Sleeping + Socializing_Relaxing) / 60) >= 20), ]

# Respondents with children who report no caring / no playing time.
nrow(newdata[with(newdata, Children != 0 & Caring_for_Children == 0), ])
nrow(newdata[with(newdata, Children != 0 & Playing_with_Children == 0), ])

# Aged 15-50, has children, no caring time, and not working.
nrow(newdata[with(
  newdata,
  Children != 0 & Caring_for_Children == 0 & Age >= 15 & Age <= 50 &
    (Employment_Status == "Unemployed" | Employment_Status == "Not in labor force")
), ])
# Same group, further restricted to the top 2% of sleepers across newdata.
study_subset2 <- newdata[with(
  newdata,
  Children != 0 & Caring_for_Children == 0 & Age >= 15 & Age <= 50 &
    (Employment_Status == "Unemployed" | Employment_Status == "Not in labor force") &
    Sleeping >= quantile(Sleeping, 0.98)
), ]
# Using Weekly_Hours_Worked to determine sleep time

# Correlation plot over the remaining columns after dropping identifiers,
# categorical fields and helper columns.
# NOTE(review): several of the listed names (UL_Sleeping, UL_Socializing, ...)
# are only created later in the script, so they may not exist yet on a first
# top-to-bottom run.
corrplot(
  cor(newdata[, -c('UL_Socializing','LL_Socializing','Mean_Socializing','Id', 'Education_Level', 'Age_Range', 'Employment_Status', 'Gender', 'Year' ,'UL_Playing', 'Mean_Playing', 'LL_Playing', 'UL_FoodDrinkPrep', 'Mean_FoodDrinkPrep', 'LL_FoodDrinkPrep', 'UL_TV', 'Mean_TV', 'LL_TV', 'Total', 'Corrected_Total', 'UL_Sleeping', 'Mean_Sleeping', 'LL_Sleeping')]),
  type = "lower", diag = FALSE, addCoef.col = "black"
)
colnames(newdata)
acf(newdata$Sleeping)

# Fit on respondents who reported any working hours.
subset1 <- newdata[newdata$Weekly_Hours_Worked != 0, ]
model4 <- lm(subset1$Sleeping ~ subset1$Weekly_Hours_Worked)
summary(model4)
confint(model4, level = 0.95)
plot(subset1$Weekly_Hours_Worked, subset1$Sleeping)
abline(model4, col = "red")
coeftest(model4, vcov. = NeweyWest(model4, lag = NULL))
# confint() on an lm object silently ignores `vcov`; coefci() from lmtest
# produces the interval that actually uses the Newey-West covariance.
coefci(model4, vcov. = NeweyWest(model4, lag = NULL), level = 0.95)

# Upper / mean / lower prediction lines as slope * Weekly_Hours_Worked + intercept.
# NOTE(review): the coefficients are hard-coded, presumably transcribed from the
# interval output above; re-derive them if the data changes.
newdata$UL_Sleeping <- (newdata$Weekly_Hours_Worked * -1.072797) + 564.035675
newdata$Mean_Sleeping <- (newdata$Weekly_Hours_Worked * -1.200297) + 558.708995
newdata$LL_Sleeping <- (newdata$Weekly_Hours_Worked * -1.327797) + 553.382316
nrow(newdata[newdata$Sleeping != 0, ])
# For respondents with working hours, cap sleep at the upper line; everyone else
# keeps the reported value. Equivalent to the original nested ifelse().
newdata$Corrected_Sleeping <- ifelse(
  newdata$Weekly_Hours_Worked != 0 & newdata$Sleeping > newdata$UL_Sleeping,
  newdata$UL_Sleeping,
  newdata$Sleeping
)
newdata$Corrected_Sleeping <- ceiling(newdata$Corrected_Sleeping)
# Recompute the daily total (hours) with corrected sleep.
newdata$Corrected_Total <- (
  newdata$Corrected_Sleeping + newdata$Grooming + newdata$Housework +
    newdata$Corrected_FoodDrinkPrep + newdata$Caring_for_Children +
    newdata$Corrected_Playing_with_Children + newdata$Job_Searching +
    newdata$Shopping + newdata$Eating_and_Drinking +
    newdata$Socializing_Relaxing + newdata$Corrected_TV + newdata$Golfing +
    newdata$Running + newdata$Volunteering
) / 60
newdata$Corrected_Total <- round(newdata$Corrected_Total, digits = 2)

# Same correction for data1, using the coefficients fitted on newdata.
data1$UL_Sleeping <- (data1$Weekly_Hours_Worked * -1.072797) + 564.035675
data1$Mean_Sleeping <- (data1$Weekly_Hours_Worked * -1.200297) + 558.708995
data1$LL_Sleeping <- (data1$Weekly_Hours_Worked * -1.327797) + 553.382316
nrow(data1[data1$Sleeping != 0, ])
data1$Corrected_Sleeping <- ifelse(
  data1$Weekly_Hours_Worked != 0 & data1$Sleeping > data1$UL_Sleeping,
  data1$UL_Sleeping,
  data1$Sleeping
)
data1$Corrected_Sleeping <- ceiling(data1$Corrected_Sleeping)
data1$Corrected_Total <- (
  data1$Corrected_Sleeping + data1$Grooming + data1$Housework +
    data1$Corrected_FoodDrinkPrep + data1$Caring_for_Children +
    data1$Corrected_Playing_with_Children + data1$Job_Searching +
    data1$Shopping + data1$Eating_and_Drinking +
    data1$Socializing_Relaxing + data1$Corrected_TV + data1$Golfing +
    data1$Running + data1$Volunteering
) / 60
data1$Corrected_Total <- round(data1$Corrected_Total, digits = 2)

# Inspect the days that still exceed 24 hours, without the raw and helper columns.
nrow(newdata[newdata$Corrected_Total > 24, ])
study_subset3 <- newdata[newdata$Corrected_Total > 24, ]
colnames(study_subset3)
study_subset3 <- study_subset3[, -c('Sleeping','Playing_with_Children','Food _Drink_Prep','Television','UL_Playing','Mean_Playing', 'LL_Playing', 'UL_FoodDrinkPrep', 'Mean_FoodDrinkPrep','LL_FoodDrinkPrep','UL_TV','Mean_TV','LL_TV','UL_Sleeping','Mean_Sleeping','LL_Sleeping')]
summary(study_subset3)
# NOTE(review): 28.92 is a hard-coded total, presumably the maximum the author
# saw in the summary above; it will match nothing if the data changes.
study_subset3[study_subset3$Corrected_Total == 28.92, ]
# Using Weekly_Hours_Worked to determine Socializing & Relaxing

# Fit on rows with either working hours or socializing time.
# NOTE(review): the sleep model filters on Weekly_Hours_Worked != 0 only; confirm
# that `|` (rather than `&`, or the hours-only filter) is intended here.
subset2 <- newdata[newdata$Weekly_Hours_Worked != 0 | newdata$Socializing_Relaxing != 0, ]
model5 <- lm(subset2$Socializing_Relaxing ~ subset2$Weekly_Hours_Worked)
summary(model5)
confint(model5, level = 0.95)
plot(subset2$Weekly_Hours_Worked, subset2$Socializing_Relaxing)
abline(model5, col = "red")
coeftest(model5, vcov. = NeweyWest(model5, lag = NULL))
# confint() on an lm object silently ignores `vcov`; coefci() from lmtest
# produces the interval that actually uses the Newey-West covariance.
coefci(model5, vcov. = NeweyWest(model5, lag = NULL), level = 0.95)

# Upper / mean / lower prediction lines as slope * Weekly_Hours_Worked + intercept.
# NOTE(review): the coefficients are hard-coded, presumably transcribed from the
# interval output above; re-derive them if the data changes.
newdata$UL_Socializing <- (newdata$Weekly_Hours_Worked * -3.028677) + 368.119347
newdata$Mean_Socializing <- (newdata$Weekly_Hours_Worked * -3.102957) + 365.693796
newdata$LL_Socializing <- (newdata$Weekly_Hours_Worked * -3.177238) + 363.268245
# For respondents with working hours, cap socializing at the upper line; everyone
# else keeps the reported value. Equivalent to the original nested ifelse().
newdata$Corrected_Socializing <- ifelse(
  newdata$Weekly_Hours_Worked != 0 &
    newdata$Socializing_Relaxing > newdata$UL_Socializing,
  newdata$UL_Socializing,
  newdata$Socializing_Relaxing
)
newdata$Corrected_Socializing <- ceiling(newdata$Corrected_Socializing)
# Recompute the daily total (hours) with corrected socializing.
newdata$Corrected_Total <- (
  newdata$Corrected_Sleeping + newdata$Grooming + newdata$Housework +
    newdata$Corrected_FoodDrinkPrep + newdata$Caring_for_Children +
    newdata$Corrected_Playing_with_Children + newdata$Job_Searching +
    newdata$Shopping + newdata$Eating_and_Drinking +
    newdata$Corrected_Socializing + newdata$Corrected_TV + newdata$Golfing +
    newdata$Running + newdata$Volunteering
) / 60
newdata$Corrected_Total <- round(newdata$Corrected_Total, digits = 2)

# Same correction for data1, using the coefficients fitted on newdata.
data1$UL_Socializing <- (data1$Weekly_Hours_Worked * -3.028677) + 368.119347
data1$Mean_Socializing <- (data1$Weekly_Hours_Worked * -3.102957) + 365.693796
data1$LL_Socializing <- (data1$Weekly_Hours_Worked * -3.177238) + 363.268245
data1$Corrected_Socializing <- ifelse(
  data1$Weekly_Hours_Worked != 0 &
    data1$Socializing_Relaxing > data1$UL_Socializing,
  data1$UL_Socializing,
  data1$Socializing_Relaxing
)
data1$Corrected_Socializing <- ceiling(data1$Corrected_Socializing)
data1$Corrected_Total <- (
  data1$Corrected_Sleeping + data1$Grooming + data1$Housework +
    data1$Corrected_FoodDrinkPrep + data1$Caring_for_Children +
    data1$Corrected_Playing_with_Children + data1$Job_Searching +
    data1$Shopping + data1$Eating_and_Drinking +
    data1$Corrected_Socializing + data1$Corrected_TV + data1$Golfing +
    data1$Running + data1$Volunteering
) / 60
data1$Corrected_Total <- round(data1$Corrected_Total, digits = 2)

# Inspect the days that still exceed 24 hours, without the raw and helper columns.
nrow(newdata[newdata$Corrected_Total > 24, ])
study_subset4 <- newdata[newdata$Corrected_Total > 24, ]
colnames(study_subset4)
study_subset4 <- study_subset4[, -c('Socializing_Relaxing','UL_Socializing','LL_Socializing', 'Mean_Socializing','Sleeping','Playing_with_Children','Food _Drink_Prep','Television','UL_Playing','Mean_Playing', 'LL_Playing', 'UL_FoodDrinkPrep', 'Mean_FoodDrinkPrep','LL_FoodDrinkPrep','UL_TV','Mean_TV','LL_TV','UL_Sleeping','Mean_Sleeping','LL_Sleeping')]
summary(study_subset4)
# NOTE(review): 28.78 is a hard-coded total, presumably the maximum the author
# saw in the summary above; it will match nothing if the data changes.
study_subset4[study_subset4$Corrected_Total == 28.78, ]
# Using Age, Number of Children and Weekly Hours Worked to determine Caring for Children
# model6 <- lm(newdata$Caring_for_Children ~ newdata$Age + newdata$Children + newdata$Weekly_Hours_Worked)
# summary(model6)
# confint(model6, level = 0.95)
# coeftest(model6, vcov. = NeweyWest(model6,lag = NULL))
# confint(model6, vcov = NeweyWest(model6, lag = NULL),level = 0.95)
#
# newdata$UL_Caring <- (newdata$Weekly_Hours_Worked*-0.1395882 + newdata$Children*25.0677057 + newdata$Age*-0.2696688) + 27.9694117
# newdata$Mean_Caring <- (newdata$Weekly_Hours_Worked*-0.167225 + newdata$Children*24.510294 + newdata$Age*-0.307253) + 25.618407
# newdata$LL_Caring <- (newdata$Weekly_Hours_Worked*-0.1948614 + newdata$Children*23.9528818 + newdata$Age*-0.3448382) + 23.2674033
#
# newdata$Corrected_Caring <- ifelse(newdata$Caring_for_Children <= newdata$UL_Caring & newdata$Children != 0, newdata$Caring_for_Children, ifelse(newdata$Caring_for_Children > newdata$UL_Caring & newdata$Children != 0, newdata$UL_Caring, newdata$Caring_for_Children))
# newdata$Corrected_Caring <- ceiling(newdata$Corrected_Caring)
#
# newdata$Corrected_Total <- (newdata$Corrected_Sleeping + newdata$Grooming + newdata$Housework + newdata$Corrected_FoodDrinkPrep + newdata$Corrected_Caring + newdata$Corrected_Playing_with_Children + newdata$Job_Searching + newdata$Shopping + newdata$Eating_and_Drinking + newdata$Corrected_Socializing + newdata$Corrected_TV + newdata$Golfing + newdata$Running + newdata$Volunteering)/60
# newdata$Corrected_Total <- round(newdata$Corrected_Total, digits = 2)
#
# nrow(newdata[newdata$Corrected_Total > 24,])
# study_subset5 <- newdata[newdata$Corrected_Total > 24,]
# colnames(study_subset5)
# study_subset5 <- study_subset5[,-c('Caring_for_Children','LL_Caring','Mean_Caring','UL_Caring','Socializing_Relaxing','UL_Socializing','LL_Socializing', 'Mean_Socializing','Sleeping','Playing_with_Children','Food _Drink_Prep','Television','UL_Playing','Mean_Playing', 'LL_Playing', 'UL_FoodDrinkPrep', 'Mean_FoodDrinkPrep','LL_FoodDrinkPrep','UL_TV','Mean_TV','LL_TV','UL_Sleeping','Mean_Sleeping','LL_Sleeping')]
# summary(study_subset5)
# study_subset5[study_subset5$Corrected_Total == 28.37,]
####
# Final data sets: keep only days whose corrected total fits in 24 hours, and
# drop the raw columns that were replaced by Corrected_* versions together with
# the UL/Mean/LL helper columns.
final_newdata <- newdata[
  with(newdata, Corrected_Total <= 24),
  -c(
    "Socializing_Relaxing", "UL_Socializing", "LL_Socializing", "Mean_Socializing",
    "Sleeping", "Playing_with_Children", "Food _Drink_Prep", "Television",
    "UL_Playing", "Mean_Playing", "LL_Playing",
    "UL_FoodDrinkPrep", "Mean_FoodDrinkPrep", "LL_FoodDrinkPrep",
    "UL_TV", "Mean_TV", "LL_TV",
    "UL_Sleeping", "Mean_Sleeping", "LL_Sleeping"
  )
]
final_testdata <- data1[
  with(data1, Corrected_Total <= 24),
  -c(
    "Socializing_Relaxing", "UL_Socializing", "LL_Socializing", "Mean_Socializing",
    "Sleeping", "Playing_with_Children", "Food _Drink_Prep", "Television",
    "UL_Playing", "Mean_Playing", "LL_Playing",
    "UL_FoodDrinkPrep", "Mean_FoodDrinkPrep", "LL_FoodDrinkPrep",
    "UL_TV", "Mean_TV", "LL_TV",
    "UL_Sleeping", "Mean_Sleeping", "LL_Sleeping"
  )
]

# Export both sets as CSV.
write.csv(x = final_newdata, file = "final_newdata.csv")
write.csv(x = final_testdata, file = "final_testdata.csv")
####### MACHINE - LEARNING ######
# NOTE(review): this wipes the workspace and changes the working directory to a
# machine-specific path, so everything below depends only on files on disk.
rm(list = ls())
setwd("~/Desktop/IAS")
library(data.table)

# Load the combined data set and drop columns not used for modelling.
context1 <- fread("combined_newdata.csv", header = TRUE)
context1 <- context1[, -c("Id", "Total", "Hourly_Earnings")]
head(context1, 10)
str(context1)

# Categorical fields become factors.
context1$Education_Level <- as.factor(context1$Education_Level)
context1$Age_Range <- as.factor(context1$Age_Range)
context1$Employment_Status <- as.factor(context1$Employment_Status)
context1$Gender <- as.factor(context1$Gender)
str(context1)

# Gender split: proportions, then a pie chart and a bar chart of the counts.
table(context1$Gender) / length(context1$Gender)
pie(table(context1$Gender))
barplot(table(context1$Gender))

# KNN - K nearest neighbor (to find common characteristics between employment status)
table(context1$Employment_Status)
# Eyeball the first rows to check that employment statuses are not grouped together.
head(context1)
colnames(context1)
# Because KNN uses distance of each point, we have to normalize to remove any undue influence
# Min-max rescale a numeric vector to [0, 1].
#
# x      Numeric vector.
# na.rm  If TRUE (default), missing values are ignored when finding the range
#        and stay NA in the result. If FALSE, any NA makes the whole result NA,
#        which is what the original one-argument version did.
#
# Returns a numeric vector the same length as `x`. A vector whose non-missing
# values are all equal has zero range; it is mapped to 0 instead of dividing by
# zero (which produced NaN and would break distance-based methods such as KNN).
normalize <- function(x, na.rm = TRUE) {
  # Nothing to scale: avoid min()/max() warnings on empty or all-NA input.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  lo <- min(x, na.rm = na.rm)
  hi <- max(x, na.rm = na.rm)
  span <- hi - lo

  # na.rm = FALSE with missing values present: the range itself is undefined.
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }

  # Constant column: every non-missing value sits at the minimum.
  if (span == 0) {
    out <- rep(0, length(x))
    out[is.na(x)] <- NA
    return(out)
  }

  (x - lo) / span
}
# Rescale columns 10 to 23 of context1 to [0, 1] and collect them in a data frame.
scaled_columns <- lapply(context1[, 10:23], normalize)
context2 <- as.data.frame(scaled_columns)
summary(context2)

# Fixed-position split: rows 1-43420 for training, rows 43421-76690 for testing.
# NOTE(review): the row counts are hard-coded; confirm they match nrow(context1).
data_train <- context2[seq_len(43420), ]
data_test <- context2[seq(43421, 76690), ]
# Training labels: column 4 of context1 for the training rows.
# NOTE(review): presumably Employment_Status - verify against colnames(context1).
data_train_target <- context1[seq_len(43420), 4]