In [1]:
submitSample = read.csv('data//sampleSubmission2016.csv')

In [2]:
head(submitSample)

Unnamed: 0,USER_ID,Predictions
1,2,Democrat
2,3,Republican
3,6,Republican
4,7,Democrat
5,14,Democrat
6,28,Republican


In [3]:
nrow(submitSample)

In [4]:
voting = read.csv('data//train2016.csv')

In [5]:
str(voting)

'data.frame':	5568 obs. of  108 variables:
 $ USER_ID        : int  1 4 5 8 9 10 11 12 13 15 ...
 $ YOB            : int  1938 1970 1997 1983 1984 1997 1983 1996 NA 1981 ...
 $ Gender         : Factor w/ 3 levels "","Female","Male": 3 2 3 3 2 2 3 3 3 2 ...
 $ Income         : Factor w/ 7 levels "","$100,001 - $150,000",..: 1 6 5 2 4 6 3 5 1 4 ...
 $ HouseholdStatus: Factor w/ 7 levels "","Domestic Partners (no kids)",..: 5 3 6 5 5 6 4 6 6 5 ...
 $ EducationLevel : Factor w/ 8 levels "","Associate's Degree",..: 1 3 7 3 7 4 5 4 4 1 ...
 $ Party          : Factor w/ 2 levels "Democrat","Republican": 1 1 2 1 2 1 1 2 2 2 ...
 $ Q124742        : Factor w/ 3 levels "","No","Yes": 2 1 1 2 2 1 1 3 2 2 ...
 $ Q124122        : Factor w/ 3 levels "","No","Yes": 1 3 3 3 3 1 1 3 1 2 ...
 $ Q123464        : Factor w/ 3 levels "","No","Yes": 2 2 3 2 2 1 1 2 3 2 ...
 $ Q123621        : Factor w/ 3 levels "","No","Yes": 2 2 2 3 2 1 1 2 2 2 ...
 $ Q122769        : Factor w/ 3 levels "","No","Yes": 2 2 1 

In [6]:
length(names(voting))

In [7]:
summary(voting$YOB)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
   1880    1970    1983    1980    1993    2039     333 

## Data Engineering


In [8]:
library(mice)

Loading required package: Rcpp
: package ‘Rcpp’ was built under R version 3.2.4mice 2.25 2015-11-09


In [9]:
## add new feature Age

voting$Age[(2016- voting$YOB ) <= 18] = '1-18'
voting$Age[((2016- voting$YOB ) > 18) & ((2016- voting$YOB ) <= 40)] = '17-40'
voting$Age[(2016- voting$YOB ) > 40] = '40+'
voting$Age = as.factor(voting$Age)
#voting$Age = 2016- voting$YOB

In [10]:
summary(voting$Age)

In [11]:
length(names(voting))

In [12]:
voting = voting[, !(names(voting) %in% c('YOB'))]

In [13]:
length(names(voting))

## Train Test Split

In [14]:
library(caTools)
set.seed(42)
spl = sample.split(voting$Party, 0.7)
train = subset(voting, spl == TRUE)
test = subset(voting, spl == FALSE)

## Super Simple LR model

In [15]:
SimpleMod = glm(Party ~ . -USER_ID, data=train, family=binomial)

PredTest = predict(SimpleMod, newdata=train, type="response")
threshold = 0.5
PredTestLabels = as.factor(ifelse(PredTest<threshold, "Democrat", "Republican"))


In [16]:
table(train$Party, PredTestLabels)

            PredTestLabels
             Democrat Republican
  Democrat       1352        598
  Republican      629       1087

In [17]:
t = table(train$Party, PredTestLabels)
sum(diag(t))/sum(t)

### Check test score on LR:

In [18]:
PredTest = predict(SimpleMod, newdata=test,  type="response")
PredTestLabels = as.factor(ifelse(PredTest<threshold, "Democrat", "Republican"))
t = table(test$Party, PredTestLabels)
t
sum(diag(t))/sum(t)

            PredTestLabels
             Democrat Republican
  Democrat        540        287
  Republican      321        421

## RandomForest

In [19]:
library(randomForest)
set.seed(42)

randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.


In [20]:
votingRF = randomForest(Party ~ . -USER_ID, data=train)

ERROR: Error in na.fail.default(structure(list(Party = structure(c(2L, 1L, 2L, : missing values in object


*** 
### Using mice to deal with missing data:

In [21]:
library(mice)

In [22]:
md.pattern(train)

Unnamed: 0,USER_ID,Gender,Income,HouseholdStatus,EducationLevel,Party,Q124742,Q124122,Q123464,Q123621,Unnamed: 11,Q99581,Q99480,Q98869,Q98578,Q98059,Q98078,Q98197,Q96024,Age,Unnamed: 21
3666.0,1,1,1,1,1,1,1,1,1,1,⋯,1,1,1,1,1,1,1,1,1,0
232.0,1,1,1,1,1,1,1,1,1,1,⋯,1,1,1,1,1,1,1,1,0,1
,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,232,232


In [23]:
train_imputed = complete(mice(train))


 iter imp variable
  1   1  Age
  1   2  Age
  1   3  Age
  1   4  Age
  1   5  Age
  2   1  Age
  2   2  Age
  2   3  Age
  2   4  Age
  2   5  Age
  3   1  Age
  3   2  Age
  3   3  Age
  3   4  Age
  3   5  Age
  4   1  Age
  4   2  Age
  4   3  Age
  4   4  Age
  4   5  Age
  5   1  Age
  5   2  Age
  5   3  Age
  5   4  Age
  5   5  Age


In [24]:
length(names(train_imputed))

In [25]:
summary(train_imputed$YOB)

Length  Class   Mode 
     0   NULL   NULL 

In [26]:
summary(train_imputed$Age)

***
### continue RF...

In [27]:
votingRF = randomForest(Party ~ . -USER_ID, 
                        data=train_imputed,
                       nodesize=40,
                       ntree=200)

In [28]:
predictRF = predict(votingRF, newdata = train_imputed)
table(train$Party, predictRF)

            predictRF
             Democrat Republican
  Democrat       1767        299
  Republican      412       1420

In [29]:
t = table(train$Party, predictRF)
sum(diag(t))/sum(t)

### Checking test score on RF:

In [30]:
test_imputed = complete(mice(test))
predictRF = predict(votingRF, newdata = test_imputed)

t = table(test$Party, predictRF)
t
sum(diag(t))/sum(t)


 iter imp variable
  1   1  Age
  1   2  Age
  1   3  Age
  1   4  Age
  1   5  Age
  2   1  Age
  2   2  Age
  2   3  Age
  2   4  Age
  2   5  Age
  3   1  Age
  3   2  Age
  3   3  Age
  3   4  Age
  3   5  Age
  4   1  Age
  4   2  Age
  4   3  Age
  4   4  Age
  4   5  Age
  5   1  Age
  5   2  Age
  5   3  Age
  5   4  Age
  5   5  Age


            predictRF
             Democrat Republican
  Democrat        624        261
  Republican      349        436

## To a Kaggle submit with the above mode parameter:

In [None]:
voting_imputed = complete(mice(voting))
votingRF = randomForest(Party ~ . -USER_ID -Gender, 
                        data=voting_imputed,
                       nodesize=40,
                       ntree=200)
kaggle_test = read.csv('data//test2016.csv')
kaggle_test_imputed = complete(mice(kaggle_test))
predict_kaggle_RF = predict(votingRF, newdata = kaggle_test_imputed)

In [None]:
MySubmission = data.frame(USER_ID = kaggle_test$USER_ID, Predictions = predict_kaggle_RF)
write.csv(MySubmission, "submissions/RF_all_nodesize40_ntree200.csv", row.names=FALSE, quote=FALSE)