In [77]:
from __future__ import division
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import raw_loyola as rl
import interests_cleaning as cl
import bot
import operator
import copy
import cPickle as pickle
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import log_loss


%matplotlib inline

# Loading and Cleaning the bot data set

Only one small cleaning task, to fill in the missing probabilities for some of the fields. Simply filled with 0 since a missing probability just means that there was 0 chance that based on the bot's answers to that quiz that they were interested in a given field

## Sampling to test model

Took a small sample (20%) of the full 1,000,000 row dataset to see if it worked on a shard.

In [2]:
giant_df = pd.read_csv('../../../data/bot_data.csv')

In [4]:
giant_df.fillna(0, inplace=True)

In [6]:
giant_df['labels'].value_counts()

Math, Sciences, and Engineering    355119
Business and Communication         217359
Public Service, Law, and Policy    196692
Social Sciences                    161355
Creative Arts                       69475
Name: labels, dtype: int64

In [7]:
sample_df = giant_df.sample(frac=0.2)

In [8]:
sample_df['labels'].value_counts()

Math, Sciences, and Engineering    70936
Business and Communication         43608
Public Service, Law, and Policy    39350
Social Sciences                    32069
Creative Arts                      14037
Name: labels, dtype: int64

# Preparing data for modeling and splitting

## Unused columns

Some of the columns in the dataframe were simply for recordkeeping about the information the bot was using. These are unnecssary for modeling.

## Two sets of X and Y

Created test train splits for both the sample dataframe and the full dataframe. Xs/ys are for the sample and Xf/yf are for the full dataframe.

In [9]:
unused_cols = ['labels','quiz_num', 'public_service_law_and_policy_proba','business_and_communication_proba','math_sciences_and_engineering_proba','creative_arts_proba','social_sciences_proba' ]

In [10]:
X_samp = sample_df.drop(unused_cols, axis=1).values
y_samp = sample_df['labels'].values

In [11]:
Xs_train, Xs_test, ys_train, ys_test = train_test_split(X_samp, y_samp, test_size=0.25)

In [12]:
X = giant_df.drop(unused_cols, axis=1).values
y = giant_df['labels'].values

In [13]:
Xf_train, Xf_test, yf_train, yf_test = train_test_split(X, y, test_size=0.25)

# Fitting and evaluating models

## What the model is really being used for

It is important to keep in mind here that the model is only being used to test how important keeping all the questions in the quiz is. Ideally, I would like to reduce the number of questions, and believe that I can without much issue, but using a model to check this assumption is useful. So, in this case, we are not concerned with minimizing anything, we are mostly exploring how the performance of the model changes with different sets of questions and determining which questions are essential to keep in. 

The Random Forest model is a perfect pick for this job as it can build a diverse set of learners that will try different combinations of questions on random bootstrap samples of the data, preventing the model from getting fixated on only one question or consistently building decision paths with questions asked in the same order. This will give us more insight into how important questions are really.

## Parameters

Most of the default parameters will work in this case. We definitely want to bootstrap and the default setting for the subsampling rate is sqrt(num_features), which in our case is ~6 of 35 questions, which seems reasonable. I don't want to limit the depth of the trees here so I can capture all interactions between questions, something you miss out on by limiting tree depth. We also don't need a large number of estimators (and computationally my computer is not capable of building a 500 tree forest on a million rows of data) since we aren't too concerned about maximizing the ability of the model to actually predict things.

One important note is that I did set the class weight parameter to balanced as when I tested it without this parameter, the model NEVER predicted someone was an art major. While that may be semi reasonable from a real life standpoint (art majors are few and far between), I don't want to leave my users unable to ever see art as a possible choice.


## Evaluation

A number of metrics were produced to have a macro view of the performance of the model from one to the next. Log loss is probably the most appropriate measure for this problem since multi-class classification inherently will score extremely poorly in most accuracy related metrics. Log loss will give us an idea of whether the model is getting too confident about predicitions, when for this problem, it should inherently be fairly unsure about it's predictions.



# Fitting forest with shard dataset and evaluating 

In [16]:
forest_s = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, class_weight='balanced')

In [17]:
forest_s.fit(Xs_train, ys_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [19]:
ys_pred = forest_s.predict(Xs_train)

In [21]:
print classification_report(ys_train, ys_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.39      0.33      0.36     32698
                  Creative Arts       0.18      0.53      0.27     10441
Math, Sciences, and Engineering       0.55      0.33      0.41     53179
Public Service, Law, and Policy       0.35      0.33      0.34     29543
                Social Sciences       0.32      0.43      0.37     24139

                    avg / total       0.41      0.36      0.37    150000



In [22]:
ys_test_pred = forest_s.predict(Xs_test)

In [23]:
print classification_report(ys_test, ys_test_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.25      0.22      0.24     10910
                  Creative Arts       0.10      0.27      0.14      3596
Math, Sciences, and Engineering       0.42      0.25      0.31     17757
Public Service, Law, and Policy       0.22      0.21      0.21      9807
                Social Sciences       0.21      0.28      0.24      7930

                    avg / total       0.29      0.24      0.25     50000



In [28]:
pred_probas_train = forest_s.predict_proba(Xs_train)

In [29]:
pred_probas_test = forest_s.predict_proba(Xs_test)

In [30]:
log_loss(ys_train, pred_probas_train)

1.3811316379745511

In [31]:
log_loss(ys_test, pred_probas_test)

3.4103437293214567

In [32]:
questions = rl.get_raw_data()
fields_dict = cl.get_fields_dict()
clean_q = cl.majors_string_to_list(questions)
mapped_lst = cl.create_labels(clean_q, fields_dict)

# Question Importance Sample Set

Taking a first look at questions that are important to classifying users. These will probably not stay consistent with when the model is fitted on the whole data set, but if any turn up as important here AND also when we fit on the entire data set, then that could give us more confidence what actually matters.

In [35]:
for i in np.argsort(forest_s.feature_importances_):
    print "Question num {} importance: {}".format(i+1, forest_s.feature_importances_[i])
    print mapped_lst[i][0] + '\n'
    print mapped_lst[i][2]
    print

Question num 11 importance: 0.0225262740912
I'm interested in science and in the ability to think logically.

Counter({'Math, Sciences, and Engineering': 8, 'Public Service, Law, and Policy': 2})

Question num 8 importance: 0.0226924691257
I like working with people, and I enjoy variety in my work.

Counter({'Social Sciences': 4, 'Business and Communication': 3, 'Public Service, Law, and Policy': 1})

Question num 9 importance: 0.0228725542508
I'm interested in intellectual ideas, including those that are shaped by religious beliefs.

Counter({'Social Sciences': 4, 'Public Service, Law, and Policy': 1})

Question num 2 importance: 0.0230668614605
I enjoy working with people, and I have strong verbal and written communication skills.

Counter({'Business and Communication': 7, 'Math, Sciences, and Engineering': 1})

Question num 15 importance: 0.0237914821955
I like science and math, and I have mechanical aptitude.

Counter({'Math, Sciences, and Engineering': 8, 'Public Service, Law, and

# Fitting forest with entire dataset and evaluating

In [36]:
forest_f = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, class_weight='balanced')

In [37]:
forest_f.fit(Xf_train, yf_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [38]:
yf_train_proba = forest_f.predict_proba(Xf_train)

In [39]:
log_loss(yf_train, yf_train_proba)

1.519259258080784

In [40]:
yf_test_proba = forest_f.predict_proba(Xf_test)

In [41]:
log_loss(yf_test, yf_test_proba)

1.615844592999224

In [42]:
yf_train_pred = forest_f.predict(Xf_train)

In [43]:
yf_test_pred = forest_f.predict(Xf_test)

In [44]:
print classification_report(yf_train, yf_train_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.32      0.25      0.28    162846
                  Creative Arts       0.13      0.46      0.20     51923
Math, Sciences, and Engineering       0.49      0.27      0.35    266436
Public Service, Law, and Policy       0.29      0.22      0.25    147534
                Social Sciences       0.26      0.40      0.32    121261

                    avg / total       0.35      0.29      0.30    750000



In [45]:
print classification_report(yf_test, yf_test_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.28      0.21      0.24     54513
                  Creative Arts       0.10      0.35      0.16     17552
Math, Sciences, and Engineering       0.44      0.25      0.32     88683
Public Service, Law, and Policy       0.24      0.18      0.21     49158
                Social Sciences       0.23      0.35      0.28     40094

                    avg / total       0.31      0.25      0.26    250000



In [93]:
rank = 35
q_ranks = {}
for i in np.argsort(forest_f.feature_importances_):
    print "Rank: {}".format(rank)
    print "Question num {} importance: {}".format(i+1, forest_f.feature_importances_[i])
    print mapped_lst[i][0] + '\n'
    q_ranks[i+1] = rank
    rank -= 1

Rank: 35
Question num 14 importance: 0.0233332883809
I have a great memory and have the ability to recognize general principles in particular situations.

Rank: 34
Question num 13 importance: 0.0234159805546
I like to experiment with better and faster ways of doing things.

Rank: 33
Question num 20 importance: 0.023853648839
I have strong morals and enjoy helping people.

Rank: 32
Question num 16 importance: 0.023951972593
I can work on projects very carefully and thoroughly, with patience and determination.

Rank: 31
Question num 19 importance: 0.0241704714261
I'm interested in law and human nature, and I have the ability to correlate and reason.

Rank: 30
Question num 6 importance: 0.0241743726346
I like to sing and/or play musical instruments.

Rank: 29
Question num 1 importance: 0.0241973331153
I have multiple interests and a natural curiosity about the world.

Rank: 28
Question num 29 importance: 0.0242199145374
I'm interested in sustainability initiatives and the environment.

Ra

# Fully Fitted Model Recap

## Question Importance

Very interesting results when it comes to the question importances as the top two questions for both the shard and the full set remained the same. In fact, when the model was fitted on the entire data set, those two questions distanced themselves significantly from the rest of the questions. I did run both models multiple times and those two questions consistently came back as most important.

After the top two questions however, there isn't much division between the rest of the questions. This is unfortunate, since I can't support the questions I choose with data as well, but it does give me some flexibility in choosing questions I personally find better.

## Evaluation

As expected, the model is pretty horrible at predicting anything. The accuracy metrics are extremely low, but that is often expected when predicting multi-classes. In this project, especially, we would expect this since the original labeled data was already extremely variable since each set of answers a bot gave us was then randomly labeled according to a probability distribution. Log loss is interesting as it is fairly low (though "fairly low" doesn't really mean anything), which was a little unexpected considering I expected my model to be horrible at actually making predictions. However, considering what log loss measures (large penalty for being really sure about a wrong classification), this actually begins to make more sense. My model inherently is never that sure about it's predictions and this lack of commitment prevents it from making high probability but incorrect classficiations. Basically, it is kinda wrong all the time, but log loss only really blows up only when you're sure about something which my model never is. Interesting example of the power and limitations of log loss.  

But what is most important here is that the performance of the model stays fairly consistent from the train to the test set, meaning it should perform about the same even on new (user) data. It will also give us something to compare the limited question data set model to.

# Limited Question Model Fitting

## How did I pick these questions?

Very carefully. Seriously, though, I took the top 4 questions (as ranked by feature importance) and after that the feature importances level off to relatively equal. At that point, I started picking questions that were a little less direct (direct ex: "I like science" vs. indirect: "I can work on projects very carefully and thoroughly, with patience and determination.) for a little subtler touch to the survey. This is a common survey technique and while I can most definitely NOT make any comment on the validity (I'd have to spend possibly years validating good survey questions) of that, I do think it is a common sense thing to do in order to capture more subtle information about my user

In [103]:
q_set1 = copy.deepcopy(giant_df[['q2', 'q3', 'q4', 'q8', 'q10', 'q16', 'q17', 'q20', 'q27', 'q28', 'labels']])

In [64]:
X_q1 = q_set1.drop('labels', axis=1).values
y_q1 = q_set1['labels'].values

In [65]:
X_train, X_test, y_train, y_test = train_test_split(X_q1, y_q1, test_size=0.25)

In [66]:
forest = RandomForestClassifier(n_estimators=100, oob_score=True, n_jobs=-1, class_weight='balanced')

In [67]:
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [68]:
y_train_proba = forest.predict_proba(X_train)

In [69]:
y_test_proba = forest.predict_proba(X_test)

In [70]:
log_loss(y_train, y_train_proba)

1.5729066340310696

In [71]:
log_loss(y_test, y_test_proba)

1.5779950364277777

In [72]:
y_train_pred = forest.predict(X_train)

In [73]:
y_test_pred = forest.predict(X_test)

In [74]:
print classification_report(y_train, y_train_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.28      0.18      0.22    163005
                  Creative Arts       0.11      0.47      0.17     51915
Math, Sciences, and Engineering       0.45      0.14      0.21    266367
Public Service, Law, and Policy       0.26      0.15      0.19    147768
                Social Sciences       0.23      0.47      0.31    120945

                    avg / total       0.31      0.23      0.22    750000



In [75]:
print classification_report(y_test, y_test_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.27      0.18      0.21     54354
                  Creative Arts       0.10      0.45      0.17     17560
Math, Sciences, and Engineering       0.44      0.14      0.21     88752
Public Service, Law, and Policy       0.25      0.14      0.18     48924
                Social Sciences       0.22      0.46      0.30     40410

                    avg / total       0.31      0.22      0.22    250000



In [78]:
final_question_nums = [2, 3, 4, 8, 10, 16, 17, 20, 27, 28]

In [100]:
for q_num in final_question_nums:
    print "Question: {}".format(mapped_lst[q_num-1][0])
    print "Question num: {}".format(q_num)
    print "Question rank: {}".format(q_ranks[q_num])
    print mapped_lst[q_num-1][2]
    print

Question: I enjoy working with people, and I have strong verbal and written communication skills.
Question num: 2
Question rank: 3
Counter({'Business and Communication': 7, 'Math, Sciences, and Engineering': 1})

Question: I have interest and/or ability in art.
Question num: 3
Question rank: 1
Counter({'Creative Arts': 4})

Question: I'm interested in graphic and/or Web design.
Question num: 4
Question rank: 6
Counter({'Creative Arts': 1, 'Math, Sciences, and Engineering': 1})

Question: I like working with people, and I enjoy variety in my work.
Question num: 8
Question rank: 4
Counter({'Social Sciences': 4, 'Business and Communication': 3, 'Public Service, Law, and Policy': 1})

Question: I have strong verbal ability and enjoy learning about other cultures and civilizations through language and literature.
Question num: 10
Question rank: 2
Counter({'Social Sciences': 9, 'Business and Communication': 1})

Question: I can work on projects very carefully and thoroughly, with patience an

In [98]:
considering_questions = [13, 19, 22, 24, 25]

In [99]:
for q_num in considering_questions:
    print "Question: {}".format(mapped_lst[q_num-1][0])
    print "Question num: {}".format(q_num)
    print "Question rank: {}".format(q_ranks[q_num])
    print mapped_lst[q_num-1][2]
    print

Question: I like to experiment with better and faster ways of doing things.
Question num: 13
Question rank: 34
Counter({'Math, Sciences, and Engineering': 7, 'Public Service, Law, and Policy': 1})

Question: I'm interested in law and human nature, and I have the ability to correlate and reason.
Question num: 19
Question rank: 31
Counter({'Public Service, Law, and Policy': 3})

Question: I'm good at analyzing, comparing, and interpreting data.
Question num: 22
Question rank: 27
Counter({'Business and Communication': 2, 'Math, Sciences, and Engineering': 1})

Question: I have an analytic and systematic mind. I'm also good at organizing and delegating responsibilities.
Question num: 24
Question rank: 12
Counter({'Business and Communication': 2})

Question: I'm patient and active, and I love working with children.
Question num: 25
Question rank: 25
Counter({'Business and Communication': 1})



In [104]:
q_set2 = copy.deepcopy(giant_df[['q2', 'q3', 'q4', 'q8', 'q10', 'q16', 'q17', 'q20', 'q24', 'q27', 'labels']])

In [105]:
X_q2 = q_set2.drop('labels', axis=1).values
y_q2 = q_set2['labels'].values

In [106]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X_q2, y_q2, test_size=0.25)

In [123]:
forest2 = RandomForestClassifier(n_estimators=100, n_jobs=-1, class_weight='balanced')

In [124]:
forest2.fit(X2_train, y2_train)

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [125]:
y2_train_proba = forest2.predict_proba(X2_train)

In [126]:
y2_test_proba = forest2.predict_proba(X2_test)

In [127]:
log_loss(y2_train, y2_train_proba)

1.5727944669078691

In [128]:
log_loss(y2_test, y2_test_proba)

1.5775857533701321

In [139]:
y2_train_pred = forest2.predict(X2_train)

In [140]:
y2_test_pred = forest2.predict(X2_test)

In [141]:
print classification_report(y2_train, y2_train_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.28      0.19      0.23    163043
                  Creative Arts       0.11      0.47      0.17     52134
Math, Sciences, and Engineering       0.44      0.14      0.21    266374
Public Service, Law, and Policy       0.26      0.14      0.19    147233
                Social Sciences       0.23      0.46      0.30    121216

                    avg / total       0.31      0.23      0.22    750000



In [142]:
print classification_report(y2_test, y2_test_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.27      0.18      0.22     54316
                  Creative Arts       0.10      0.46      0.17     17341
Math, Sciences, and Engineering       0.44      0.14      0.21     88745
Public Service, Law, and Policy       0.25      0.14      0.18     49459
                Social Sciences       0.22      0.45      0.30     40139

                    avg / total       0.31      0.22      0.22    250000



In [119]:
final_question_nums = [2, 3, 4, 8, 10, 16, 17, 20, 24, 27]

In [120]:
for q_num in final_question_nums:
    print "Question: {}".format(mapped_lst[q_num-1][0])
    print "Question num: {}".format(q_num)
    print "Question rank: {}".format(q_ranks[q_num])
    print mapped_lst[q_num-1][2]
    print

Question: I enjoy working with people, and I have strong verbal and written communication skills.
Question num: 2
Question rank: 3
Counter({'Business and Communication': 7, 'Math, Sciences, and Engineering': 1})

Question: I have interest and/or ability in art.
Question num: 3
Question rank: 1
Counter({'Creative Arts': 4})

Question: I'm interested in graphic and/or Web design.
Question num: 4
Question rank: 6
Counter({'Creative Arts': 1, 'Math, Sciences, and Engineering': 1})

Question: I like working with people, and I enjoy variety in my work.
Question num: 8
Question rank: 4
Counter({'Social Sciences': 4, 'Business and Communication': 3, 'Public Service, Law, and Policy': 1})

Question: I have strong verbal ability and enjoy learning about other cultures and civilizations through language and literature.
Question num: 10
Question rank: 2
Counter({'Social Sciences': 9, 'Business and Communication': 1})

Question: I can work on projects very carefully and thoroughly, with patience an

In [137]:
with open('./firstmodel.pkl', 'w') as f:
    pickle.dump(forest2, f)

In [138]:
with open('./firstmodel.pkl') as f:
    reload_forest = pickle.load(f)

In [143]:
yr_train_proba = reload_forest.predict_proba(X2_train)

In [144]:
yr_test_proba = reload_forest.predict_proba(X2_test)

In [145]:
log_loss(y2_train, yr_train_proba)

1.5727944669078691

In [146]:
log_loss(y2_test, yr_test_proba)

1.5775857533701321

In [147]:
yr_train_pred = reload_forest.predict(X2_train)

In [148]:
yr_test_pred = reload_forest.predict(X2_test)

In [149]:
print classification_report(y2_train, yr_train_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.28      0.19      0.23    163043
                  Creative Arts       0.11      0.47      0.17     52134
Math, Sciences, and Engineering       0.44      0.14      0.21    266374
Public Service, Law, and Policy       0.26      0.14      0.19    147233
                Social Sciences       0.23      0.46      0.30    121216

                    avg / total       0.31      0.23      0.22    750000



In [150]:
print classification_report(y2_test, yr_test_pred)

                                 precision    recall  f1-score   support

     Business and Communication       0.27      0.18      0.22     54316
                  Creative Arts       0.10      0.46      0.17     17341
Math, Sciences, and Engineering       0.44      0.14      0.21     88745
Public Service, Law, and Policy       0.25      0.14      0.18     49459
                Social Sciences       0.22      0.45      0.30     40139

                    avg / total       0.31      0.22      0.22    250000

