In [193]:
from model_eval import ModelEvaluator
from data_prep import DataPrep
from model import Model

import pymongo
import datetime
from bs4 import BeautifulSoup
import yaml
import time

import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, RidgeCV, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import MinMaxScaler
import numpy as np

from sklearn.pipeline import Pipeline
from regression_tools.dftransformers import (
    ColumnSelector, Identity, FeatureUnion, MapFeature, Intercept)

from sklearn.model_selection import GridSearchCV

{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}

In [191]:
# Personality traits used as prediction targets; the comparison cells loop
# over this list and handle one trait at a time.
personality_traits = [
    'Openness', 'Conscientiousness', 'Extraversion',
    'Agreeableness', 'Neuroticism',
]

# Names of the classifiers handed to ModelEvaluator.compare_scores.
# 'GradientBoostingClassifier' is currently disabled and therefore not listed.
models = ['LogisticRegression', 'RandomForestClassifier', 'MultinomialNB']

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [187]:
# Grid-search the number of trees of a random forest on the essay dataset.
# NOTE(review): 'O' presumably selects the Openness target -- confirm against
# DataPrep.prep_data, since the other cells pass full trait names.
SEED = 42  # fixed seed so a re-run selects the same best parameters

dp = DataPrep()
X, y = dp.prep_data('essay', 'O')
# NOTE(review): `me` is not used in this cell; it is kept so the
# notebook-level name is still defined after the cell runs.
me = ModelEvaluator(X, y, 'O')

# n_estimators=10 is only a placeholder: GridSearchCV sets the value from
# param_grid on each clone of the estimator that it fits.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=10,
    oob_score=True,
    random_state=SEED,  # without a seed the forest (and the winner) varies per run
)

# Candidate forest sizes to evaluate.
param_grid = {
    'n_estimators': [80, 90, 100, 110, 120],
}

# 5-fold cross-validated search over the candidate forest sizes.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X, y)
print(CV_rfc.best_params_)

{'n_estimators': 110}


In [190]:
# Essay dataset: for every personality trait, prepare the data and print the
# score comparison of all classifiers named in `models`.
ESSAY_DATASET = 'essay'

for trait in personality_traits:
    # A new DataPrep instance is created for every trait.
    dp = DataPrep()
    X, y = dp.prep_data(ESSAY_DATASET, trait)

    # compare_scores prints the per-model metrics for this trait.
    me = ModelEvaluator(X, y, trait)
    me.compare_scores(models)

    # Visual separator between the reports of consecutive traits.
    print('\n')

Model performance for trait Openness prediction:

LogisticRegression: 
Accuracy score: 0.615913881361
F1 score: 0.674971687429

RandomForestClassifier: 
Accuracy score: 0.587744655224
F1 score: 0.622057001239

MultinomialNB: 
Accuracy score: 0.609861487504
F1 score: 0.639247943596



KeyboardInterrupt: 

In [192]:
# Statuses dataset: for every personality trait, prepare the data and print
# the score comparison of all classifiers named in `models`.
STATUS_DATASET = 'status'

for trait in personality_traits:
    # A new DataPrep instance is created for every trait.
    dp = DataPrep()
    X, y = dp.prep_data(STATUS_DATASET, trait)

    # compare_scores prints the per-model metrics for this trait.
    me = ModelEvaluator(X, y, trait)
    me.compare_scores(models)

    # Visual separator between the reports of consecutive traits.
    print('\n')

Model performance for trait Openness prediction:

LogisticRegression: 
Accuracy score: 0.732355734185
F1 score: 0.844687610307

RandomForestClassifier: 
Accuracy score: 0.937979118591
F1 score: 0.977886977887

MultinomialNB: 
Accuracy score: 0.629077447181
F1 score: 0.767915309446

Highest Accuracy score: 0.937979118591
Model: RandomForestClassifier

Lowest F1 score: 0.767915309446
Model: MultinomialNB



Model performance for trait Conscientiousness prediction:

LogisticRegression: 
Accuracy score: 0.596075355059
F1 score: 0.49963045085

RandomForestClassifier: 
Accuracy score: 0.909591575182
F1 score: 0.971144278607

MultinomialNB: 
Accuracy score: 0.593910332529
F1 score: 0.472858866104

Highest Accuracy score: 0.909591575182
Model: RandomForestClassifier

Lowest F1 score: 0.472858866104
Model: MultinomialNB



Model performance for trait Extraversion prediction:

LogisticRegression: 
Accuracy score: 0.650461398244
F1 score: 0.465324384787

RandomForestClassifier: 
Accuracy score: 0

In [None]:
# Names of the regressors to compare on the statuses dataset.
# NOTE(review): this rebinds `models`, which the classification cells also
# read; re-running those cells after this one would hand them regressor
# names. Consider a dedicated name once all consumers are known.
models = ['LinearRegression', 'RandomForestRegressor', 'Ridge']

STATUS_DATASET = 'status'

# For every personality trait, prepare the data in regression mode and print
# the comparison of all regressors named in `models`.
for trait in personality_traits:
    # A new DataPrep instance is created for every trait.
    dp = DataPrep()
    X, y = dp.prep_data(STATUS_DATASET, trait, regression=True)

    me = ModelEvaluator(X, y, trait)
    me.compare_scores(models, regression=True)

    # Visual separator between the reports of consecutive traits.
    print('\n')

Model performance for trait Openness prediction:

LinearRegression: 
MSE: 78423062703.5

RandomForestRegressor: 
MSE: 0.0113510926652



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.7664743779568566e-17


Ridge: 


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 7.483162228579032e-17
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 9.954430393395314e-17


MSE: 296043.804727



Model performance for trait Conscientiousness prediction:

LinearRegression: 
MSE: 15718383092.4

RandomForestRegressor: 
MSE: 0.0168183765486



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.7664743779568566e-17


Ridge: 


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 7.483162228579032e-17
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 9.954430393395314e-17


MSE: 1263180.96837



Model performance for trait Extraversion prediction:

LinearRegression: 
MSE: 74574769807.8

RandomForestRegressor: 
MSE: 0.0169384941467



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.7664743779568566e-17


Ridge: 


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 7.483162228579032e-17
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 9.954430393395314e-17


MSE: 12179.1611765



Model performance for trait Agreeableness prediction:

LinearRegression: 
MSE: 8280583443.6

RandomForestRegressor: 
MSE: 0.0130685272237



Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 3.7664743779568566e-17


Ridge: 


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 7.483162228579032e-17
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 9.954430393395314e-17


MSE: 12.271152929



Model performance for trait Neuroticism prediction:

LinearRegression: 
MSE: 8160789008.34

RandomForestRegressor: 


<9917x15185 sparse matrix of type '<class 'numpy.float64'>'
	with 69667 stored elements in Compressed Sparse Row format>

Unnamed: 0,#AUTHID,TEXT,cEXT,cNEU,cAGR,cCON,cOPN,WPS,Unique,Dic,...,Dash,Quote,Apostro,Parenth,Otherp,Swear,Nonfl,Fillers,WC,textVect
0,1997_504851.txt,"Well, right now I just woke up from a mid-day ...",n,y,y,n,y,0.350500,-0.413255,0.707633,...,-0.451544,0.648584,0.241039,4.992247,3.646778,4.822550,4.706556,4.755983,660,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1997_605191.txt,"Well, here we go with the stream of consciousn...",n,n,y,n,n,-0.600111,0.639878,0.232566,...,-0.451544,-0.408772,1.998337,4.456528,3.615503,5.013752,5.133913,4.550620,646,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1997_687252.txt,An open keyboard and buttons to push. The thin...,n,y,n,y,y,1.140361,0.789245,-0.372853,...,-0.451544,0.066692,-0.586689,4.639688,4.133195,4.540890,4.567654,4.947274,756,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1997_568848.txt,I can't believe it! It's really happening! M...,y,n,y,y,n,2.170028,-0.125285,-2.543948,...,-0.451544,-0.408772,1.920435,5.070173,3.759650,5.302607,5.031531,4.965156,344,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1997_688160.txt,"Well, here I go with the good old stream of co...",y,n,y,n,y,0.361767,-0.080132,0.186415,...,-0.451544,0.158965,1.783663,4.743788,3.982751,4.429570,4.580647,4.733271,910,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,1997_722902.txt,Today. Had to turn the music down. Today I wen...,y,n,y,n,y,0.044302,-0.042485,-0.785769,...,0.394059,0.840302,1.272379,4.697723,3.527226,4.778733,4.885299,5.270074,725,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,1997_724708.txt,Stream of consciousness. What should I write a...,n,n,y,n,n,0.957762,-1.551906,0.710472,...,-0.451544,0.104247,-0.126271,4.655691,3.257753,4.244202,4.729286,5.059499,705,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,1997_724794.txt,The RTF305 Usenet site is a piece of garbage! ...,n,n,n,y,y,-0.992716,1.081093,-0.673970,...,-0.451544,0.380231,1.077617,4.402641,3.996814,4.877339,4.345637,4.896502,450,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,1997_628043.txt,I'm really unsure about this assignment becaus...,y,y,n,y,y,-1.008314,-0.810013,-2.267089,...,-0.451544,1.600789,-2.100591,4.648025,4.186170,4.487736,3.832334,5.385575,527,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,1997_708036.txt,Today was a tough day for me. I can't believed...,y,y,y,y,n,-0.112559,0.638396,-0.912873,...,-0.451544,-0.037619,-0.868729,4.573479,4.118152,4.217063,4.557049,4.408867,485,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


ValueError: too many values to unpack (expected 2)