In [1]:
%pylab inline
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc
from sklearn import grid_search

import seaborn as sns
# Seaborn presentation defaults: the larger "talk" scaling on a dark grid background.
sns.set_context(context='talk')
sns.set_style(style='darkgrid')

# Default figure size in inches (width, height); kept after the seaborn calls
# in case they also touch figure defaults.
rcParams['figure.figsize'] = 8.0, 6.0

Populating the interactive namespace from numpy and matplotlib


In [2]:
def plotROCCurve(y_test, prediction_prob, filename='logistic_regression_aucroc.png'):
    """Plot a ROC curve on the current figure and optionally save it to disk.

    Parameters
    ----------
    y_test : array-like
        True binary labels of the evaluated samples.
    prediction_prob : 2-D array
        Per-sample scores; column 1 is used as the score of the positive
        class (presumably ``predict_proba`` output -- confirm against caller).
    filename : str or None, optional
        Path the figure is written to. Defaults to the file name this
        notebook has always used; pass ``None`` to skip saving.

    Returns
    -------
    None
    """
    # Thresholds are returned by roc_curve but not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test, prediction_prob[:, 1])
    roc_auc = auc(fpr, tpr)

    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    # Dashed diagonal as a visual reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    # Anchor the legend in figure coordinates rather than axes coordinates.
    plt.legend(bbox_to_anchor=(0.9, 0.2),
               bbox_transform=plt.gcf().transFigure)

    if filename is not None:
        plt.savefig(filename, bbox_inches='tight', dpi=150)

In [3]:
# Read the pre-processed project table; the first CSV column becomes the index.
data = pd.read_csv(
    'processed_data_06-20-15.csv',
    index_col=0,
)
# Show the first rows as this cell's output.
data.head()

Unnamed: 0_level_0,date_posted,funding_status,essay,project_resource_type
_projectid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"""00000ce845c00cbf0686c992fc369df4""",2013-02-02 00:00:00,completed,"""Find a way or make one. That's our class mott...",Technology
"""0000954e7c49ebfbcd91ed9052070bee""",2013-10-03 00:00:00,expired,"""Isn't it more fun to have hands on activities...",Supplies
"""0000fe73a95dae43c4bd72a142760efb""",2014-08-24 00:00:00,completed,"""My students learn by playing! Clarinet and ob...",Supplies
"""00014d8302ca2cc1460f11e6acc12b76""",2013-09-13 00:00:00,completed,"""Do you remember the first time you hit the ke...",Technology
"""00016efa41348375e57a3bc1270114f9""",2013-08-04 00:00:00,completed,"""Remember the first time you touched a frog? O...",Other


In [4]:
# Class balance of the target column: number of funded vs. expired essays.
# Single-argument print(...) with %-formatting emits the same text under
# Python 2 and Python 3 (the bare print statement is Python 2 only).
print('n_funded essays: %d' % (data['funding_status'] == 'completed').sum())
print('n_expired essays: %d' % (data['funding_status'] == 'expired').sum())

n_funded essays: 179348
n_expired essays: 60988


In [5]:
# Split the data into training/test sets and vectorize the essays with TF-IDF.

# Binary target encoding: 'completed' -> 1, 'expired' -> 0.
funding_map = {'completed':1, 'expired':0}

# Seeded generator so the random split -- and every score derived from it --
# is reproducible across kernel restarts.
rng = np.random.RandomState(42)

# Each row independently lands in the training set with probability 0.9,
# so the split is approximately (not exactly) 90/10.
msk = rng.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

# Unigram TF-IDF with English stop words removed, capped at 4000 features.
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1), stop_words='english', max_features=4000)

# The vectorizer is fitted on the training essays only; the test essays are
# transformed with that already-fitted vectorizer.
X_train = vectorizer.fit_transform(train['essay'])
y_train = train['funding_status'].map(funding_map)

X_test = vectorizer.transform(test['essay'])
y_test = test['funding_status'].map(funding_map)

In [6]:
# Candidate values for the regularization parameter C, one per decade from
# 1e-4 to 1e4. The original list skipped 1e1, leaving a gap between 1 and 1e2.
C = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4]
# Every C value is tried with both penalty types.
param_grid = {'penalty':['l1', 'l2'], 'C': C}

In [7]:
# Grid search over `param_grid` for a class-weighted logistic regression,
# scored by ROC AUC with 5-fold cross-validation; n_jobs=-1 requests all cores.
# NOTE(review): class_weight='auto' and the sklearn.grid_search module belong to
# older scikit-learn releases; newer ones use 'balanced' and
# sklearn.model_selection -- confirm the pinned version before upgrading.
logistic_regressions = grid_search.GridSearchCV(
    estimator=LogisticRegression(class_weight='auto'),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [8]:
# Run the grid search on the training split only; the test split is not used here.
logistic_regressions.fit(X_train, y_train)

In [9]:
# One line per entry of the fitted search's grid_scores_.
# print(score) with a single argument behaves the same on Python 2 and 3.
for score in logistic_regressions.grid_scores_:
    print(score)

In [10]:
# Display the best parameter setting found by the grid search.
logistic_regressions.best_params_

In [11]:
# Display the score of the best setting (the search was configured with scoring='roc_auc').
logistic_regressions.best_score_

In [12]:
# Score the held-out test split with the fitted search, then draw its ROC curve.
test_prediction_prob = logistic_regressions.predict_proba(X_test)
plotROCCurve(y_test, test_prediction_prob)