In [2]:
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [3]:
import random
import string
import warnings

import nltk
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC, SVC
from sklearn.cross_validation import StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix 
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

from utils.clean_up import *
from utils.categorize_demographics import *
from utils.nonnegative_matrix_factorization import nmf_inspect, nmf_labels
from utils.distinctive_tokens import log_odds_ratio
from utils.classification import betas

# Suppress every warning for the rest of the session (this also hides
# library deprecation notices).
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
# Maps each essay key to the title of its prompt
essay_dict = dict([
    ('essay0', 'My self summary'),
    ('essay1', 'What I\'m doing with my life'),
    ('essay2', 'I\'m really good at'),
    ('essay3', 'The first thing people notice about me'),
    ('essay4', 'Favorite books, movies, tv, food'),
    ('essay5', 'The six things I could never do without'),
    ('essay6', 'I spend a lot of time thinking about'),
    ('essay7', 'On a typical Friday night I am'),
    ('essay8', 'The most private thing I am willing to admit'),
    ('essay9', 'You should message me if'),
])

In [5]:
# Raw profile data, read from a path relative to the notebook's working directory.
df = pd.read_csv('data/profiles.20120630.csv')

# clean_up is a project helper; it is unpacked into two frames here,
# presumably one per essay named in essay_list -- TODO confirm against utils.clean_up.
essay_list = ['essay0', 'essay4']
df_0, df_4 = clean_up(df, essay_list)

# recategorize is a project helper (star-imported); the 'drugs' column of its
# result is what the labelling cell below reads.
df_4 = recategorize(df_4)

In [6]:
#Separating out the drug users, non drug users, and unknowns
def get_category(text):
    """Map a drug-use answer to a numeric class label.

    Returns 0 if `text` contains 'unknown', otherwise 1 if it contains 'yes',
    otherwise -1. 'unknown' is checked first, so it wins when both appear.
    """
    for marker, label in (('unknown', 0), ('yes', 1)):
        if marker in text:
            return label
    return -1
# .copy() gives main_df its own data, so adding 'Category' below does not
# assign into a slice of df_4 (which triggers pandas' SettingWithCopyWarning).
main_df = df_4[['essay4']].copy()
main_df['Category'] = df_4['drugs'].apply(get_category)
main_df.columns = ['Text', 'Category']
yes_df = main_df[main_df.Category == 1]
unknown_df = main_df[main_df.Category == 0]
no_df = main_df[main_df.Category == -1]
# Balance the classes: keep only as many "no" rows as there are "yes" rows.
# (This limit used to be the hard-coded number 6859.)
no_df = no_df[:len(yes_df)]

Unnamed: 0,Text,Category
0,"books: absurdistan, the republic, of mice and ...",-1
1,i am die hard christopher moore fan. i don't r...,1
2,okay this is where the cultural matrix gets so...,0
3,"bataille, celine, beckett. . . lynch, jarmusch...",0
4,"music: bands, rappers, musicians at the moment...",-1


In [9]:
#Creating training data
train_df = pd.concat([no_df, yes_df], axis=0) #stack the two together
# Shuffle the rows with a fixed seed so the order is the same on every run
train_df = train_df.reindex(np.random.RandomState(0).permutation(train_df.index))
train_df = train_df.reset_index(drop=True)
unknown_df = unknown_df.reset_index(drop=True)
train_df.head()

Unnamed: 0,Text,Category
0,book: janice dickinson: no lifeguard on duty. ...,1
1,"books: satipatthana by analayo, radical accept...",-1
2,"i like to read, and past favorite authors are ...",1
3,"books: 1984 by george orwell, comics movies: a...",-1
4,- favorite books: the girl with the dragon tat...,-1


In [10]:
# REPLACE WITH NMF FEATURES
def add_features(df):
    """Add token, noun and part-of-speech columns derived from df['Text'].

    The frame is modified in place and is also returned. Columns added:
        Tokens       list of regex tokens for each text
        Nouns        space-joined WordNet lemmas of the tokens whose POS tag starts with 'N'
        POS          space-joined POS tags, one per token
        TokenString  space-joined tokens
        TokensNouns  TokenString and Nouns joined by a single space
    """
    # All groups are non-capturing: with a findall-based tokenizer, capturing
    # groups make the result contain the group contents instead of whole tokens.
    # Inside the punctuation class '-', '[' and ']' are escaped; unescaped, the
    # class was closed early by the ']' that follows the backslash.
    pattern = r'''(?x)    # set flag to allow verbose regexps
         (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
       | \w+(?:[-']\w+)*        # words with optional internal hyphens or apostrophes
       | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
       | [!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]+  # these are separate tokens (string.punctuation)
     '''
    df['Tokens'] = df['Text'].apply(lambda text: nltk.regexp_tokenize(text, pattern))

    # Tag every token list once; both the noun and the POS column are derived from it.
    tagged = df['Tokens'].apply(nltk.pos_tag)
    lemmatizer = nltk.WordNetLemmatizer()

    def noun_lemmas(pairs):
        # Lemmatized form of every token tagged as some kind of noun (N*)
        return ' '.join(lemmatizer.lemmatize(word) for word, tag in pairs if tag.startswith('N'))

    def tag_string(pairs):
        # The POS tags alone, in token order
        return ' '.join(tag for _, tag in pairs)

    df['Nouns'] = tagged.apply(noun_lemmas)
    df['POS'] = tagged.apply(tag_string)
    df['TokenString'] = df['Tokens'].apply(' '.join)
    df['TokensNouns'] = df['TokenString'] + ' ' + df['Nouns']

    return df

In [11]:
#vectorize data
def featurize(df, test_df, count_vec):
    """Turn the 'Text' column of two frames into TF-IDF feature matrices.

    count_vec and a fresh TfidfTransformer are fitted on df['Text'] only;
    test_df['Text'] is then transformed with that fitted vocabulary and
    weighting, so both results share the same feature columns.

    Parameters:
        df: training frame with a 'Text' column
        test_df: frame with a 'Text' column to transform into the same feature space
        count_vec: vectorizer (e.g. a CountVectorizer) that this call fits

    Returns:
        (train_tfidf, test_tfidf)
    """
    tfidf_transformer = TfidfTransformer()

    # Fit on the training text only, then reuse the fitted objects for the test text
    tfidf = tfidf_transformer.fit_transform(count_vec.fit_transform(df['Text']))
    test = tfidf_transformer.transform(count_vec.transform(test_df['Text']))

    return tfidf, test

In [12]:
def find_params(df, text_col='TokensNouns'):
    """Grid-search vectorizer and classifier settings for five text classifiers.

    Each classifier (linear SVC, RBF SVC, logistic regression, naive Bayes and
    k-nearest neighbours) sits at the end of the same
    CountVectorizer -> TfidfTransformer pipeline and is tuned by accuracy on
    5 stratified shuffle splits with a 20% test share.

    Parameters:
        df: frame with a 'Category' column and the text column named by text_col
        text_col: column holding the text to vectorize. The default
            'TokensNouns' is the column written by add_features.

    Returns:
        dict mapping the model key ('svc', 'svm', 'log', 'nb', 'knn') to a
        (best score, best parameters) tuple. The same values are printed.

    Raises:
        KeyError: if text_col is not a column of df.
    """
    if text_col not in df.columns:
        raise KeyError('{0!r} is not a column of df; run add_features first '
                       'or pass text_col'.format(text_col))

    #set up cross validation folds
    cv = StratifiedShuffleSplit(df.Category, n_iter=5, test_size=.2)

    def _text_pipeline(classifier):
        # Shared vectorize -> tf-idf front end with the given classifier at the end
        return Pipeline([
            ('vect', CountVectorizer(stop_words = ENGLISH_STOP_WORDS)),
            ('tfidf', TfidfTransformer(use_idf = True)),
            ('clf', classifier)])

    analyzers = ('char_wb','word')
    ngrams_short = ((1, 1), (1,3), (5, 5))
    ngrams_long = ((1, 1), (1,2), (1,3), (5, 5))
    c_values = (1, 1e3, 1e-3)

    # (key, classifier, parameter grid), searched in this order
    searches = [
        #linear svc
        ('svc', LinearSVC(), {
            'vect__ngram_range': ngrams_short,
            'vect__analyzer' : analyzers,
            'clf__C': c_values}),
        #svm with rbf kernel
        ('svm', SVC(), {
            'vect__ngram_range': ngrams_short,
            'vect__analyzer' : analyzers,
            'clf__C': c_values,
            'clf__gamma': (1, 1e2, 1e-2)}),
        #logistic regression
        ('log', LogisticRegression(penalty = 'l2', solver = 'lbfgs', multi_class = 'multinomial'), {
            'vect__ngram_range': ngrams_long,
            'vect__analyzer' : analyzers,
            'clf__C': c_values}),
        #naive bayes
        ('nb', MultinomialNB(fit_prior = True), {
            'vect__ngram_range': ngrams_long,
            'vect__analyzer' : analyzers}),
        #k-nearest neighbours
        ('knn', KNeighborsClassifier(), {
            'vect__ngram_range': ngrams_long,
            'vect__analyzer' : analyzers,
            'clf__n_neighbors': (8, 9, 10, 15),
            'clf__weights': ('uniform', 'distance')}),
    ]

    #fit grid search instance
    results = {}
    for m, classifier, grid in searches:
        gs_clf = GridSearchCV(_text_pipeline(classifier), grid, verbose = 1, cv = cv, n_jobs = -1, scoring = 'accuracy')
        gs_clf = gs_clf.fit(df[text_col], df.Category)

        best_parameters, score = gs_clf.best_params_, gs_clf.best_score_

        print ('{0}:\n\tbest score: {1}\nparameters: {2}'.format(m, score, best_parameters))
        results[m] = (score, best_parameters)

    return results

In [13]:
def _plurality_vote(votes, tie_index):
    """Return the label with the most votes; on a tie return votes[tie_index]."""
    tally = {}
    for label in votes:
        tally[label] = tally.get(label, 0) + 1
    top = max(tally.values())
    leaders = [label for label, n in tally.items() if n == top]
    return leaders[0] if len(leaders) == 1 else votes[tie_index]


# Takes in training data and test data, predicts labels for the test data by voting.
def ensemble_predictor(df, test_df):
    """Predict a label for each row of test_df by a vote of four classifiers.

    The voters, in order, are LinearSVC, LogisticRegression,
    KNeighborsClassifier and an RBF SVC. All are trained on TF-IDF features of
    df['Text'] (word n-grams) against df['Category']. When no single label has
    the most votes, the LogisticRegression vote is used.

    A MultinomialNB model used to be fitted here as well, but its predictions
    were never part of the vote, so it is no longer trained.

    Parameters:
        df: training frame with 'Text' and 'Category' columns
        test_df: frame with a 'Text' column to label

    Returns:
        list with one predicted label per row of test_df
    """
    ytrain = df['Category']

    # Three of the voters use identical (1, 3) word n-gram features, so build them once.
    xtrain_13, xtest_13 = featurize(df, test_df, CountVectorizer(analyzer='word', ngram_range=(1, 3)))
    xtrain_12, xtest_12 = featurize(df, test_df, CountVectorizer(analyzer='word', ngram_range=(1, 2)))

    # (model, training features, test features); position 1 is the tie-breaker
    voters = [
        (LinearSVC(C=1), xtrain_13, xtest_13),
        (LogisticRegression(C=1), xtrain_12, xtest_12),
        (KNeighborsClassifier(n_neighbors=15), xtrain_13, xtest_13),
        (SVC(C=1, gamma=1), xtrain_13, xtest_13),
    ]
    per_model = [model.fit(xtrain, ytrain).predict(xtest) for model, xtrain, xtest in voters]

    # zip(*per_model) yields one tuple of votes per test row
    return [_plurality_vote(list(row), tie_index=1) for row in zip(*per_model)]

In [14]:
#evaluate on 10 stratified shuffle splits (90% train / 10% test), inspect results
def cross_validate(targets, df):
    """Print the mean test score of each classifier and of the voting ensemble.

    Every model is fitted and scored on 10 stratified shuffle splits with a 10%
    test share. The splitter has no fixed random state, so each pass over it
    may draw different splits.

    Parameters:
        targets: class label for each row of df, in row order
        df: frame with 'Text' and 'Category' columns

    Returns:
        None; the mean scores are printed.
    """
    # The splitter yields row positions, so index positionally (.iloc / ndarray)
    # rather than by label; label lookups are only right for a default 0..n-1 index.
    labels = np.asarray(targets)
    cv = StratifiedShuffleSplit(labels, n_iter=10, test_size=.1)

    # (printed title, model, CountVectorizer settings)
    candidates = [
        ('Linear SVC', LinearSVC(C=1), dict(analyzer='word', ngram_range=(1,3))),
        ('RBF SVC', SVC(C=1, gamma=1.0), dict(analyzer='char_wb', ngram_range=(5, 5))),
        ('Naive Bayes', MultinomialNB(alpha=1.0, fit_prior=True), dict(analyzer='word', ngram_range=(1,2))),
        # Disabled in the original as well:
        # ('AdaBoost', AdaBoostClassifier(), dict(analyzer='word', ngram_range=(1,2))),
        ('LogReg', LogisticRegression(C=1), dict(analyzer='word', ngram_range=(1,2))),
        ('KNN', KNeighborsClassifier(n_neighbors=15), dict(analyzer='word', ngram_range=(1, 3))),
    ]

    for title, mod, vec_settings in candidates:
        scores = []
        for tr, tt in cv:
            xtrain, xtest = featurize(df.iloc[tr], df.iloc[tt], CountVectorizer(**vec_settings))
            mod.fit(xtrain, labels[tr])
            scores.append(mod.score(xtest, labels[tt]))
        print('\n{0}\n\t mean score: {1}'.format(title, np.mean(scores)))

    # The ensemble trains its own models, so it gets the frames rather than features
    scores = []
    for tr, tt in cv:
        predictions = ensemble_predictor(df.iloc[tr], df.iloc[tt])
        scores.append(accuracy_score(labels[tt], predictions))
    print('\nEnsemble Predictor\n\t mean score: {0}'.format(np.mean(scores)))


In [15]:
#Only do when necessary. Will take 12 hours
#find_params(train_df)

In [16]:
# Print the mean score of each classifier and of the voting ensemble on the training data
cross_validate(train_df['Category'], train_df)


AdaBoost
	 mean score: 0.6270408163265306

LogReg
	 mean score: 0.686807580174927


In [17]:
#test_df = add_features(unknown_df)

In [None]:
# Train on the labelled profiles and predict a label for every row of unknown_df
# (the rows whose Category is 0, i.e. the drug answer contained 'unknown').
predictions = ensemble_predictor(train_df, unknown_df)

In [None]:
# One row per prediction; reset_index() supplies the running row number used as ID.
# .head() used to be applied here, which cut the saved file down to five rows.
# NOTE: this rebinds `df`, which held the raw profiles earlier in the notebook.
df = pd.DataFrame(predictions).reset_index()
df.columns = (['ID','Category'])
df['ID'] = df['ID']+1  # 1-based IDs
df.to_csv('drug_predictions.csv')