In [2]:
%matplotlib

Using matplotlib backend: MacOSX


In [3]:

import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib as mpl
import numpy as np
%matplotlib inline
sns.set_palette('colorblind')
sns.set_style('white')

In [4]:
import pandas as pd
import nltk
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix 

In [5]:
from nltk.corpus import brown

In [6]:
from collections import Counter

In [7]:
def get_data():
    df = pd.read_csv('newtrain.csv')
    return df

In [8]:
def get_test_data():
    df = pd.read_csv('newtest.csv')
    return df

In [9]:
def add_features(df):

    #tokenize text
    pattern = r'''(?x)    # set flag to allow verbose regexps
         ([A-Z]\.)+        # abbreviations, e.g. U.S.A.
       | \w+([-']\w+)*        # words with optional internal hyphens
       | \$?\d+(\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
       | [!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+  # these are separate tokens (string.punctuation)
     '''
    tokenize = lambda text: nltk.regexp_tokenize(text, pattern)
    df['Tokens'] = df['Text'].apply(tokenize)
    
    def pos_tag(text):
        tuples = nltk.pos_tag(text)
        tags = []
        for t in tuples:
            tags.append(t[1])
        return tags
    def get_nouns(text):
        wnlemmatizer = nltk.WordNetLemmatizer()
        tuples = nltk.pos_tag(text)
        tags = []
        for t in tuples:
            if t[1][0] == 'N':
                tags.append(wnlemmatizer.lemmatize(t[0]))
        return tags
    df['Nouns'] = df['Tokens'].apply(get_nouns)
    df['POS'] = df['Tokens'].apply(pos_tag)
    make_string = lambda a: ' '.join(i for i in a)
    df['Nouns'] = df['Nouns'].apply(make_string)
    df['POS'] = df['POS'].apply(make_string)
    return df

In [25]:
#vectorize data
#turns words into a list of vectors - vector length is the total number of words
#Vector elements correspond to 1 word (1/0 if word is/not present in the current item)
def featurize(df, test_df, max_f, targets=True):
    count_vec = CountVectorizer(analyzer = 'char_wb', ngram_range= (5,5), stop_words = ENGLISH_STOP_WORDS)
    noun_vec = CountVectorizer(analyzer = 'char_wb')
    pos_vec = CountVectorizer()
    vec_tar = LabelEncoder()
    targets = df['Category']
        
    counts = count_vec.fit_transform(df['Text'])
    #print("Counts shape: " + str(counts.shape))
    nouns = noun_vec.fit_transform(df['Nouns'])
    #print("Nouns shape: " + str(nouns.shape))
    pos = pos_vec.fit_transform(df['POS'])
    #print("POS shape: " + str(pos.shape))
    
    
    tfidf_transformer = TfidfTransformer()
    tfidf = tfidf_transformer.fit_transform(counts)
    
    
    test = count_vec.transform(test_df['Text'])
    test = tfidf_transformer.transform(test)
    test_nouns = noun_vec.transform(test_df['Nouns'])
    test_pos = pos_vec.transform(test_df['POS'])
    
    #To combine or not to combine
    nouns_array = nouns.toarray()
    pos_array = pos.toarray()
    combined = np.hstack((nouns_array, pos_array))
    
    tnouns_array = test_nouns.toarray()
    tpos_array = test_pos.toarray()
    
    tcombined = np.hstack((tnouns_array, tpos_array))
    
    return targets, tfidf, test

In [11]:
# Just for the test data
def get_test_features(df, count_vec, transformer):
    
    test_counts = count_vec.transform(df['Text'])
    test_tfidf = transformer.tranform(test_counts)
    return test_tfidf

In [12]:
#evaluate using 10 fold cross-validation, inspect results
#KNN WITH CV#
def cross_validate(targets, df):
    #SVC
    mod = LinearSVC(C=.1)
    cv = StratifiedShuffleSplit(targets, n_iter=10, test_size=.1)
    
    scores = []
    for tr, tt in cv:
        train = df.loc[tr]
        test = df.loc[tt]
        result = featurize(train, test, 1000)
        xtrain = result[1]
        xtest = result[2]
        mod.fit(xtrain, targets[tr])
        scores.append(mod.score(xtest, targets[tt]))

    print('\nLinear SVC\n\t mean score: {0}'.format(np.mean(scores)))
    
    #svm
    #mod = SGDClassifier(loss = 'hinge',penalty = 'L2',alpha = 1e-3)
    #scores = []
    #for tr, tt in cv:
    #    mod.fit(features[tr], targets[tr])
    #    scores.append(mod.score(features[tt],targets[tt]))
    #print('\nLinear SVM\n\t mean score: {0}'.format(np.mean(scores)))
    
    
    
    #naive bayes
    mod = MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
    scores = []
    for tr, tt in cv:
        train = df.loc[tr]
        test = df.loc[tt]
        result = featurize(train, test, 1000)
        xtrain = result[1]
        xtest = result[2]
        mod.fit(xtrain, targets[tr])
        scores.append(mod.score(xtest, targets[tt]))
    print('\nNaive Bayes\n\t mean score: {0}'.format(np.mean(scores)))
    
    
    logreg = LogisticRegression()
    scores = []
    for tr, tt in cv:
        train = df.loc[tr]
        test = df.loc[tt]
        result = featurize(train, test, 1000)
        xtrain = result[1]
        xtest = result[2]
        mod.fit(xtrain, targets[tr])
        scores.append(mod.score(xtest, targets[tt]))
    print('\nLogReg\n\t mean score: {0}'.format(np.mean(scores)))
    


In [13]:
def plot_confusion_matrix(cm, title, target_names, cmap=plt.cm.coolwarm):
    plt.figure(figsize=(8,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(target_names))
    plt.xticks(tick_marks, target_names, rotation=45)
    plt.yticks(tick_marks, target_names)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [14]:
def examine_classification(model):
    result = get_labels_features(df, 1000)
    y = result[0]
    x = result[1]
    xtrain, xtest, ytrain, ytest = train_test_split(x, labels, test_size=0.33, random_state=99)
    
    model.fit(xtrain, ytrain)
    ypredicted = model.predict(xtest)
    print(accuracy_score(ytest, ypredicted))
    print()
    print(pd.crosstab(ytest, ypredicted, 
            rownames=['True'], colnames=['Predicted'], 
            margins=True))
    print()
    cm = confusion_matrix(ytest, ypredicted)
    plot_confusion_matrix(cm, "Confusion Matrix", range(1,8))

In [15]:
df = get_data()
df = add_features(df)
df.head()

Unnamed: 0,Category,Text,Tokens,Nouns,POS
0,5,why are yawns contagious? when people yawn,"[why, are, yawns, contagious, when, people, yawn]",yawn people,WRB VBP NNS JJ WRB NNS VBP
1,6,what is trans fat? how to reduce that? i heard...,"[what, is, trans, fat, how, to, reduce, that, ...",trans tras body food,WP VBZ NNS JJ WRB TO VB IN PRP VBP IN NNS VBP ...
2,1,roth ira vs 401k? what is the difference betwe...,"[roth, ira, vs, 401k, what, is, the, differenc...",roth ira v difference roth ira prefer,NN NN NNS CD WP VBZ DT NN IN NN NN CC CD WRB M...
3,1,how many planes fedex has? i heard that it is ...,"[how, many, planes, fedex, has, i, heard, that...",plane fedex airline world,WRB JJ NNS NN VBZ PRP VBP IN PRP VBZ DT JJS NN...
4,2,what is the best photo slideshow creation appl...,"[what, is, the, best, photo, slideshow, creati...",photo slideshow creation application photo sli...,WP VBZ DT JJS NN NN NN NN WP VBZ DT JJS NN NN ...


In [16]:
test_df = get_test_data()
test_df = add_features(test_df)

In [23]:
#examine_classification(SGDClassifier(loss = 'hinge',penalty = 'L2',alpha = 1e-3))
cross_validate(df['Category'], df)


Linear SVC
	 mean score: 0.5485185185185185

Naive Bayes
	 mean score: 0.42259259259259263

LogReg
	 mean score: 0.4177777777777778


In [17]:
#test_df.head()

In [24]:
# Predicting on actual test data
result = featurize(df, test_df, 1000)
labels = result[0]
x = result[1]
print(x.shape)
test = result[2]
print (test.shape)
#cross_validate(labels, x)

(2698, 23364)
(1874, 23364)


In [1]:
#result = get_test_features(test_df, count_vec, tfidf_trans)
#test = result[1]
#print(test.shape)

In [145]:
mod = LogisticRegression()
mod.fit(x, labels)
predictions = mod.predict(test)

In [146]:
import csv
with open('submission.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['ID', 'Category'])
    for i, p in enumerate(predictions):
        writer.writerow([i + 1, p])