In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the labelled single-tag question set from its (relative) CSV path.
ques_csv_path = '../data/processed/ques_with_one_tag_labelled.csv'
ques_set = pd.read_csv(ques_csv_path)

In [4]:
# Display the (rows, columns) shape of the loaded question set.
ques_set.shape

(22609, 7)

In [18]:
# Hold-out split: 80% of the question set for training, the rest for testing.

# Seeded random sample, so the same rows are selected on every run.
X_train = ques_set.sample(frac=0.8, random_state=10)

# The test partition is every row whose index label was not sampled above.
X_test = ques_set.drop(X_train.index)

# Alternative way to split:
# from sklearn.model_selection import train_test_split

def find_occurance(df, num_categories=7):
    """Print how many rows of ``df`` carry each category label 'f1', 'f2', ...

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Category' column holding labels such as 'f3'.
    num_categories : int, optional
        Highest category number to report; labels 'f1' .. 'f<num_categories>'
        are counted. Defaults to 7.

    Returns
    -------
    None
        One line per category is written to stdout; labels that do not occur
        are reported with a count of 0.
    """
    for i in range(1, num_categories + 1):
        category = 'f' + str(i)
        # Vectorised equality test; summing the boolean mask counts the matches
        # without building a filtered copy of the frame.
        row = int((df['Category'] == category).sum())
        # Single-argument print(...) behaves the same on Python 2 and Python 3.
        print("Total rows of Category 'f{i}': {row}".format(i=i, row=row))
# Report the size of each partition and its per-category row counts.
# print(...) with a single argument prints the same on Python 2 and Python 3.
print(X_train.shape)
print(X_test.shape)
find_occurance(X_train)
print("+=========+")
find_occurance(X_test)

(18087, 7)
(4522, 7)
Total rows of Category 'f1': 440
Total rows of Category 'f2': 1117
Total rows of Category 'f3': 4599
Total rows of Category 'f4': 753
Total rows of Category 'f5': 1506
Total rows of Category 'f6': 9564
Total rows of Category 'f7': 108
Total rows of Category 'f1': 111
Total rows of Category 'f2': 279
Total rows of Category 'f3': 1120
Total rows of Category 'f4': 191
Total rows of Category 'f5': 361
Total rows of Category 'f6': 2438
Total rows of Category 'f7': 22


## Creating features (Title) and labels (Category)

In [19]:
# Model inputs are the question titles; the target is the digit that follows
# the leading character of the category label (e.g. 'f3' -> 3).
train_features = np.array(X_train['Title'])  # alternative input: X_train['Body']
train_labels = np.array(X_train['Category'].apply(lambda label: int(label[1])))

from sklearn.feature_extraction.text import CountVectorizer

# Token counts with English stop words removed. The titles are cast to
# unicode strings before being handed to the vectoriser.
count_vect = CountVectorizer(stop_words="english")
train_titles_unicode = train_features.astype('U')
X_train_counts = count_vect.fit_transform(train_titles_unicode)

In [40]:
# Vocabulary size, reported from the feature-name list and from the
# vocabulary_ mapping, plus the column index assigned to the token 'html'.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out()
else:
    # get_feature_names() was removed from recent scikit-learn releases;
    # it is kept here as the fallback for older installations.
    feature_names = count_vect.get_feature_names()
print(len(feature_names))
print(count_vect.vocabulary_.get(u'html'))
print(len(count_vect.vocabulary_))

6651
2712
6651


In [41]:
# Term-frequency transformation of the count-vectorised training data.
# IDF weighting is switched off (use_idf=False), so despite the class name
# no inverse-document-frequency factor is applied.
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
tf_transformer.get_params()

{'norm': u'l2', 'smooth_idf': True, 'sublinear_tf': False, 'use_idf': False}

## Training a classifier (Naive Bayes)

In [43]:
# Fit a multinomial Naive Bayes classifier on the term-frequency features.

from sklearn.naive_bayes import MultinomialNB
nb_estimator = MultinomialNB()
clf_naive = nb_estimator.fit(X_train_tf, train_labels)

In [46]:
# Evaluate the Naive Bayes classifier on the held-out questions.
# The titles get the same unicode cast as the training titles, so both
# partitions are stringified identically before vectorisation.
test_features = np.array(X_test['Title']).astype('U')
test_labels = np.array(X_test['Category'].apply(lambda x: int(x[1])))

# Reuse the vectoriser and transformer fitted on the training data (transform
# only, no re-fitting). The transformer was built with use_idf=False, so these
# are term-frequency features even though the variable is named *_tfidf.
X_new_counts = count_vect.transform(test_features)
X_new_tfidf = tf_transformer.transform(X_new_counts)

predicted_naive = clf_naive.predict(X_new_tfidf)
# Fraction of test questions whose predicted label equals the true label.
np.mean(predicted_naive == test_labels)

0.86200796107916855

## Training a classifier (SVM)

In [47]:
from sklearn.svm import SVC
# RBF-kernel SVM; commented-out alternative: SVC(kernel="linear")
clf_svc = SVC(C=10000.0, kernel="rbf")
clf_svc.fit(X_train_tf, train_labels)
predicted_svc = clf_svc.predict(X_new_tfidf)
# Fraction of test questions whose predicted label equals the true label.
svc_matches = predicted_svc == test_labels
np.mean(svc_matches)
# Accuracies noted for the rbf kernel at different values of C:
#   C=1000        -> 0.90070765148164533
#   C=10000.0     -> 0.93078283945157014
#   C=100000.0    -> 0.92105263157894735
#   C=100000000.0 -> 0.91110128261831047
# NOTE(review): the output recorded for this cell differs from the figure noted
# for C=10000.0 -- presumably it comes from a different run; confirm.
# NOTE(review): these figures are computed against test_labels, so picking C
# from them would tune on the test split -- consider a validation split.

0.92525431225121624