In [16]:
import numpy as np
import pandas as pd
import string
import re
import nltk

In [20]:
data = pd.read_csv('./train.tsv',sep='\t',names=['product','label'])
data.head()

Unnamed: 0,product,label
0,Calvin Klein IN2U Eau de Toilette - 150 ml (...,calvin klein
1,For The Love of Physics (Paperback),physics
2,Nike Fission Deodorant Spray - 200 ml (For Men),nike-deodrant
3,Spoken English (With CD) 2nd Edition (Paperback),spoken english
4,The C++ Programming Language 3 Edition (Paperb...,c programming


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
X = data['product']
y = data['label']

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [64]:
stopwords = nltk.corpus.stopwords.words('english')
def remove_punctuation(text):
    '''Removes punctuation from the corpus as they do not contribute any information.'''
    text = ''.join([char for char in text if char not in string.punctuation])
    return text

def tokenize(text):
    '''A crude form of tokenization by splitting the sentences with whitespace.'''
    tokens = re.split('\W+',text)
    return tokens

def remove_stopwords(text):
    '''Removes common words in english that occur too frequently in our text and do not add any valuable info to model.'''
    text = [word.lower() for word in text]
    clean_text = [word for word in text if word not in stopwords]
    return clean_text

def preprocess(text):
    '''Combination of all the necassary text preprocessing'''
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = re.split('\W+',text)
    tokens = [word.lower() for word in tokens]
    text = " ".join([word for word in tokens if word not in stopwords])
    
    return text

In [80]:
# Dataset split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

# Tf-idf vectorizer. Converts our tokens into numeric features. The vectorizer object is initialized with an analyzer which
# is the preprocess function
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=preprocess)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators=300,max_depth=30)
random_forest_model.fit(X_train_tfidf,y_train)

# A pipeline for NLP
from sklearn.pipeline import Pipeline
rfc_pipeline = Pipeline([('tfidf',TfidfVectorizer()),('rfc',RandomForestClassifier())])
rfc_pipeline.fit(X_train,y_train)

# Evaluation
from sklearn import metrics
predictions = rfc_pipeline.predict(X_test)
print(metrics.accuracy_score(y_test,predictions))
print(metrics.classification_report(y_test,predictions))

0.7391304347826086
                            precision    recall  f1-score   support

                   axe deo       1.00      1.00      1.00         1
         best-seller books       0.00      0.00      0.00         1
             c programming       0.40      1.00      0.57         2
              calvin klein       1.00      1.00      1.00         1
                 camcorder       1.00      1.00      1.00         2
                    camera       0.67      1.00      0.80         2
                 chemistry       0.00      0.00      0.00         2
data structures algorithms       1.00      1.00      1.00         2
              dell laptops       0.00      0.00      0.00         1
               mathematics       0.00      0.00      0.00         1
             nike-deodrant       1.00      1.00      1.00         1
            sony cybershot       1.00      1.00      1.00         1
            spoken english       1.00      1.00      1.00         1
               timex watch  

  'precision', 'predicted', average, warn_for)


In [81]:
# Tf-idf vectorizer. Converts our tokens into numeric features. The vectorizer object is initialized with an analyzer which
# is the preprocess function
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer=preprocess)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_train_tfidf.shape

# Support Vector Machine Classifier.SVM params found using grid search.
from sklearn.svm import SVC
svm = SVC(C=10,gamma=0.1)
svm.fit(X_train_tfidf,y_train)

# A pipeline for NLP
from sklearn.pipeline import Pipeline
svm_pipeline = Pipeline([('tfidf',TfidfVectorizer()),('svm',LinearSVC())])
svm_pipeline.fit(X_train,y_train)

# Evaluation
predictions = svm_pipeline.predict(X_test)
print(metrics.accuracy_score(y_test,predictions))
print(metrics.classification_report(y_test,predictions))

0.782608695652174
                            precision    recall  f1-score   support

                   axe deo       1.00      1.00      1.00         1
         best-seller books       0.00      0.00      0.00         1
             c programming       0.67      1.00      0.80         2
              calvin klein       1.00      1.00      1.00         1
                 camcorder       1.00      1.00      1.00         2
                    camera       0.67      1.00      0.80         2
                 chemistry       0.00      0.00      0.00         2
data structures algorithms       1.00      1.00      1.00         2
              dell laptops       0.00      0.00      0.00         1
               mathematics       0.00      0.00      0.00         1
             nike-deodrant       1.00      1.00      1.00         1
                   physics       0.00      0.00      0.00         0
            sony cybershot       1.00      1.00      1.00         1
            spoken english   

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [53]:
# Grid Search for SVM

from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C':[0.1,1,10,100,1000],'gamma':[1,0.1,0.01,0.001,0.0001]}
svm_grid = GridSearchCV(SVC(),param_grid,verbose=3)
svm_grid.fit(X_train_tfidf,y_train)
print(svm_grid.best_params_)

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.081081 -   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.107143 -   0.0s
[CV] C=0.1, gamma=1 ..................................................
[CV] ......................... C=0.1, gamma=1, score=0.130435 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.081081 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.107143 -   0.0s
[CV] C=0.1, gamma=0.1 ................................................
[CV] ....................... C=0.1, gamma=0.1, score=0.130435 -   0.0s
[CV] C=0.1, gamma=0.01 ...............................................
[CV] ...........

[Parallel(n_jobs=1)]: Done  75 out of  75 | elapsed:    0.3s finished


In [65]:
svm_grid.best_params_

{'C': 10, 'gamma': 0.1}

In [70]:
# Better classifier
predictions = svm_pipeline.predict(X_test)
print(predictions)

['titan watch' 'titan watch' 'sony cybershot' 'axe deo' 'c programming'
 'timex watch' 'chemistry' 'nike-deodrant' 'dell laptops' 'dslr canon'
 'nike-deodrant' 'physics' 'sony cybershot' 'dslr canon' 'dell laptops'
 'data structures algorithms' 'spoken english' 'nike-deodrant' 'camera'
 'tommy watch' 'c programming' 'sony cybershot' 'physics']
