In [1]:
# Usage: python predict.py data/train.tsv data/test.tsv

import sys
import utils as ut

from bs4 import BeautifulSoup

import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [2]:
# Global Variables
bs_parser = "lxml"

TRANSLATOR_REMOVE_PUNCTUATION = str.maketrans({key: None for key in string.punctuation})
NLTK_STOPWORDS = set(stopwords.words('english'))
STEMMER = PorterStemmer()

In [3]:
def remove_html(s):
    soup = BeautifulSoup(s, bs_parser)
    return soup.get_text().encode('ascii', 'ignore').decode("utf-8") 

def remove_punctuation(string):
    '''
    Source: http://stackoverflow.com/questions/34293875/how-to-remove-punctuation-marks-from-a-string-in-python-3-x-using-translate
    '''
    # pass the translator to the string's translate method.
    return string.translate(TRANSLATOR_REMOVE_PUNCTUATION)

def lowercase_check_for_stopword_stem(token):
    
    token = token.lower()
    if token not in NLTK_STOPWORDS:
        return True, STEMMER.stem(token)
    else:
        return False, None

def remove_punctuation_lowercase_check_for_stopword_stem(string):
    
    string = remove_punctuation(string)
    
    # Not using nltk.word_tokenize(s) cause it takes long time to process.
    # Can be considered in case of complex text    
    token_list = string.split()

    op = []

    for token in token_list:
        a,b = lowercase_check_for_stopword_stem(token)
        if a:
            op.append(b)
  
    return ' '.join(op)

In [4]:
#Command line args
train_data_path = "data/train.tsv"
test_data_path = "data/test.tsv"

In [5]:
df_train = ut.pd.read_table(train_data_path, sep='\t', \
                            encoding='utf-8', \
                            header=0, na_values=" NaN")
print("\nRead data")

df_train['Product Long Description'].loc[ \
            df_train[ 'Product Long Description'].isnull()] = "No data "

df_train['Product Name'].loc[df_train['Product Name'].isnull()] \
                    = "No data"
       
df_train['Product Long Description'] = \
            df_train['Product Long Description'] + " " + \
            df_train['Product Name']

df = df_train.loc[:, ['item_id', 'Item Class ID', \
                          'Product Long Description', 'Product Name', \
                          'tag']]

df['tag'] = df['tag'].apply(ut.change_string_to_int_list)

df_train_without_na = df.dropna(axis=0, how='any')
df_train_without_na = df_train_without_na.set_index('item_id')
df_train_without_na.index.name = None



df_train_without_na_after_bs = df_train_without_na.copy()

df_train_without_na_after_bs['Product Long Description'] = \
    df_train_without_na_after_bs['Product Long Description'].apply(remove_html)

df_train_without_na_after_bs['Product Long Description'].loc[ \
    (df_train_without_na_after_bs['Product Long Description'] == "")] = \
    "No data " + \
    df_train_without_na_after_bs['Product Name'].loc[ \
        (df_train_without_na_after_bs['Product Long Description'] == "")]


    
df_train_without_na_after_bs_nlp = df_train_without_na_after_bs.copy()

df_train_without_na_after_bs_nlp['Product Long Description'] = \
    df_train_without_na_after_bs_nlp['Product Long Description'].apply( \
                    remove_punctuation_lowercase_check_for_stopword_stem)

 
file_to_save = ut.os.path.join(ut.DATA_DIR, 'cleaned_train2.tsv')
df_train_without_na_after_bs_nlp.to_csv(file_to_save, encoding='utf-8', \
                           sep="\t",index=False)



# df_train_cleaned = preprocess_write_data( \
#                         df_train_without_na, "train_cleaned.tsv")

# df_train_cleaned['tag'] = df_train_cleaned['tag'].apply( \
#                                     ut.change_string_to_int_list)


Read data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [121]:
df_test_raw = ut.pd.read_table(test_data_path, sep='\t', encoding='utf-8', \
                            header=0, na_values=" NaN")
print("\nRead data")

df_test = df_test_raw.loc[:, ['item_id', 'Item Class ID','Product Long Description', 'Product Name']]


df_test['Product Long Description'].loc[ \
            df_test[ 'Product Long Description'].isnull()] = "No data"

df_test['Product Name'].loc[df_test['Product Name'].isnull()] \
                    = "No data"

df_test['Product Long Description'] = \
            df_test['Product Long Description'] + " "+ \
            df_test['Product Name']
        
        

df_test_after_bs = df_test.copy()

df_test_after_bs['Product Long Description'] = \
    df_test_after_bs['Product Long Description'].apply(remove_html)
    
    
df_test_after_bs['Product Long Description'].loc[ \
    (df_test_after_bs['Product Long Description'] == "")] = \
    "No data " + \
    df_test_after_bs['Product Name'].loc[ \
        (df_test_after_bs['Product Long Description'] == "")]
    
    
    
    
df_test_after_bs_nlp = df_test.copy()

df_test_after_bs_nlp['Product Long Description'] = \
    df_test_after_bs_nlp['Product Long Description'].apply( \
            remove_punctuation_lowercase_check_for_stopword_stem)
    
    
file_to_save = ut.os.path.join(ut.DATA_DIR, 'cleaned_test2.tsv')
df_test_after_bs_nlp.to_csv(file_to_save, encoding='utf-8', \
                           sep="\t",index=False)
    


Read data


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [123]:
sum(df_test_raw['Product Long Description'].isnull())

6

In [122]:
df_train_clean = df_train_without_na_after_bs_nlp.copy()
df_test_clean = df_test_after_bs_nlp.copy()

print(df_train_clean.shape)
print(df_test_clean.shape)

(10578, 4)
(10593, 4)


In [91]:
def generate_features_from_data_frame(df_cleaned, npl_features=1000):
    
    vectorizer = CountVectorizer(analyzer = "word", \
                                 tokenizer = None, \
                                 preprocessor = None, \
                                 ngram_range = (1, 1), \
                                 strip_accents = 'unicode', \
                                 max_features = npl_features)

    feature_matrix = vectorizer.fit_transform( \
                            df_cleaned['Product Long Description'])
    
    return feature_matrix.toarray()
    
#     mlb = MultiLabelBinarizer()
#     dummy_item_class_id = mlb.fit_transform(df_cleaned['Item Class ID'])
        
#     return ut.np.concatenate((feature_matrix.toarray(), \
#                               dummy_item_class_id), axis=1)



In [92]:
X_train = generate_features_from_data_frame(df_train_clean)
X_train.shape

(10578, 1000)

In [93]:
X_test = generate_features_from_data_frame(df_test_clean)
X_test.shape

(10593, 1000)

In [94]:
mlb = MultiLabelBinarizer()
dummy_tags = mlb.fit_transform(df_train_clean['tag'])

In [95]:
classif = OneVsRestClassifier(SVC(kernel='linear'))
classif.fit(X_train, dummy_tags)

# print(classif.score(X, dummy_tags))

OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [96]:
yt = classif.predict(X_test)

In [97]:
t = mlb.inverse_transform(yt)

In [98]:
prediction = []

In [99]:
e_c = 0
for e in t:
    if e==():
        e_c+=1
    prediction.append(list(e))
    
e_c

2520

In [100]:
prediction

[[4537, 127175, 522484],
 [],
 [4537],
 [95987, 1085065],
 [650659],
 [4483],
 [],
 [133270, 522484, 529295],
 [95987, 522484, 1070524],
 [4537],
 [4483, 1070524],
 [529295],
 [],
 [106546, 1229817],
 [4483, 447913],
 [4537],
 [4537],
 [4457, 4483, 447913, 522484, 1071165, 1229821],
 [],
 [4536],
 [4483],
 [95987],
 [4483, 1070524],
 [],
 [4483],
 [648819, 1229817],
 [127175],
 [127175],
 [4483, 95987, 106546, 522484, 1070524],
 [4483, 1071165],
 [],
 [4537, 4538],
 [4483, 1229817],
 [4483, 4536, 581514, 1070524],
 [95987],
 [127175],
 [4457, 4483, 95987, 1071165],
 [5065],
 [95987, 522484, 1071165, 1225174],
 [4538],
 [4483, 522484],
 [1229819],
 [95987, 127175, 522484, 1070524],
 [581514],
 [106546, 447913],
 [1225174],
 [4537, 447913, 1071165, 1225174],
 [4537, 4538, 447913],
 [4536],
 [4483],
 [4457, 4483, 4537, 447913],
 [4483],
 [4537],
 [4483, 522484, 1070524],
 [4537, 4538],
 [4537, 522484],
 [4483, 127175],
 [4457],
 [4537],
 [4537, 4538, 1229821],
 [4457],
 [4537, 1229819, 12

In [105]:
df_res = ut.pd.Series(prediction, name='tag', index=df_test.item_id)

In [106]:
df_res.head()

item_id
10593    [4537, 127175, 522484]
10594                        []
10595                    [4537]
10596          [95987, 1085065]
10597                  [650659]
Name: tag, dtype: object

In [107]:
df_res.to_csv('result-svc.tsv', sep="\t", encoding='utf-8', header=True)

In [108]:
ut.get_distribution(df_res)

4457        510
4483       3116
4536        498
4537       2287
4538        955
5065        143
62056       112
95987       796
106546      604
127175      531
133270        5
447913      908
522484      892
529295      335
581514      410
645319        8
648819       37
650659      499
1070524     596
1071165     389
1084835     126
1085065     260
1180168      77
1225174     329
1229817     571
1229818       1
1229819     142
1229820     107
1229821     387
1229823     127
1229825       9
3304195     237
dtype: int64

In [109]:
from sklearn.tree import DecisionTreeClassifier

def my_Tree(X_train, Y_train):
    
    classif = OneVsRestClassifier(DecisionTreeClassifier(random_state=0))
    classif.fit(X_train, Y_train)
    
#     %time clf = MultinomialNB().fit(X_train, Y_train) 
    
    return classif

In [110]:
classif_tree_bow = my_Tree(X_train, dummy_tags)

In [117]:
yt2 = classif_tree_bow.predict(X_test)
t = mlb.inverse_transform(yt2)

In [118]:
prediction2 = []
e_c = 0
for e in t:
    if e==():
        e_c+=1
    prediction2.append(list(e))
    
e_c

1757

In [119]:
df_res2 = ut.pd.Series(prediction2, name='tag', index=df_test.item_id)

In [120]:
df_res2.to_csv('result-tree.tsv', sep="\t", encoding='utf-8', header=True)